You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

1751 lines
59 KiB

  1. <?php
  2. /**
  3. * (c) Kitodo. Key to digital objects e.V. <contact@kitodo.org>
  4. *
  5. * This file is part of the Kitodo and TYPO3 projects.
  6. *
  7. * @license GNU General Public License version 3 or later.
  8. * For the full copyright and license information, please read the
  9. * LICENSE.txt file that was distributed with this source code.
  10. */
  11. namespace Kitodo\Dlf\Common;
  12. use TYPO3\CMS\Core\Database\ConnectionPool;
  13. use TYPO3\CMS\Core\Utility\GeneralUtility;
  14. use TYPO3\CMS\Core\Utility\MathUtility;
  15. use Ubl\Iiif\Presentation\Common\Model\Resources\IiifResourceInterface;
  16. use Ubl\Iiif\Tools\IiifHelper;
  17. /**
  18. * Document class for the 'dlf' extension
  19. *
  20. * @author Sebastian Meyer <sebastian.meyer@slub-dresden.de>
  21. * @author Henrik Lochmann <dev@mentalmotive.com>
  22. * @package TYPO3
  23. * @subpackage dlf
  24. * @access public
  25. * @property-write int $cPid This holds the PID for the configuration
  26. * @property-read bool $hasFulltext Are there any fulltext files available?
  27. * @property-read string $location This holds the document's location
  28. * @property-read array $metadataArray This holds the documents' parsed metadata array
  29. * @property-read int $numPages This holds the total number of pages
  30. * @property-read int $parentId This holds the UID of the parent document or zero if not multi-volumed
  31. * @property-read array $physicalStructure This holds the physical structure
  32. * @property-read array $physicalStructureInfo This holds the physical structure metadata
  33. * @property-read int $pid This holds the PID of the document or zero if not in database
  34. * @property-read bool $ready Is the document instantiated successfully?
  35. * @property-read string $recordId The METS file's / IIIF manifest's record identifier
  36. * @property-read int $rootId This holds the UID of the root document or zero if not multi-volumed
  37. * @property-read array $smLinks This holds the smLinks between logical and physical structMap
  38. * @property-read array $tableOfContents This holds the logical structure
  39. * @property-read string $thumbnail This holds the document's thumbnail location
  40. * @property-read string $toplevelId This holds the toplevel structure's @ID (METS) or the manifest's @id (IIIF)
  41. * @property-read mixed $uid This holds the UID or the URL of the document
  42. * @abstract
  43. */
  44. abstract class Document
  45. {
  46. /**
  47. * This holds the PID for the configuration
  48. *
  49. * @var int
  50. * @access protected
  51. */
  52. protected $cPid = 0;
  53. /**
  54. * The extension key
  55. *
  56. * @var string
  57. * @access public
  58. */
  59. public static $extKey = 'dlf';
  60. /**
  61. * This holds the configuration for all supported metadata encodings
  62. * @see loadFormats()
  63. *
  64. * @var array
  65. * @access protected
  66. */
  67. protected $formats = [
  68. 'OAI' => [
  69. 'rootElement' => 'OAI-PMH',
  70. 'namespaceURI' => 'http://www.openarchives.org/OAI/2.0/',
  71. ],
  72. 'METS' => [
  73. 'rootElement' => 'mets',
  74. 'namespaceURI' => 'http://www.loc.gov/METS/',
  75. ],
  76. 'XLINK' => [
  77. 'rootElement' => 'xlink',
  78. 'namespaceURI' => 'http://www.w3.org/1999/xlink',
  79. ]
  80. ];
  81. /**
  82. * Are the available metadata formats loaded?
  83. * @see $formats
  84. *
  85. * @var bool
  86. * @access protected
  87. */
  88. protected $formatsLoaded = false;
  89. /**
  90. * Are there any fulltext files available? This also includes IIIF text annotations
  91. * with motivation 'painting' if Kitodo.Presentation is configured to store text
  92. * annotations as fulltext.
  93. *
  94. * @var bool
  95. * @access protected
  96. */
  97. protected $hasFulltext = false;
  98. /**
  99. * Last searched logical and physical page
  100. *
  101. * @var array
  102. * @access protected
  103. */
  104. protected $lastSearchedPhysicalPage = ['logicalPage' => null, 'physicalPage' => null];
  105. /**
  106. * This holds the document's location
  107. *
  108. * @var string
  109. * @access protected
  110. */
  111. protected $location = '';
  112. /**
  113. * This holds the logical units
  114. *
  115. * @var array
  116. * @access protected
  117. */
  118. protected $logicalUnits = [];
  119. /**
  120. * This holds the documents' parsed metadata array with their corresponding
  121. * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
  122. *
  123. * @var array
  124. * @access protected
  125. */
  126. protected $metadataArray = [];
  127. /**
  128. * Is the metadata array loaded?
  129. * @see $metadataArray
  130. *
  131. * @var bool
  132. * @access protected
  133. */
  134. protected $metadataArrayLoaded = false;
  135. /**
  136. * This holds the total number of pages
  137. *
  138. * @var int
  139. * @access protected
  140. */
  141. protected $numPages = 0;
  142. /**
  143. * This holds the UID of the parent document or zero if not multi-volumed
  144. *
  145. * @var int
  146. * @access protected
  147. */
  148. protected $parentId = 0;
  149. /**
  150. * This holds the physical structure
  151. *
  152. * @var array
  153. * @access protected
  154. */
  155. protected $physicalStructure = [];
  156. /**
  157. * This holds the physical structure metadata
  158. *
  159. * @var array
  160. * @access protected
  161. */
  162. protected $physicalStructureInfo = [];
  163. /**
  164. * Is the physical structure loaded?
  165. * @see $physicalStructure
  166. *
  167. * @var bool
  168. * @access protected
  169. */
  170. protected $physicalStructureLoaded = false;
  171. /**
  172. * This holds the PID of the document or zero if not in database
  173. *
  174. * @var int
  175. * @access protected
  176. */
  177. protected $pid = 0;
  178. /**
  179. * This holds the documents' raw text pages with their corresponding
  180. * structMap//div's ID (METS) or Range / Manifest / Sequence ID (IIIF) as array key
  181. *
  182. * @var array
  183. * @access protected
  184. */
  185. protected $rawTextArray = [];
  186. /**
  187. * Is the document instantiated successfully?
  188. *
  189. * @var bool
  190. * @access protected
  191. */
  192. protected $ready = false;
  193. /**
  194. * The METS file's / IIIF manifest's record identifier
  195. *
  196. * @var string
  197. * @access protected
  198. */
  199. protected $recordId;
  200. /**
  201. * This holds the singleton object of the document
  202. *
  203. * @var array (\Kitodo\Dlf\Common\Document)
  204. * @static
  205. * @access protected
  206. */
  207. protected static $registry = [];
  208. /**
  209. * This holds the UID of the root document or zero if not multi-volumed
  210. *
  211. * @var int
  212. * @access protected
  213. */
  214. protected $rootId = 0;
  215. /**
  216. * Is the root id loaded?
  217. * @see $rootId
  218. *
  219. * @var bool
  220. * @access protected
  221. */
  222. protected $rootIdLoaded = false;
  223. /**
  224. * This holds the smLinks between logical and physical structMap
  225. *
  226. * @var array
  227. * @access protected
  228. */
  229. protected $smLinks = ['l2p' => [], 'p2l' => []];
  230. /**
  231. * Are the smLinks loaded?
  232. * @see $smLinks
  233. *
  234. * @var bool
  235. * @access protected
  236. */
  237. protected $smLinksLoaded = false;
  238. /**
  239. * This holds the logical structure
  240. *
  241. * @var array
  242. * @access protected
  243. */
  244. protected $tableOfContents = [];
  245. /**
  246. * Is the table of contents loaded?
  247. * @see $tableOfContents
  248. *
  249. * @var bool
  250. * @access protected
  251. */
  252. protected $tableOfContentsLoaded = false;
  253. /**
  254. * This holds the document's thumbnail location
  255. *
  256. * @var string
  257. * @access protected
  258. */
  259. protected $thumbnail = '';
  260. /**
  261. * Is the document's thumbnail location loaded?
  262. * @see $thumbnail
  263. *
  264. * @var bool
  265. * @access protected
  266. */
  267. protected $thumbnailLoaded = false;
  268. /**
  269. * This holds the toplevel structure's @ID (METS) or the manifest's @id (IIIF)
  270. *
  271. * @var string
  272. * @access protected
  273. */
  274. protected $toplevelId = '';
  275. /**
  276. * This holds the UID or the URL of the document
  277. *
  278. * @var mixed
  279. * @access protected
  280. */
  281. protected $uid = 0;
  282. /**
  283. * This holds the whole XML file as \SimpleXMLElement object
  284. *
  285. * @var \SimpleXMLElement
  286. * @access protected
  287. */
  288. protected $xml;
  /**
   * Empties the static instance registry to prevent memory exhaustion
   *
   * @access public
   *
   * @static
   *
   * @return void
   */
  public static function clearRegistry()
  {
      // Forget every instance cached so far.
      self::$registry = [];
  }
  /**
   * Makes sure the record identifier, if there is one, is retrieved from the document
   *
   * @access protected
   *
   * @abstract
   *
   * @param int $pid: ID of the configuration page holding the recordId configuration
   */
  abstract protected function establishRecordId($pid);
  /**
   * Returns the source document object which this Document instance represents
   *
   * @access protected
   *
   * @abstract
   *
   * @return \SimpleXMLElement|IiifResourceInterface A PHP object representation of
   * the current document: SimpleXMLElement for METS, IiifResourceInterface for IIIF
   */
  abstract protected function getDocument();
  /**
   * Returns the location of a downloadable file for a physical page or track
   *
   * @access public
   *
   * @abstract
   *
   * @param string $id: The @ID attribute of the file node (METS) or the @id property of the IIIF resource
   *
   * @return string The file's location as URL
   */
  abstract public function getDownloadLocation($id);
  /**
   * Returns the location of a file representing a physical page or track
   *
   * @access public
   *
   * @abstract
   *
   * @param string $id: The @ID attribute of the file node (METS) or the @id property of the IIIF resource
   *
   * @return string The file's location as URL
   */
  abstract public function getFileLocation($id);
  /**
   * Returns the MIME type of a file representing a physical page or track
   *
   * @access public
   *
   * @abstract
   *
   * @param string $id: The @ID attribute of the file node
   *
   * @return string The file's MIME type
   */
  abstract public function getFileMimeType($id);
  /**
   * This is a singleton class, thus an instance must be created by this method
   *
   * Unless a reload is forced, the static registry is consulted first and the
   * user's session second. If neither yields a matching instance, the document
   * format is determined (from the database for integer UIDs, from the fetched
   * content for URLs) and a new MetsDocument or IiifManifest is created.
   *
   * @access public
   *
   * @static
   *
   * @param mixed $uid: The unique identifier of the document to parse, the URL of XML file or the IRI of the IIIF resource
   * @param int $pid: If > 0, then only document with this PID gets loaded
   * @param bool $forceReload: Force reloading the document instead of returning the cached instance
   *
   * @return \Kitodo\Dlf\Common\Document|null Instance of this class, either MetsDocument or IiifManifest,
   * or null if the document format could not be determined
   */
  public static function &getInstance($uid, $pid = 0, $forceReload = false)
  {
      // Sanitize input.
      $pid = max(intval($pid), 0);
      if (!$forceReload) {
          $regObj = Helper::digest($uid);
          // isset() avoids a notice when nothing is registered under this key yet.
          if (
              isset(self::$registry[$regObj])
              && self::$registry[$regObj] instanceof self
          ) {
              // Check if instance has given PID.
              if (
                  !$pid
                  || !self::$registry[$regObj]->pid
                  || $pid == self::$registry[$regObj]->pid
              ) {
                  // Return singleton instance if available.
                  return self::$registry[$regObj];
              }
          } else {
              // Check the user's session...
              // NOTE(review): the session is read with get_called_class() here but written
              // with get_class($instance) below; these differ when this method is called
              // on the abstract base class - confirm that this is intended.
              $sessionData = Helper::loadFromSession(get_called_class());
              // isset() also covers the case that no session data is available at all.
              if (
                  isset($sessionData[$regObj])
                  && $sessionData[$regObj] instanceof self
              ) {
                  // Check if instance has given PID.
                  if (
                      !$pid
                      || !$sessionData[$regObj]->pid
                      || $pid == $sessionData[$regObj]->pid
                  ) {
                      // ...and restore registry.
                      self::$registry[$regObj] = $sessionData[$regObj];
                      return self::$registry[$regObj];
                  }
              }
          }
      }
      // Create new instance depending on format (METS or IIIF) ...
      $instance = null;
      $documentFormat = null;
      $xml = null;
      $iiif = null;
      // Try to get document format from database
      if (MathUtility::canBeInterpretedAsInteger($uid)) {
          $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
              ->getQueryBuilderForTable('tx_dlf_documents');
          $queryBuilder
              ->select(
                  'tx_dlf_documents.location AS location',
                  'tx_dlf_documents.document_format AS document_format'
              )
              ->from('tx_dlf_documents');
          // Get UID of document with given record identifier.
          if ($pid) {
              $queryBuilder
                  ->where(
                      $queryBuilder->expr()->eq('tx_dlf_documents.uid', intval($uid)),
                      $queryBuilder->expr()->eq('tx_dlf_documents.pid', intval($pid)),
                      Helper::whereExpression('tx_dlf_documents')
                  );
          } else {
              $queryBuilder
                  ->where(
                      $queryBuilder->expr()->eq('tx_dlf_documents.uid', intval($uid)),
                      Helper::whereExpression('tx_dlf_documents')
                  );
          }
          $result = $queryBuilder
              ->setMaxResults(1)
              ->execute();
          if ($resArray = $result->fetch()) {
              $documentFormat = $resArray['document_format'];
          }
      } else {
          // Get document format from content of remote document
          // Cast to string for safety reasons.
          $location = (string) $uid;
          // Try to load a file from the url
          if (GeneralUtility::isValidUrl($location)) {
              // Load extension configuration
              $extConf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['dlf']);
              // Set user-agent to identify self when fetching XML data.
              if (!empty($extConf['useragent'])) {
                  @ini_set('user_agent', $extConf['useragent']);
              }
              $content = GeneralUtility::getUrl($location);
              if ($content !== false) {
                  // TODO use single place to load xml
                  // Turn off libxml's error logging.
                  $libxmlErrors = libxml_use_internal_errors(true);
                  // Disables the functionality to allow external entities to be loaded when parsing the XML, must be kept
                  $previousValueOfEntityLoader = libxml_disable_entity_loader(true);
                  // Try to load XML from file.
                  $xml = simplexml_load_string($content);
                  // reset entity loader setting
                  libxml_disable_entity_loader($previousValueOfEntityLoader);
                  // Reset libxml's error logging.
                  libxml_use_internal_errors($libxmlErrors);
                  if ($xml !== false) {
                      /* @var $xml \SimpleXMLElement */
                      $xml->registerXPathNamespace('mets', 'http://www.loc.gov/METS/');
                      $xpathResult = $xml->xpath('//mets:mets');
                      $documentFormat = !empty($xpathResult) ? 'METS' : null;
                  } else {
                      // Try to load file as IIIF resource instead.
                      $contentAsJsonArray = json_decode($content, true);
                      if ($contentAsJsonArray !== null) {
                          // Load plugin configuration.
                          $conf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf'][self::$extKey]);
                          IiifHelper::setUrlReader(IiifUrlReader::getInstance());
                          IiifHelper::setMaxThumbnailHeight($conf['iiifThumbnailHeight']);
                          IiifHelper::setMaxThumbnailWidth($conf['iiifThumbnailWidth']);
                          $iiif = IiifHelper::loadIiifResource($contentAsJsonArray);
                          if ($iiif instanceof IiifResourceInterface) {
                              $documentFormat = 'IIIF';
                          }
                      }
                  }
              }
          }
      }
      // $pid has already been sanitized at the top of this method.
      if ($documentFormat == 'METS') {
          $instance = new MetsDocument($uid, $pid, $xml);
      } elseif ($documentFormat == 'IIIF') {
          $instance = new IiifManifest($uid, $pid, $iiif);
      }
      // Save instance to registry.
      if (
          $instance instanceof self
          && $instance->ready
      ) {
          self::$registry[Helper::digest($instance->uid)] = $instance;
          // Register the instance under its location as well if that differs from the UID.
          if ($instance->uid != $instance->location) {
              self::$registry[Helper::digest($instance->location)] = $instance;
          }
          // Load extension configuration
          $extConf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['dlf']);
          // Save registry to session if caching is enabled.
          if (!empty($extConf['caching'])) {
              Helper::saveToSession(self::$registry, get_class($instance));
          }
      }
      // Return new instance (null if no supported format was detected).
      return $instance;
  }
  /**
   * Returns details about a logical structure element
   *
   * @access public
   *
   * @abstract
   *
   * @param string $id: The @ID attribute of the logical structure node (METS) or
   * the @id property of the Manifest / Range (IIIF)
   * @param bool $recursive: Whether to include the child elements / resources
   *
   * @return array Array of the element's id, label, type and physical page indexes/mptr link
   */
  abstract public function getLogicalStructure($id, $recursive = false);
  /**
   * Extracts all the metadata for a logical structure node
   *
   * @access public
   *
   * @abstract
   *
   * @param string $id: The @ID attribute of the logical structure node (METS) or the @id property
   * of the Manifest / Range (IIIF)
   * @param int $cPid: The PID for the metadata definitions
   * (defaults to $this->cPid or $this->pid)
   *
   * @return array The logical structure node's / the IIIF resource's parsed metadata array
   */
  abstract public function getMetadata($id, $cPid = 0);
  /**
   * This returns the first corresponding physical page number of a given logical page label
   *
   * The result of the last successful search is cached, so asking for the same
   * label again does not walk the physical structure a second time. The returned
   * number is the zero-based position of the first matching entry within
   * $this->physicalStructureInfo, in iteration order.
   *
   * NOTE(review): this reads $this->physicalStructureInfo directly and does not
   * trigger loading of the physical structure - confirm callers load it beforehand.
   *
   * @access public
   *
   * @param string $logicalPage: The label (or a part of the label) of the logical page
   *
   * @return int The physical page number, or 1 if no entry matches
   */
  public function getPhysicalPage($logicalPage)
  {
      // Normalize to string: strpos() treats non-string and empty needles
      // differently depending on the PHP version.
      $logicalPage = (string) $logicalPage;
      if ($logicalPage === '') {
          // Nothing to search for; use the same fallback as for "no match".
          return 1;
      }
      // Serve repeated lookups from the cache (also works for the label "0").
      if ($this->lastSearchedPhysicalPage['logicalPage'] === $logicalPage) {
          return $this->lastSearchedPhysicalPage['physicalPage'];
      }
      $physicalPage = 0;
      foreach ($this->physicalStructureInfo as $page) {
          // Entries without an order label can never match.
          if (
              isset($page['orderlabel'])
              && strpos((string) $page['orderlabel'], $logicalPage) !== false
          ) {
              $this->lastSearchedPhysicalPage['logicalPage'] = $logicalPage;
              $this->lastSearchedPhysicalPage['physicalPage'] = $physicalPage;
              return $physicalPage;
          }
          $physicalPage++;
      }
      return 1;
  }
  /**
   * Extracts the raw text for a physical structure node / IIIF Manifest / Canvas.
   * The text might be given as ALTO for METS or as annotations or ALTO for IIIF
   * resources. If IIIF plain text annotations with the motivation "painting" should
   * be treated as full text representations, the extension has to be configured
   * accordingly.
   *
   * @access public
   *
   * @abstract
   *
   * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
   * of the Manifest / Range (IIIF)
   *
   * @return string The physical structure node's / IIIF resource's raw text
   */
  abstract public function getRawText($id);
  /**
   * This extracts the raw text for a physical structure node / IIIF Manifest / Canvas from an
   * XML fulltext representation (currently only ALTO). For IIIF manifests, ALTO documents have
   * to be given in the Canvas' / Manifest's "seeAlso" property.
   *
   * The configured fulltext file groups are tried in order; the first one for
   * which the structure node has a file is fetched and parsed. The root element
   * name of that file (upper-cased) selects the format handler class from
   * $this->formats. A successfully extracted text is also stored in
   * $this->rawTextArray under the given ID.
   *
   * @access protected
   *
   * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
   * of the Manifest / Range (IIIF)
   *
   * @return string The physical structure node's / IIIF resource's raw text from XML,
   * or an empty string if it could not be determined
   */
  protected function getRawTextFromXml($id)
  {
      $rawText = '';
      // Initialize so the checks below are well-defined even if no fulltext file is found.
      $rawTextXml = null;
      $textFormat = '';
      // Load available text formats, ...
      $this->loadFormats();
      // ... physical structure ...
      $this->_getPhysicalStructure();
      // ... and extension configuration.
      $extConf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf'][self::$extKey]);
      $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']);
      if (empty($this->physicalStructureInfo[$id])) {
          Helper::devLog('Invalid structure node @ID "' . $id . '"', DEVLOG_SEVERITY_WARNING);
          return $rawText;
      }
      // foreach instead of while/array_shift: an empty list entry must not abort the search.
      foreach ($fileGrpsFulltext as $fileGrpFulltext) {
          if (empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
              continue;
          }
          // Get fulltext file.
          $file = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext]));
          if ($file === false) {
              Helper::devLog('Couln\'t load fulltext file for structure node @ID "' . $id . '"', DEVLOG_SEVERITY_WARNING);
              return $rawText;
          }
          // Turn off libxml's error logging.
          $libxmlErrors = libxml_use_internal_errors(true);
          // Disables the functionality to allow external entities to be loaded when parsing the XML, must be kept.
          $previousValueOfEntityLoader = libxml_disable_entity_loader(true);
          // Load XML from file.
          $rawTextXml = simplexml_load_string($file);
          // Reset entity loader setting.
          libxml_disable_entity_loader($previousValueOfEntityLoader);
          // Reset libxml's error logging.
          libxml_use_internal_errors($libxmlErrors);
          // simplexml_load_string() returns false for malformed XML; calling
          // getName() on that would be a fatal error.
          if ($rawTextXml === false) {
              Helper::devLog('Couldn\'t parse fulltext file for structure node @ID "' . $id . '"', DEVLOG_SEVERITY_WARNING);
              return $rawText;
          }
          // Get the root element's name as text format.
          $textFormat = strtoupper($rawTextXml->getName());
          // Only the first available fulltext file group is used.
          break;
      }
      // Is this text format supported?
      if (
          !empty($rawTextXml)
          && !empty($this->formats[$textFormat])
      ) {
          if (!empty($this->formats[$textFormat]['class'])) {
              $class = $this->formats[$textFormat]['class'];
              // Get the raw text from class.
              if (
                  class_exists($class)
                  && ($obj = GeneralUtility::makeInstance($class)) instanceof FulltextInterface
              ) {
                  $rawText = $obj->getRawText($rawTextXml);
                  $this->rawTextArray[$id] = $rawText;
              } else {
                  Helper::devLog('Invalid class/method "' . $class . '->getRawText()" for text format "' . $textFormat . '"', DEVLOG_SEVERITY_WARNING);
              }
          }
      } else {
          Helper::devLog('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"', DEVLOG_SEVERITY_WARNING);
      }
      return $rawText;
  }
  /**
   * Determines a title for the given document
   *
   * The title is read from the document's database record. If it is empty and
   * $recursive is set, the record referenced by "partof" is consulted instead,
   * and so on upwards.
   *
   * @access public
   *
   * @static
   *
   * @param int $uid: The UID of the document
   * @param bool $recursive: Search superior documents for a title, too?
   *
   * @return string The title of the document itself or a parent document
   */
  public static function getTitle($uid, $recursive = false)
  {
      // Sanitize input.
      $uid = max(intval($uid), 0);
      if (!$uid) {
          Helper::devLog('Invalid UID ' . $uid . ' for document', DEVLOG_SEVERITY_ERROR);
          return '';
      }
      $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
          ->getQueryBuilderForTable('tx_dlf_documents');
      $queryBuilder
          ->select(
              'tx_dlf_documents.title',
              'tx_dlf_documents.partof'
          )
          ->from('tx_dlf_documents')
          ->where(
              $queryBuilder->expr()->eq('tx_dlf_documents.uid', $uid),
              Helper::whereExpression('tx_dlf_documents')
          )
          ->setMaxResults(1);
      $record = $queryBuilder->execute()->fetch();
      if (!$record) {
          Helper::devLog('No document with UID ' . $uid . ' found or document not accessible', DEVLOG_SEVERITY_WARNING);
          return '';
      }
      // Get title information.
      $title = $record['title'];
      $partof = $record['partof'];
      // Fall back to the superior document only if asked to, if there is no own
      // title, and if the parent reference is set and does not point to this record.
      $searchParent = $recursive
          && empty($title)
          && intval($partof)
          && $partof != $uid;
      if ($searchParent) {
          $title = self::getTitle($partof, true);
      }
      return $title;
  }
  /**
   * Extracts all the metadata for the toplevel logical structure node / resource
   *
   * For METS documents the metadata is additionally enriched via
   * addMetadataFromMets(). If the metadata has a "record_id" entry that does
   * not yet contain this document's record identifier, the identifier is
   * prepended to it.
   *
   * @access public
   *
   * @param int $cPid: The PID for the metadata definitions
   *
   * @return array The logical structure node's / resource's parsed metadata array
   */
  public function getTitledata($cPid = 0)
  {
      $titledata = $this->getMetadata($this->_getToplevelId(), $cPid);
      // Add information from METS structural map to titledata array.
      if ($this instanceof MetsDocument) {
          $this->addMetadataFromMets($titledata, $this->_getToplevelId());
      }
      // Set record identifier for METS file / IIIF manifest if not present.
      $recordIdMissing = is_array($titledata)
          && array_key_exists('record_id', $titledata)
          && !empty($this->recordId)
          && !in_array($this->recordId, $titledata['record_id']);
      if ($recordIdMissing) {
          array_unshift($titledata['record_id'], $this->recordId);
      }
      return $titledata;
  }
  /**
   * Traverses a logical (sub-) structure tree depth-first to find the structure
   * with the requested logical id and returns its depth.
   *
   * @access protected
   *
   * @param array $structure: logical structure array
   * @param int $depth: current tree depth
   * @param string $logId: ID of the logical structure whose depth is requested
   *
   * @return int|bool false if structure with $logId is not a child of this substructure,
   * or the actual depth.
   */
  protected function getTreeDepth($structure, $depth, $logId)
  {
      foreach ($structure as $node) {
          // Found on the current level.
          if ($node['id'] == $logId) {
              return $depth;
          }
          // Leaf nodes have nothing more to search.
          if (!array_key_exists('children', $node)) {
              continue;
          }
          // Descend one level; the first hit wins.
          $depthInSubtree = $this->getTreeDepth($node['children'], $depth + 1, $logId);
          if ($depthInSubtree !== false) {
              return $depthInSubtree;
          }
      }
      return false;
  }
  /**
   * Gets the tree depth of a logical structure element within the table of contents
   *
   * Elements on the top level of the table of contents have depth 1.
   *
   * @access public
   *
   * @param string $logId: The id of the logical structure element whose depth is requested
   *
   * @return int|bool tree depth as integer or false if no element with $logId exists within the TOC.
   */
  public function getStructureDepth($logId)
  {
      $tableOfContents = $this->_getTableOfContents();
      // Start counting at depth 1 for the top level.
      return $this->getTreeDepth($tableOfContents, 1, $logId);
  }
  /**
   * Sets some basic class properties
   *
   * @access protected
   *
   * @abstract
   *
   * @return void
   */
  abstract protected function init();
  /**
   * Reuses any document object that might have been already loaded to determine whether the document is METS or IIIF
   *
   * @access protected
   *
   * @abstract
   *
   * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument: any instance that has already been loaded
   *
   * @return bool true if $preloadedDocument can actually be reused, false if it has to be loaded again
   */
  abstract protected function setPreloadedDocument($preloadedDocument);
  /**
   * METS/IIIF specific part of loading a location
   *
   * @access protected
   *
   * @abstract
   *
   * @param string $location: The URL of the file to load
   *
   * @return bool true on success or false on failure
   */
  abstract protected function loadLocation($location);
  /**
   * Loads an XML file / IIIF resource from a URL
   *
   * The location is validated and the configured user agent (if any) is set;
   * the actual loading is delegated to the format specific loadLocation().
   *
   * @access protected
   *
   * @param string $location: The URL of the file to load
   *
   * @return bool true on success or false on failure
   */
  protected function load($location)
  {
      // Refuse anything that is not a valid URL.
      if (!GeneralUtility::isValidUrl($location)) {
          Helper::devLog('Invalid file location "' . $location . '" for document loading', DEVLOG_SEVERITY_ERROR);
          return false;
      }
      // Load extension configuration
      $extConf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['dlf']);
      // Set user-agent to identify self when fetching XML / JSON-LD data.
      if (!empty($extConf['useragent'])) {
          @ini_set('user_agent', $extConf['useragent']);
      }
      // The actual loading of the XML / JSON-LD file is format specific.
      return $this->loadLocation($location);
  }
  /**
   * Analyzes the document to find out whether it contains any fulltext that needs to be indexed.
   *
   * @access protected
   *
   * @abstract
   */
  abstract protected function ensureHasFulltextIsSet();
  /**
   * Registers all available data formats
   *
   * Reads the format records with PID 0 from table "tx_dlf_formats" and merges
   * them into $this->formats, keyed by their type. This is done only once per
   * instance.
   *
   * @access protected
   *
   * @return void
   */
  protected function loadFormats()
  {
      // Nothing to do if the formats were loaded before.
      if ($this->formatsLoaded) {
          return;
      }
      $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
          ->getQueryBuilderForTable('tx_dlf_formats');
      // Get available data formats from database.
      $queryBuilder
          ->select(
              'tx_dlf_formats.type AS type',
              'tx_dlf_formats.root AS root',
              'tx_dlf_formats.namespace AS namespace',
              'tx_dlf_formats.class AS class'
          )
          ->from('tx_dlf_formats')
          ->where(
              $queryBuilder->expr()->eq('tx_dlf_formats.pid', 0)
          );
      $statement = $queryBuilder->execute();
      while ($format = $statement->fetch()) {
          // Update format registry.
          $this->formats[$format['type']] = [
              'rootElement' => $format['root'],
              'namespaceURI' => $format['namespace'],
              'class' => $format['class']
          ];
      }
      $this->formatsLoaded = true;
  }
  /**
   * Registers all available namespaces for a \SimpleXMLElement or \DOMXPath object
   *
   * Every known format is registered with its lower-cased type as prefix and
   * its namespace URI. Objects of any other type are rejected with a log entry.
   *
   * @access public
   *
   * @param \SimpleXMLElement|\DOMXPath &$obj: \SimpleXMLElement or \DOMXPath object
   *
   * @return void
   */
  public function registerNamespaces(&$obj)
  {
      // TODO Check usage. XML specific method does not seem to be used anywhere outside this class within the project, but it is public and may be used by extensions.
      $this->loadFormats();
      // Do we have a \SimpleXMLElement or \DOMXPath object?
      $isSimpleXml = $obj instanceof \SimpleXMLElement;
      if (!$isSimpleXml && !($obj instanceof \DOMXPath)) {
          Helper::devLog('Given object is neither a SimpleXMLElement nor a DOMXPath instance', DEVLOG_SEVERITY_ERROR);
          return;
      }
      // Both classes offer the same functionality under different method names.
      $method = $isSimpleXml ? 'registerXPathNamespace' : 'registerNamespace';
      // Register metadata format's namespaces.
      foreach ($this->formats as $enc => $conf) {
          $prefix = strtolower($enc);
          $obj->$method($prefix, $conf['namespaceURI']);
      }
  }
  /**
   * This saves the document to the database and index
   *
   * The method resolves the UIDs of the document's structure type, collections
   * and owning library (creating missing collections and the library on the
   * fly), determines the parent document, writes the "tx_dlf_documents" record
   * and finally hands the document to the indexer if a Solr core is given.
   *
   * @access public
   *
   * @param int $pid: The PID of the saved record
   * @param int $core: The UID of the Solr core for indexing
   *
   * @return bool true on success or false on failure
   */
  public function save($pid = 0, $core = 0)
  {
      if (\TYPO3_MODE !== 'BE') {
          Helper::devLog('Saving a document is only allowed in the backend', DEVLOG_SEVERITY_ERROR);
          return false;
      }
      // Make sure $pid is a non-negative integer.
      $pid = max(intval($pid), 0);
      // Make sure $core is a non-negative integer.
      $core = max(intval($core), 0);
      // If $pid is not given, try to get it elsewhere.
      if (
          !$pid
          && $this->pid
      ) {
          // Retain current PID.
          $pid = $this->pid;
      } elseif (!$pid) {
          Helper::devLog('Invalid PID ' . $pid . ' for document saving', DEVLOG_SEVERITY_ERROR);
          return false;
      }
      // Set PID for metadata definitions.
      $this->cPid = $pid;
      // Set UID placeholder if not updating existing record.
      if ($pid != $this->pid) {
          $this->uid = uniqid('NEW');
      }
      // Get metadata array.
      $metadata = $this->getTitledata($pid);
      // Check for record identifier.
      if (empty($metadata['record_id'][0])) {
          Helper::devLog('No record identifier found to avoid duplication', DEVLOG_SEVERITY_ERROR);
          return false;
      }
      // Load plugin configuration.
      $conf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf'][self::$extKey]);
      $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
          ->getQueryBuilderForTable('tx_dlf_structures');
      // Get UID for structure type.
      $structureType = $metadata['type'][0] ?? '';
      $result = $queryBuilder
          ->select('tx_dlf_structures.uid AS uid')
          ->from('tx_dlf_structures')
          ->where(
              $queryBuilder->expr()->eq('tx_dlf_structures.pid', intval($pid)),
              $queryBuilder->expr()->eq('tx_dlf_structures.index_name', $queryBuilder->expr()->literal($structureType)),
              Helper::whereExpression('tx_dlf_structures')
          )
          ->setMaxResults(1)
          ->execute();
      if ($resArray = $result->fetch()) {
          $structure = $resArray['uid'];
      } else {
          // Log the plain value; the SQL-quoted literal would show up as "'type'".
          Helper::devLog('Could not identify document/structure type "' . $structureType . '"', DEVLOG_SEVERITY_ERROR);
          return false;
      }
      $metadata['type'][0] = $structure;
      // Remove appended "valueURI" from authors' names for storing in database.
      // A missing entry is treated as an empty list instead of raising a warning.
      $authors = isset($metadata['author']) ? (array) $metadata['author'] : [];
      foreach ($authors as $i => $author) {
          $splitName = explode(chr(31), $author);
          $authors[$i] = $splitName[0];
      }
      $metadata['author'] = $authors;
      $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
          ->getQueryBuilderForTable('tx_dlf_collections');
      // Get UIDs for collections.
      $result = $queryBuilder
          ->select(
              'tx_dlf_collections.index_name AS index_name',
              'tx_dlf_collections.uid AS uid'
          )
          ->from('tx_dlf_collections')
          ->where(
              $queryBuilder->expr()->eq('tx_dlf_collections.pid', intval($pid)),
              $queryBuilder->expr()->in('tx_dlf_collections.sys_language_uid', [-1, 0]),
              Helper::whereExpression('tx_dlf_collections')
          )
          ->execute();
      $collUid = [];
      while ($resArray = $result->fetch()) {
          $collUid[$resArray['index_name']] = $resArray['uid'];
      }
      $collections = [];
      $collectionNames = isset($metadata['collection']) ? (array) $metadata['collection'] : [];
      foreach ($collectionNames as $collection) {
          if (!empty($collUid[$collection])) {
              // Add existing collection's UID.
              $collections[] = $collUid[$collection];
          } else {
              // Insert new collection. The data array is rebuilt for every
              // collection, so no earlier record can be inserted twice.
              $collNewUid = uniqid('NEW');
              $collData = [
                  'tx_dlf_collections' => [
                      $collNewUid => [
                          'pid' => $pid,
                          'label' => $collection,
                          'index_name' => $collection,
                          'oai_name' => (!empty($conf['publishNewCollections']) ? Helper::getCleanString($collection) : ''),
                          'description' => '',
                          'documents' => 0,
                          'owner' => 0,
                          'status' => 0,
                      ],
                  ],
              ];
              $substUid = Helper::processDBasAdmin($collData);
              // Add new collection's UID.
              $collections[] = $substUid[$collNewUid];
              if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
                  Helper::addMessage(
                      htmlspecialchars(sprintf(Helper::getMessage('flash.newCollection'), $collection, $substUid[$collNewUid])),
                      Helper::getMessage('flash.attention', true),
                      \TYPO3\CMS\Core\Messaging\FlashMessage::INFO,
                      true
                  );
              }
          }
      }
      $metadata['collection'] = $collections;
      $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
          ->getQueryBuilderForTable('tx_dlf_libraries');
      // Get UID for owner.
      $owner = !empty($metadata['owner'][0]) ? $metadata['owner'][0] : 'default';
      $result = $queryBuilder
          ->select('tx_dlf_libraries.uid AS uid')
          ->from('tx_dlf_libraries')
          ->where(
              $queryBuilder->expr()->eq('tx_dlf_libraries.pid', intval($pid)),
              $queryBuilder->expr()->eq('tx_dlf_libraries.index_name', $queryBuilder->expr()->literal($owner)),
              Helper::whereExpression('tx_dlf_libraries')
          )
          ->setMaxResults(1)
          ->execute();
      if ($resArray = $result->fetch()) {
          $ownerUid = $resArray['uid'];
      } else {
          // Insert new library.
          $libNewUid = uniqid('NEW');
          $libData = [
              'tx_dlf_libraries' => [
                  $libNewUid => [
                      'pid' => $pid,
                      'label' => $owner,
                      'index_name' => $owner,
                      'website' => '',
                      'contact' => '',
                      'image' => '',
                      'oai_label' => '',
                      'oai_base' => '',
                      'opac_label' => '',
                      'opac_base' => '',
                      'union_label' => '',
                      'union_base' => '',
                  ],
              ],
          ];
          $substUid = Helper::processDBasAdmin($libData);
          // Add new library's UID.
          $ownerUid = $substUid[$libNewUid];
          if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
              Helper::addMessage(
                  htmlspecialchars(sprintf(Helper::getMessage('flash.newLibrary'), $owner, $ownerUid)),
                  Helper::getMessage('flash.attention', true),
                  \TYPO3\CMS\Core\Messaging\FlashMessage::INFO,
                  true
              );
          }
      }
      $metadata['owner'][0] = $ownerUid;
      // Get UID of parent document.
      $partof = $this->getParentDocumentUidForSaving($pid, $core);
      // Use the date of publication or title as alternative sorting metric for parts of multi-part works.
      if (!empty($partof)) {
          if (
              empty($metadata['volume'][0])
              && !empty($metadata['year'][0])
          ) {
              $metadata['volume'] = $metadata['year'];
          }
          if (empty($metadata['volume_sorting'][0])) {
              // If METS @ORDER is given it is preferred over year_sorting and year.
              if (!empty($metadata['mets_order'][0])) {
                  $metadata['volume_sorting'][0] = $metadata['mets_order'][0];
              } elseif (!empty($metadata['year_sorting'][0])) {
                  $metadata['volume_sorting'][0] = $metadata['year_sorting'][0];
              } elseif (!empty($metadata['year'][0])) {
                  $metadata['volume_sorting'][0] = $metadata['year'][0];
              }
          }
          // If volume_sorting is still empty, try to use title_sorting or METS @ORDERLABEL finally (workaround for newspapers)
          if (empty($metadata['volume_sorting'][0])) {
              if (!empty($metadata['title_sorting'][0])) {
                  $metadata['volume_sorting'][0] = $metadata['title_sorting'][0];
              } elseif (!empty($metadata['mets_orderlabel'][0])) {
                  $metadata['volume_sorting'][0] = $metadata['mets_orderlabel'][0];
              }
          }
      }
      $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
          ->getQueryBuilderForTable('tx_dlf_metadata');
      // Get metadata for lists and sorting.
      $result = $queryBuilder
          ->select(
              'tx_dlf_metadata.index_name AS index_name',
              'tx_dlf_metadata.is_listed AS is_listed',
              'tx_dlf_metadata.is_sortable AS is_sortable'
          )
          ->from('tx_dlf_metadata')
          ->where(
              $queryBuilder->expr()->orX(
                  $queryBuilder->expr()->eq('tx_dlf_metadata.is_listed', 1),
                  $queryBuilder->expr()->eq('tx_dlf_metadata.is_sortable', 1)
              ),
              $queryBuilder->expr()->eq('tx_dlf_metadata.pid', intval($pid)),
              Helper::whereExpression('tx_dlf_metadata')
          )
          ->execute();
      $listed = [];
      $sortable = [];
      while ($resArray = $result->fetch()) {
          if (!empty($metadata[$resArray['index_name']])) {
              if ($resArray['is_listed']) {
                  $listed[$resArray['index_name']] = $metadata[$resArray['index_name']];
              }
              if ($resArray['is_sortable']) {
                  $sortable[$resArray['index_name']] = $metadata[$resArray['index_name']][0];
              }
          }
      }
      // Read the first value of a metadata field; a missing field yields null
      // (the value the plain array access produced) without raising a notice.
      $firstValue = static function ($field) use ($metadata) {
          return $metadata[$field][0] ?? null;
      };
      // Fill data array.
      $data = [];
      $data['tx_dlf_documents'][$this->uid] = [
          'pid' => $pid,
          $GLOBALS['TCA']['tx_dlf_documents']['ctrl']['enablecolumns']['starttime'] => 0,
          $GLOBALS['TCA']['tx_dlf_documents']['ctrl']['enablecolumns']['endtime'] => 0,
          'prod_id' => $firstValue('prod_id'),
          'location' => $this->location,
          'record_id' => $firstValue('record_id'),
          'opac_id' => $firstValue('opac_id'),
          'union_id' => $firstValue('union_id'),
          'urn' => $firstValue('urn'),
          'purl' => $firstValue('purl'),
          'title' => $firstValue('title'),
          'title_sorting' => $firstValue('title_sorting'),
          'author' => implode('; ', $metadata['author']),
          'year' => implode('; ', isset($metadata['year']) ? (array) $metadata['year'] : []),
          'place' => implode('; ', isset($metadata['place']) ? (array) $metadata['place'] : []),
          'thumbnail' => $this->_getThumbnail(true),
          'metadata' => serialize($listed),
          'metadata_sorting' => serialize($sortable),
          'structure' => $firstValue('type'),
          'partof' => $partof,
          'volume' => $firstValue('volume'),
          'volume_sorting' => $firstValue('volume_sorting'),
          'license' => $firstValue('license'),
          'terms' => $firstValue('terms'),
          'restrictions' => $firstValue('restrictions'),
          'out_of_print' => $firstValue('out_of_print'),
          'rights_info' => $firstValue('rights_info'),
          'collections' => $metadata['collection'],
          'mets_label' => $firstValue('mets_label'),
          'mets_orderlabel' => $firstValue('mets_orderlabel'),
          'mets_order' => $firstValue('mets_order'),
          'owner' => $firstValue('owner'),
          'solrcore' => $core,
          'status' => 0,
          'document_format' => $firstValue('document_format'),
      ];
      // Unhide hidden documents.
      if (!empty($conf['unhideOnIndex'])) {
          $data['tx_dlf_documents'][$this->uid][$GLOBALS['TCA']['tx_dlf_documents']['ctrl']['enablecolumns']['disabled']] = 0;
      }
      // Process data.
      $newIds = Helper::processDBasAdmin($data);
      // Replace placeholder with actual UID.
      if (strpos($this->uid, 'NEW') === 0) {
          $this->uid = $newIds[$this->uid];
          $this->pid = $pid;
          $this->parentId = $partof;
      }
      if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
          Helper::addMessage(
              htmlspecialchars(sprintf(Helper::getMessage('flash.documentSaved'), $firstValue('title'), $this->uid)),
              Helper::getMessage('flash.done', true),
              \TYPO3\CMS\Core\Messaging\FlashMessage::OK,
              true
          );
      }
      // Add document to index.
      if ($core) {
          Indexer::add($this, $core);
      } else {
          Helper::devLog('Invalid UID "' . $core . '" for Solr core', DEVLOG_SEVERITY_NOTICE);
      }
      return true;
  }
  /**
   * Determine the UID of the parent document while saving.
   *
   * Implementations return the ID of the parent document if the current
   * document has one and also save the parent document to the database and
   * the Solr index if its PID differs from the given $pid.
   * Currently only applies to METS documents.
   *
   * @access protected
   *
   * @abstract
   *
   * @param int $pid: The PID the current document is saved with
   * @param int $core: The UID of the Solr core used for indexing
   *
   * @return int The parent document's id.
   */
  abstract protected function getParentDocumentUidForSaving($pid, $core);
  /**
   * Getter for $this->hasFulltext, invoked through __get()
   *
   * ensureHasFulltextIsSet() is called first so the flag is determined
   * before it is read.
   *
   * @access protected
   *
   * @return bool Are there any fulltext files available?
   */
  protected function _getHasFulltext()
  {
      $this->ensureHasFulltextIsSet();

      return $this->hasFulltext;
  }
  /**
   * Getter for $this->location, invoked through __get()
   *
   * @access protected
   *
   * @return string The location of the document
   */
  protected function _getLocation()
  {
      return $this->location;
  }
  /**
   * Build the format specific part of the document's metadata array.
   *
   * Called by _getMetadataArray() whenever the metadata array has to be
   * (re)built for a PID.
   *
   * @access protected
   *
   * @abstract
   *
   * @param int $cPid: The PID of the metadata definitions
   */
  abstract protected function prepareMetadataArray($cPid);
  /**
   * This builds an array of the document's metadata
   *
   * The array is rebuilt when it has not been loaded yet or when it was
   * built for a different PID; the PID it was built for is kept at index 0.
   *
   * @access protected
   *
   * @return array Array of metadata with their corresponding logical structure node ID as key
   */
  protected function _getMetadataArray()
  {
      // Prefer the configured PID, fall back to the document's own PID.
      $cPid = $this->cPid ?: $this->pid;
      if (!$cPid) {
          Helper::devLog('Invalid PID ' . $cPid . ' for metadata definitions', DEVLOG_SEVERITY_ERROR);
          return [];
      }
      $needsRebuild = !$this->metadataArrayLoaded
          || $this->metadataArray[0] != $cPid;
      if ($needsRebuild) {
          $this->prepareMetadataArray($cPid);
          // Remember which PID the array belongs to.
          $this->metadataArray[0] = $cPid;
          $this->metadataArrayLoaded = true;
      }
      return $this->metadataArray;
  }
  /**
   * Getter for $this->numPages, invoked through __get()
   *
   * _getPhysicalStructure() is called before the value is read.
   *
   * @access protected
   *
   * @return int The total number of pages and/or tracks
   */
  protected function _getNumPages()
  {
      $this->_getPhysicalStructure();

      return $this->numPages;
  }
  /**
   * Getter for $this->parentId, invoked through __get()
   *
   * @access protected
   *
   * @return int The UID of the parent document or zero if not applicable
   */
  protected function _getParentId()
  {
      return $this->parentId;
  }
  /**
   * Build the array of the document's physical structure.
   *
   * @access protected
   *
   * @abstract
   *
   * @return array Array of physical elements' id, type, label and file representations ordered
   * by @ORDER attribute / IIIF Sequence's Canvases
   */
  abstract protected function _getPhysicalStructure();
  /**
   * This gives an array of the document's physical structure metadata
   *
   * The physical structure is built first if that has not happened yet.
   *
   * @access protected
   *
   * @return array Array of elements' type, label and file representations ordered by @ID attribute / Canvas order
   */
  protected function _getPhysicalStructureInfo()
  {
      if ($this->physicalStructureLoaded) {
          return $this->physicalStructureInfo;
      }
      // No physical structure array yet, so build it now.
      $this->_getPhysicalStructure();

      return $this->physicalStructureInfo;
  }
  /**
   * Getter for $this->pid, invoked through __get()
   *
   * @access protected
   *
   * @return int The PID of the document or zero if not in database
   */
  protected function _getPid()
  {
      return $this->pid;
  }
  /**
   * Getter for $this->ready, invoked through __get()
   *
   * @access protected
   *
   * @return bool Is the document instantiated successfully?
   */
  protected function _getReady()
  {
      return $this->ready;
  }
  /**
   * Getter for $this->recordId, invoked through __get()
   *
   * @access protected
   *
   * @return mixed The METS file's / IIIF manifest's record identifier
   */
  protected function _getRecordId()
  {
      return $this->recordId;
  }
  /**
   * This returns $this->rootId via __get()
   *
   * The root is found by following the parent links upwards: a document's
   * root is its parent's root, or the parent itself if the parent has no
   * root of its own. The result is computed once and then cached.
   *
   * @access protected
   *
   * @return int The UID of the root document or zero if not applicable
   */
  protected function _getRootId()
  {
      if (!$this->rootIdLoaded) {
          if ($this->parentId) {
              $parent = self::getInstance($this->parentId, $this->pid);
              $ancestorRootId = 0;
              if ($parent instanceof self) {
                  // Call the getter explicitly: inside this class "$parent->rootId"
                  // reads the protected property directly, bypasses __get() and
                  // would return the parent's not yet resolved value.
                  $ancestorRootId = $parent->_getRootId();
              }
              // A parent without a root of its own is the root itself.
              // NOTE(review): presumably $this->rootId defaults to zero - confirm.
              $this->rootId = $ancestorRootId ? $ancestorRootId : $this->parentId;
          }
          $this->rootIdLoaded = true;
      }
      return $this->rootId;
  }
  /**
   * Return the smLinks between the logical and physical structMap (METS).
   * For IIIF the relation between Canvases and Manifests / Ranges is
   * modelled in the same way.
   *
   * @access protected
   *
   * @abstract
   *
   * @return array The links between logical and physical nodes / Range, Manifest and Canvas
   */
  abstract protected function _getSmLinks();
  /**
   * This builds an array of the document's logical structure
   *
   * On first use getLogicalStructure('', true) is called to load all
   * logical structures; afterwards the stored array is returned.
   *
   * @access protected
   *
   * @return array Array of structure nodes' id, label, type and physical page indexes/mptr / Canvas link with original hierarchy preserved
   */
  protected function _getTableOfContents()
  {
      if ($this->tableOfContentsLoaded) {
          return $this->tableOfContents;
      }
      // No logical structure array yet: get all logical structures.
      $this->getLogicalStructure('', true);
      $this->tableOfContentsLoaded = true;

      return $this->tableOfContents;
  }
  /**
   * Return the location of the document's thumbnail.
   *
   * @access protected
   *
   * @abstract
   *
   * @param bool $forceReload: Force reloading the thumbnail instead of returning the cached value
   *
   * @return string The document's thumbnail location
   */
  abstract protected function _getThumbnail($forceReload = false);
  /**
   * Return the ID of the toplevel logical structure node.
   *
   * @access protected
   *
   * @abstract
   *
   * @return string The logical structure node's ID
   */
  abstract protected function _getToplevelId();
  /**
   * Getter for $this->uid, invoked through __get()
   *
   * @access protected
   *
   * @return mixed The UID or the URL of the document
   */
  protected function _getUid()
  {
      return $this->uid;
  }
  /**
   * Setter for $this->cPid, invoked through __set()
   *
   * The value is converted to an integer; negative values are stored as zero.
   *
   * @access protected
   *
   * @param int $value: The new PID for the metadata definitions
   *
   * @return void
   */
  protected function _setCPid($value)
  {
      $newPid = (int) $value;
      $this->cPid = $newPid > 0 ? $newPid : 0;
  }
  /**
   * Magic method invoked whenever the object is cloned
   *
   * @access protected
   *
   * @return void
   */
  protected function __clone()
  {
      // Intentionally empty and protected: singleton objects should not be cloned.
  }
  /**
   * This is a singleton class, thus the constructor should be private/protected
   * (Get an instance of this class by calling \Kitodo\Dlf\Common\Document::getInstance())
   *
   * A numeric $uid is looked up in "tx_dlf_documents" by UID. Any other value
   * is treated as the document's location: the document is loaded (or taken
   * from $preloadedDocument) and the database is searched for a record with
   * the same location or record identifier. If a record is found its values
   * are adopted; otherwise a successfully loaded document uses its location
   * as UID.
   *
   * @access protected
   *
   * @param int|string $uid: The UID of the document to parse or URL to XML file
   * @param int $pid: If > 0, then only document with this PID gets loaded
   * @param \SimpleXMLElement|IiifResourceInterface $preloadedDocument: Either null or the \SimpleXMLElement
   * or IiifResourceInterface that has been loaded to determine the basic document format.
   *
   * @return void
   */
  protected function __construct($uid, $pid, $preloadedDocument)
  {
      $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
          ->getQueryBuilderForTable('tx_dlf_documents');
      $location = '';
      // Prepare to check database for the requested document.
      if (MathUtility::canBeInterpretedAsInteger($uid)) {
          $whereClause = $queryBuilder->expr()->andX(
              $queryBuilder->expr()->eq('tx_dlf_documents.uid', intval($uid)),
              Helper::whereExpression('tx_dlf_documents')
          );
      } else {
          // Try to load METS file / IIIF manifest.
          $documentLoaded = $this->setPreloadedDocument($preloadedDocument)
              || (GeneralUtility::isValidUrl($uid) && $this->load($uid));
          if (!$documentLoaded) {
              // Loading failed.
              return;
          }
          // Initialize core METS object.
          $this->init();
          if ($this->getDocument() === null) {
              // No METS / IIIF part found.
              return;
          }
          // Cast to string for safety reasons.
          $location = (string) $uid;
          $this->establishRecordId($pid);
          if (
              empty($location)
              || empty($this->recordId)
          ) {
              // Can't persistently identify document, don't try to match at all.
              $whereClause = '1=-1';
          } else {
              // Try to match record identifier or location (both should be unique).
              $whereClause = $queryBuilder->expr()->andX(
                  $queryBuilder->expr()->orX(
                      $queryBuilder->expr()->eq('tx_dlf_documents.location', $queryBuilder->expr()->literal($location)),
                      $queryBuilder->expr()->eq('tx_dlf_documents.record_id', $queryBuilder->expr()->literal($this->recordId))
                  ),
                  Helper::whereExpression('tx_dlf_documents')
              );
          }
      }
      // Check for PID if needed.
      if ($pid) {
          $whereClause = $queryBuilder->expr()->andX(
              $whereClause,
              $queryBuilder->expr()->eq('tx_dlf_documents.pid', intval($pid))
          );
      }
      // Get document PID and location from database.
      $statement = $queryBuilder
          ->select(
              'tx_dlf_documents.uid AS uid',
              'tx_dlf_documents.pid AS pid',
              'tx_dlf_documents.record_id AS record_id',
              'tx_dlf_documents.partof AS partof',
              'tx_dlf_documents.thumbnail AS thumbnail',
              'tx_dlf_documents.location AS location'
          )
          ->from('tx_dlf_documents')
          ->where($whereClause)
          ->setMaxResults(1)
          ->execute();
      $record = $statement->fetch();
      if ($record) {
          // Adopt the stored values of the matching record.
          $this->uid = $record['uid'];
          $this->pid = $record['pid'];
          $this->recordId = $record['record_id'];
          $this->parentId = $record['partof'];
          $this->thumbnail = $record['thumbnail'];
          $this->location = $record['location'];
          $this->thumbnailLoaded = true;
          // Load XML file if necessary...
          if (
              $this->getDocument() === null
              && $this->load($this->location)
          ) {
              // ...and set some basic properties.
              $this->init();
          }
          // Do we have a METS / IIIF object now?
          if ($this->getDocument() !== null) {
              // Set new location if necessary.
              if (!empty($location)) {
                  $this->location = $location;
              }
              // Document ready!
              $this->ready = true;
          }
      } elseif ($this->getDocument() !== null) {
          // Set location as UID for documents not in database.
          $this->uid = $location;
          $this->location = $location;
          // Document ready!
          $this->ready = true;
      } else {
          Helper::devLog('No document with UID ' . $uid . ' found or document not accessible', DEVLOG_SEVERITY_ERROR);
      }
  }
  /**
   * Magic method called each time an invisible property is read from the object
   *
   * The read is delegated to the method "_get" followed by the property
   * name with its first letter in upper case. If the property or that method
   * does not exist, a warning is logged and nothing is returned.
   *
   * @access public
   *
   * @param string $var: Name of variable to get
   *
   * @return mixed Value of $this->$var
   */
  public function __get($var)
  {
      $getter = '_get' . ucfirst($var);
      if (
          property_exists($this, $var)
          && method_exists($this, $getter)
      ) {
          return $this->$getter();
      }
      Helper::devLog('There is no getter function for property "' . $var . '"', DEVLOG_SEVERITY_WARNING);
      return;
  }
  /**
   * Magic method called each time an invisible property is checked with isset() or empty()
   *
   * The value is obtained through __get() and tested with empty().
   *
   * @access public
   *
   * @param string $var: Name of variable to check
   *
   * @return bool true if variable is set and not empty, false otherwise
   */
  public function __isset($var)
  {
      $value = $this->__get($var);

      return !empty($value);
  }
  /**
   * Magic method called each time an invisible property is written on the object
   *
   * The write is delegated to the method "_set" followed by the property
   * name with its first letter in upper case. If the property or that method
   * does not exist, a warning is logged and the value is discarded.
   *
   * @access public
   *
   * @param string $var: Name of variable to set
   * @param mixed $value: New value of variable
   *
   * @return void
   */
  public function __set($var, $value)
  {
      $setter = '_set' . ucfirst($var);
      if (
          property_exists($this, $var)
          && method_exists($this, $setter)
      ) {
          $this->$setter($value);
          return;
      }
      Helper::devLog('There is no setter function for property "' . $var . '"', DEVLOG_SEVERITY_WARNING);
  }
  1644. }