You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

546 lines
22 KiB

  1. <?php
  2. /**
  3. * (c) Kitodo. Key to digital objects e.V. <contact@kitodo.org>
  4. *
  5. * This file is part of the Kitodo and TYPO3 projects.
  6. *
  7. * @license GNU General Public License version 3 or later.
  8. * For the full copyright and license information, please read the
  9. * LICENSE.txt file that was distributed with this source code.
  10. */
  11. namespace Kitodo\Dlf\Common;
  12. use TYPO3\CMS\Core\Database\ConnectionPool;
  13. use TYPO3\CMS\Core\Messaging\FlashMessage;
  14. use TYPO3\CMS\Core\Utility\GeneralUtility;
  15. use TYPO3\CMS\Core\Utility\MathUtility;
  16. use Ubl\Iiif\Presentation\Common\Model\Resources\AnnotationContainerInterface;
  17. use Ubl\Iiif\Tools\IiifHelper;
  18. /**
  19. * Indexer class for the 'dlf' extension
  20. *
  21. * @author Sebastian Meyer <sebastian.meyer@slub-dresden.de>
  22. * @package TYPO3
  23. * @subpackage dlf
  24. * @access public
  25. */
  26. class Indexer
  27. {
  /**
   * Key of the extension this class belongs to
   *
   * @var string
   * @access public
   */
  public static $extKey = 'dlf';

  /**
   * Indexing configuration of the metadata fields, grouped by feature.
   * Every group except 'fieldboost' is a list of index names; 'fieldboost'
   * maps an index name to its boost factor (float) or false.
   * @see loadIndexConf()
   *
   * @var array
   * @access protected
   */
  protected static $fields = [
      'autocomplete' => [],
      'facets' => [],
      'sortables' => [],
      'indexed' => [],
      'stored' => [],
      'tokenized' => [],
      'fieldboost' => [],
  ];

  /**
   * Flag telling whether $fields has already been populated
   * @see $fields
   *
   * @var bool
   * @access protected
   */
  protected static $fieldsLoaded = false;

  /**
   * UIDs of the documents which were already passed through add()
   *
   * @var array
   * @access protected
   */
  protected static $processedDocs = [];

  /**
   * The \Kitodo\Dlf\Common\Solr instance used for all index operations
   *
   * @var \Kitodo\Dlf\Common\Solr
   * @access protected
   */
  protected static $solr;
  /**
   * Insert given document into Solr index
   *
   * A document is processed at most once while this class' static state lives.
   * If the document has a parent (multi-volume work), the parent is indexed
   * first. All Solr documents stored for the document's UID are deleted before
   * its logical units and (if available) fulltext pages are indexed again.
   *
   * @access public
   *
   * @param \Kitodo\Dlf\Common\Document &$doc: The document to add
   * @param int $core: UID of the Solr core to use
   *
   * @return int 0 on success or 1 on failure
   */
  public static function add(Document &$doc, $core = 0)
  {
      if (in_array($doc->uid, self::$processedDocs)) {
          // Already handled, nothing to do.
          return 0;
      }
      if (!self::solrConnect($core, $doc->pid)) {
          if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
              Helper::addMessage(
                  Helper::getMessage('flash.solrNoConnection', true),
                  Helper::getMessage('flash.warning', true),
                  FlashMessage::WARNING,
                  true,
                  'core.template.flashMessages'
              );
          }
          Helper::devLog('Could not connect to Apache Solr server', DEVLOG_SEVERITY_ERROR);
          return 1;
      }
      $errors = 0;
      // Handle multi-volume documents.
      if ($doc->parentId) {
          $parent = Document::getInstance($doc->parentId, 0, true);
          if (!$parent->ready) {
              Helper::devLog('Could not load parent document with UID ' . $doc->parentId, DEVLOG_SEVERITY_ERROR);
              return 1;
          }
          $errors = self::add($parent, $core);
      }
      try {
          // Add document to list of processed documents.
          self::$processedDocs[] = $doc->uid;
          // Delete old Solr documents.
          $updateQuery = self::$solr->service->createUpdate();
          $updateQuery->addDeleteQuery('uid:' . $doc->uid);
          self::$solr->service->update($updateQuery);
          // Index every logical unit as separate Solr document.
          foreach ($doc->tableOfContents as $logicalUnit) {
              if ($errors) {
                  break;
              }
              $errors = self::processLogical($doc, $logicalUnit);
          }
          // Index fulltext files if available.
          if ($doc->hasFulltext) {
              foreach ($doc->physicalStructure as $pageNumber => $xmlId) {
                  if ($errors) {
                      break;
                  }
                  $errors = self::processPhysical($doc, $pageNumber, $doc->physicalStructureInfo[$xmlId]);
              }
          }
          // Commit all changes.
          $updateQuery = self::$solr->service->createUpdate();
          $updateQuery->addCommit();
          self::$solr->service->update($updateQuery);
          // The title is only needed for the flash message, so skip the lookup in CLI mode.
          if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
              $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                  ->getQueryBuilderForTable('tx_dlf_documents');
              // Get document title from database.
              $result = $queryBuilder
                  ->select('tx_dlf_documents.title AS title')
                  ->from('tx_dlf_documents')
                  ->where(
                      $queryBuilder->expr()->eq('tx_dlf_documents.uid', intval($doc->uid)),
                      Helper::whereExpression('tx_dlf_documents')
                  )
                  ->setMaxResults(1)
                  ->execute();
              $allResults = $result->fetchAll();
              // The record may be hidden or deleted; fall back to an empty title
              // instead of reading an undefined offset.
              $title = $allResults[0]['title'] ?? '';
              if (!$errors) {
                  Helper::addMessage(
                      htmlspecialchars(sprintf(Helper::getMessage('flash.documentIndexed'), $title, $doc->uid)),
                      Helper::getMessage('flash.done', true),
                      FlashMessage::OK,
                      true,
                      'core.template.flashMessages'
                  );
              } else {
                  Helper::addMessage(
                      htmlspecialchars(sprintf(Helper::getMessage('flash.documentNotIndexed'), $title, $doc->uid)),
                      Helper::getMessage('flash.error', true),
                      FlashMessage::ERROR,
                      true,
                      'core.template.flashMessages'
                  );
              }
          }
          return $errors;
      } catch (\Exception $e) {
          if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
              Helper::addMessage(
                  Helper::getMessage('flash.solrException', true) . '<br />' . htmlspecialchars($e->getMessage()),
                  Helper::getMessage('flash.error', true),
                  FlashMessage::ERROR,
                  true,
                  'core.template.flashMessages'
              );
          }
          Helper::devLog('Apache Solr threw exception: "' . $e->getMessage() . '"', DEVLOG_SEVERITY_ERROR);
          return 1;
      }
  }
  /**
   * Returns the dynamic index field name for the given metadata field.
   *
   * The returned name is the given name followed by an underscore and three
   * letters: 't' (tokenized), 's' (stored) and 'i' (indexed), each one
   * replaced by 'u' if the respective option is not set for the field.
   *
   * @access public
   *
   * @param string $index_name: The metadata field's name in database
   * @param int $pid: UID of the configuration page
   *
   * @return string The field's dynamic index name or an empty string if $pid is invalid
   */
  public static function getIndexFieldName($index_name, $pid = 0)
  {
      // Sanitize input: anything which is not a positive integer becomes 0.
      $pid = max(intval($pid), 0);
      if ($pid === 0) {
          Helper::devLog('Invalid PID ' . $pid . ' for metadata configuration', DEVLOG_SEVERITY_ERROR);
          return '';
      }
      // Load metadata configuration.
      self::loadIndexConf($pid);
      // Configuration group => letter to use if the field is member of that group.
      $letters = [
          'tokenized' => 't',
          'stored' => 's',
          'indexed' => 'i',
      ];
      // Build field's suffix.
      $suffix = '';
      foreach ($letters as $group => $letter) {
          $suffix .= in_array($index_name, self::$fields[$group]) ? $letter : 'u';
      }
      return $index_name . '_' . $suffix;
  }
  /**
   * Load indexing configuration
   *
   * Reads the metadata records of the given page and sorts their index names
   * into the groups of self::$fields. The configuration is loaded only once:
   * as soon as self::$fieldsLoaded is set, further calls return immediately,
   * regardless of the $pid passed.
   *
   * @access protected
   *
   * @param int $pid: The configuration page's UID
   *
   * @return void
   */
  protected static function loadIndexConf($pid)
  {
      if (self::$fieldsLoaded) {
          return;
      }
      $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
          ->getQueryBuilderForTable('tx_dlf_metadata');
      // Get the metadata indexing options.
      $statement = $queryBuilder
          ->select(
              'tx_dlf_metadata.index_name AS index_name',
              'tx_dlf_metadata.index_tokenized AS index_tokenized',
              'tx_dlf_metadata.index_stored AS index_stored',
              'tx_dlf_metadata.index_indexed AS index_indexed',
              'tx_dlf_metadata.is_sortable AS is_sortable',
              'tx_dlf_metadata.is_facet AS is_facet',
              'tx_dlf_metadata.is_listed AS is_listed',
              'tx_dlf_metadata.index_autocomplete AS index_autocomplete',
              'tx_dlf_metadata.index_boost AS index_boost'
          )
          ->from('tx_dlf_metadata')
          ->where(
              $queryBuilder->expr()->eq('tx_dlf_metadata.pid', intval($pid)),
              Helper::whereExpression('tx_dlf_metadata')
          )
          ->execute();
      while ($row = $statement->fetch()) {
          $name = $row['index_name'];
          if ($row['index_tokenized']) {
              self::$fields['tokenized'][] = $name;
          }
          // Listed fields have to be stored, too.
          if ($row['index_stored'] || $row['is_listed']) {
              self::$fields['stored'][] = $name;
          }
          // Autocomplete fields have to be indexed, too.
          if ($row['index_indexed'] || $row['index_autocomplete']) {
              self::$fields['indexed'][] = $name;
          }
          if ($row['is_sortable']) {
              self::$fields['sortables'][] = $name;
          }
          if ($row['is_facet']) {
              self::$fields['facets'][] = $name;
          }
          if ($row['index_autocomplete']) {
              self::$fields['autocomplete'][] = $name;
          }
          // A boost which is not positive is stored as false.
          self::$fields['fieldboost'][$name] = $row['index_boost'] > 0.0
              ? floatval($row['index_boost'])
              : false;
      }
      self::$fieldsLoaded = true;
  }
  /**
   * Processes a logical unit (and its children) for the Solr index
   *
   * A Solr document is only created if the unit has metadata; child units are
   * processed in any case. Optional metadata (title, volume, URN, ...) which
   * is missing is passed on as null instead of reading undefined array keys.
   *
   * @access protected
   *
   * @param \Kitodo\Dlf\Common\Document &$doc: The METS document
   * @param array $logicalUnit: Array of the logical unit to process
   *
   * @return int 0 on success or 1 on failure
   */
  protected static function processLogical(Document &$doc, array $logicalUnit)
  {
      $errors = 0;
      // Get metadata for logical unit.
      $metadata = $doc->metadataArray[$logicalUnit['id']];
      if (!empty($metadata)) {
          // Remove appended "valueURI" from authors' names for indexing.
          if (isset($metadata['author']) && is_array($metadata['author'])) {
              foreach ($metadata['author'] as $i => $author) {
                  $splitName = explode(chr(31), $author);
                  $metadata['author'][$i] = $splitName[0];
              }
          }
          // Create new Solr document.
          $updateQuery = self::$solr->service->createUpdate();
          $solrDoc = $updateQuery->createDocument();
          // Create unique identifier from document's UID and unit's XML ID.
          $solrDoc->setField('id', $doc->uid . $logicalUnit['id']);
          $solrDoc->setField('uid', $doc->uid);
          $solrDoc->setField('pid', $doc->pid);
          if (
              isset($logicalUnit['points'])
              && MathUtility::canBeInterpretedAsInteger($logicalUnit['points'])
          ) {
              $solrDoc->setField('page', $logicalUnit['points']);
          }
          if ($logicalUnit['id'] == $doc->toplevelId) {
              $solrDoc->setField('thumbnail', $doc->thumbnail);
          } elseif (!empty($logicalUnit['thumbnailId'])) {
              $solrDoc->setField('thumbnail', $doc->getFileLocation($logicalUnit['thumbnailId']));
          }
          $solrDoc->setField('partof', $doc->parentId);
          $solrDoc->setField('root', $doc->rootId);
          $solrDoc->setField('sid', $logicalUnit['id']);
          // There can be only one toplevel unit per UID, independently of backend configuration
          $solrDoc->setField('toplevel', $logicalUnit['id'] == $doc->toplevelId);
          $solrDoc->setField('type', $logicalUnit['type'], self::$fields['fieldboost']['type'] ?? null);
          $solrDoc->setField('title', $metadata['title'][0] ?? null, self::$fields['fieldboost']['title'] ?? null);
          $solrDoc->setField('volume', $metadata['volume'][0] ?? null, self::$fields['fieldboost']['volume'] ?? null);
          $solrDoc->setField('record_id', $metadata['record_id'][0] ?? null);
          $solrDoc->setField('purl', $metadata['purl'][0] ?? null);
          $solrDoc->setField('location', $doc->location);
          $solrDoc->setField('urn', $metadata['urn'] ?? null);
          $solrDoc->setField('license', $metadata['license'] ?? null);
          $solrDoc->setField('terms', $metadata['terms'] ?? null);
          $solrDoc->setField('restrictions', $metadata['restrictions'] ?? null);
          $solrDoc->setField('collection', $doc->metadataArray[$doc->toplevelId]['collection']);
          // Only decode coordinates which are present and only index an existing feature.
          $coordinates = isset($metadata['coordinates'][0])
              ? json_decode($metadata['coordinates'][0])
              : null;
          if (
              is_object($coordinates)
              && isset($coordinates->features[0])
          ) {
              $solrDoc->setField('geom', json_encode($coordinates->features[0]));
          }
          $autocomplete = [];
          foreach ($metadata as $index_name => $data) {
              if (
                  empty($data)
                  || substr($index_name, -8) === '_sorting'
              ) {
                  continue;
              }
              // self::$fields is read after getIndexFieldName() because that call may load the configuration.
              $solrDoc->setField(
                  self::getIndexFieldName($index_name, $doc->pid),
                  $data,
                  self::$fields['fieldboost'][$index_name] ?? null
              );
              if (in_array($index_name, self::$fields['sortables'])) {
                  // Add sortable fields to index.
                  $solrDoc->setField($index_name . '_sorting', $metadata[$index_name . '_sorting'][0] ?? null);
              }
              if (in_array($index_name, self::$fields['facets'])) {
                  // Add facets to index.
                  $solrDoc->setField($index_name . '_faceting', $data);
              }
              if (in_array($index_name, self::$fields['autocomplete'])) {
                  // Cast to array so that a scalar value cannot break array_merge().
                  $autocomplete = array_merge($autocomplete, (array) $data);
              }
          }
          // Add autocomplete values to index.
          if (!empty($autocomplete)) {
              $solrDoc->setField('autocomplete', $autocomplete);
          }
          // Add collection information to logical sub-elements if applicable.
          if (
              in_array('collection', self::$fields['facets'])
              && empty($metadata['collection'])
              && !empty($doc->metadataArray[$doc->toplevelId]['collection'])
          ) {
              $solrDoc->setField('collection_faceting', $doc->metadataArray[$doc->toplevelId]['collection']);
          }
          try {
              $updateQuery->addDocument($solrDoc);
              self::$solr->service->update($updateQuery);
          } catch (\Exception $e) {
              if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
                  Helper::addMessage(
                      Helper::getMessage('flash.solrException', true) . '<br />' . htmlspecialchars($e->getMessage()),
                      Helper::getMessage('flash.error', true),
                      FlashMessage::ERROR,
                      true,
                      'core.template.flashMessages'
                  );
              }
              // Log the failure like add() does, so it is not lost in CLI mode.
              Helper::devLog('Apache Solr threw exception: "' . $e->getMessage() . '"', DEVLOG_SEVERITY_ERROR);
              return 1;
          }
      }
      // Check for child elements...
      if (!empty($logicalUnit['children'])) {
          foreach ($logicalUnit['children'] as $child) {
              if ($errors) {
                  break;
              }
              // ...and process them, too.
              $errors = self::processLogical($doc, $child);
          }
      }
      return $errors;
  }
  /**
   * Processes a physical unit for the Solr index
   *
   * A Solr document is only created if the document has fulltext and the raw
   * text of the given unit is not empty; otherwise the unit is skipped and
   * treated as success.
   *
   * @access protected
   *
   * @param \Kitodo\Dlf\Common\Document &$doc: The METS document
   * @param int $page: The page number
   * @param array $physicalUnit: Array of the physical unit to process
   *
   * @return int 0 on success or 1 on failure
   */
  protected static function processPhysical(Document &$doc, $page, array $physicalUnit)
  {
      if (!$doc->hasFulltext) {
          return 0;
      }
      $fulltext = $doc->getRawText($physicalUnit['id']);
      if (!$fulltext) {
          return 0;
      }
      // Read extension configuration.
      $extConf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf'][self::$extKey]);
      // Create new Solr document.
      $updateQuery = self::$solr->service->createUpdate();
      $solrDoc = $updateQuery->createDocument();
      // Create unique identifier from document's UID and unit's XML ID.
      $solrDoc->setField('id', $doc->uid . $physicalUnit['id']);
      $solrDoc->setField('uid', $doc->uid);
      $solrDoc->setField('pid', $doc->pid);
      $solrDoc->setField('page', $page);
      // Use the first configured thumbnail file group which has a file for this unit.
      // Iterating with foreach makes sure an empty entry in the list does not end the search.
      foreach (GeneralUtility::trimExplode(',', $extConf['fileGrpThumbs']) as $fileGrpThumb) {
          if (!empty($physicalUnit['files'][$fileGrpThumb])) {
              $solrDoc->setField('thumbnail', $doc->getFileLocation($physicalUnit['files'][$fileGrpThumb]));
              break;
          }
      }
      $solrDoc->setField('partof', $doc->parentId);
      $solrDoc->setField('root', $doc->rootId);
      $solrDoc->setField('sid', $physicalUnit['id']);
      $solrDoc->setField('toplevel', false);
      $solrDoc->setField('type', $physicalUnit['type'], self::$fields['fieldboost']['type'] ?? null);
      $solrDoc->setField('collection', $doc->metadataArray[$doc->toplevelId]['collection']);
      $solrDoc->setField('fulltext', htmlspecialchars($fulltext));
      // Add faceting information to physical sub-elements if applicable.
      foreach ($doc->metadataArray[$doc->toplevelId] as $index_name => $data) {
          if (
              empty($data)
              || substr($index_name, -8) === '_sorting'
              || !in_array($index_name, self::$fields['facets'])
          ) {
              continue;
          }
          // Remove appended "valueURI" from authors' names for indexing.
          if (
              $index_name === 'author'
              && is_array($data)
          ) {
              foreach ($data as $i => $author) {
                  $splitName = explode(chr(31), $author);
                  $data[$i] = $splitName[0];
              }
          }
          // Add facets to index.
          $solrDoc->setField($index_name . '_faceting', $data);
      }
      // Add collection information to physical sub-elements if applicable.
      if (
          in_array('collection', self::$fields['facets'])
          && !empty($doc->metadataArray[$doc->toplevelId]['collection'])
      ) {
          $solrDoc->setField('collection_faceting', $doc->metadataArray[$doc->toplevelId]['collection']);
      }
      try {
          $updateQuery->addDocument($solrDoc);
          self::$solr->service->update($updateQuery);
      } catch (\Exception $e) {
          if (!(\TYPO3_REQUESTTYPE & \TYPO3_REQUESTTYPE_CLI)) {
              Helper::addMessage(
                  Helper::getMessage('flash.solrException', true) . '<br />' . htmlspecialchars($e->getMessage()),
                  Helper::getMessage('flash.error', true),
                  FlashMessage::ERROR,
                  true,
                  'core.template.flashMessages'
              );
          }
          // Log the failure like add() does, so it is not lost in CLI mode.
          Helper::devLog('Apache Solr threw exception: "' . $e->getMessage() . '"', DEVLOG_SEVERITY_ERROR);
          return 1;
      }
      return 0;
  }
  /**
   * Connects to Solr server.
   *
   * An existing connection is reused without looking at the arguments; the
   * indexing configuration is only loaded when a new connection is made.
   *
   * @access protected
   *
   * @param int $core: UID of the Solr core
   * @param int $pid: UID of the configuration page
   *
   * @return bool true on success or false on failure
   */
  protected static function solrConnect($core, $pid = 0)
  {
      // Nothing to do if there already is a Solr instance.
      if (self::$solr) {
          return true;
      }
      // Connect to Solr server.
      $instance = Solr::getInstance($core);
      if (!$instance->ready) {
          return false;
      }
      self::$solr = $instance;
      // Load indexing configuration if needed.
      if ($pid) {
          self::loadIndexConf($pid);
      }
      return true;
  }
  /**
   * Private constructor which keeps the class from being instantiated
   *
   * @access private
   */
  private function __construct()
  {
      // Intentionally empty: this class only offers static members.
  }
  524. }