Skip to content

Commit

Permalink
[FEATURE] Implement full text reader (#1402)
Browse files Browse the repository at this point in the history
Co-authored-by: Sebastian Meyer <[email protected]>
  • Loading branch information
beatrycze-volk and sebastian-meyer authored Dec 16, 2024
1 parent 3d09497 commit bd529ae
Show file tree
Hide file tree
Showing 4 changed files with 187 additions and 113 deletions.
106 changes: 0 additions & 106 deletions Classes/Common/AbstractDocument.php
Original file line number Diff line number Diff line change
Expand Up @@ -613,112 +613,6 @@ public function getPhysicalPage(string $logicalPage): int
return 1;
}

/**
 * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
 * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
 * to be given in the Canvas' / Manifest's "seeAlso" property.
 *
 * The configured full text file groups are checked in order; the first group for which the
 * node has a file is used. An empty string is returned (and a warning is logged) if the node
 * is unknown, the file cannot be loaded or its format is not registered in $this->formats.
 *
 * @param string $id The "@ID" attribute of the physical structure node (METS) or the "@id" property
 * of the Manifest / Range (IIIF)
 *
 * @return string The OCR full text
 */
protected function getFullTextFromXml(string $id): string
{
    // Load available text formats, ...
    $this->loadFormats();
    // ... physical structure ...
    $this->magicGetPhysicalStructure();
    // ... and extension configuration.
    $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'files');
    $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']);

    if (empty($this->physicalStructureInfo[$id])) {
        $this->logger->warning('Invalid structure node @ID "' . $id . '"');
        return '';
    }

    $fileContent = null;
    $textFormat = '';
    // Iterate with foreach: a "while (array_shift())" loop would stop at the first falsy
    // group name (e.g. an empty entry caused by a double comma) and skip all later groups.
    foreach ($fileGrpsFulltext as $fileGrpFulltext) {
        if (empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
            continue;
        }
        // Get full text file.
        $fileContent = GeneralUtility::getUrl(
            $this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])
        );
        if ($fileContent === false) {
            $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
            return '';
        }
        $textFormat = $this->getTextFormat($fileContent);
        // Only the first file group that provides a file is considered.
        break;
    }

    // Is this text format supported?
    // This branch is also taken when no file group provided a file at all.
    if (empty($fileContent) || empty($this->formats[$textFormat])) {
        $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
        return '';
    }

    // A registered format without a handler class yields no text.
    if (empty($this->formats[$textFormat]['class'])) {
        return '';
    }

    return $this->getRawTextFromClass($id, $fileContent, $textFormat);
}

/**
 * Get raw text from class for given format.
 *
 * Instantiates the handler class registered in $this->formats for the given format, lets it
 * convert the XML to MiniOCR and stores the result in $this->rawTextArray under the node id.
 * If the class does not exist or does not implement FulltextInterface, a warning is logged
 * and an empty string is returned.
 *
 * @access private
 *
 * @param string $id The id of the structure node the text belongs to
 * @param string $fileContent The content of the XML file
 * @param string $textFormat The key of the format in $this->formats
 *
 * @return string The text as returned by the handler's getTextAsMiniOcr(), or '' on failure
 */
private function getRawTextFromClass($id, $fileContent, $textFormat): string
{
    $textMiniOcr = '';
    $class = $this->formats[$textFormat]['class'];
    // Get the raw text from class.
    if (class_exists($class)) {
        $obj = GeneralUtility::makeInstance($class);
        if ($obj instanceof FulltextInterface) {
            // Load XML from file.
            $ocrTextXml = Helper::getXmlFileAsString($fileContent);
            $textMiniOcr = $obj->getTextAsMiniOcr($ocrTextXml);
            $this->rawTextArray[$id] = $textMiniOcr;
        } else {
            // Name the method that is actually called on the handler.
            $this->logger->warning('Invalid class/method "' . $class . '->getTextAsMiniOcr()" for text format "' . $textFormat . '"');
        }
    } else {
        $this->logger->warning('Class "' . $class . '" does not exist for "' . $textFormat . '" text format');
    }
    return $textMiniOcr;
}

/**
 * Determine the format of the OCR full text.
 *
 * The format is the upper-cased name of the XML root element.
 *
 * @access private
 *
 * @param string $fileContent content of the XML file
 *
 * @return string The format of the OCR full text, or '' if the XML could not be loaded
 */
private function getTextFormat(string $fileContent): string
{
    $rootElement = Helper::getXmlFileAsString($fileContent);

    // Without a loadable document there is no root element to take the name from.
    if ($rootElement === false) {
        return '';
    }

    return strtoupper($rootElement->getName());
}

/**
* This determines a title for the given document
*
Expand Down
157 changes: 157 additions & 0 deletions Classes/Common/FullTextReader.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
<?php

/**
* (c) Kitodo. Key to digital objects e.V. <[email protected]>
*
* This file is part of the Kitodo and TYPO3 projects.
*
* @license GNU General Public License version 3 or later.
* For the full copyright and license information, please read the
* LICENSE.txt file that was distributed with this source code.
*/

namespace Kitodo\Dlf\Common;

use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
use TYPO3\CMS\Core\Log\Logger;
use TYPO3\CMS\Core\Log\LogManager;
use TYPO3\CMS\Core\Utility\GeneralUtility;

/**
 * Reads the OCR full text of a physical structure node from its XML full text file
 * and converts it using the handler class registered for the detected format.
 */
class FullTextReader
{
    /**
     * @access private
     * @var Logger This holds the logger
     */
    private Logger $logger;

    /**
     * @access private
     * @var array This holds all formats
     */
    private array $formats;

    /**
     * Constructor
     *
     * @param array $formats The available formats, keyed by upper-cased XML root element name;
     * each entry may carry a 'class' naming the handler implementing FulltextInterface
     */
    public function __construct(array $formats)
    {
        $this->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(get_class($this));
        $this->formats = $formats;
    }

    /**
     * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
     * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
     * to be given in the Canvas' / Manifest's "seeAlso" property.
     *
     * The configured full text file groups are checked in order; the first group for which the
     * node has a file is used. An empty string is returned (and a warning is logged) if the node
     * is empty, the file cannot be loaded or its format is not registered.
     *
     * @param string $id The "@ID" attribute of the physical structure node (METS) or the "@id" property
     * of the Manifest / Range (IIIF)
     * @param array $fileLocations The locations of the XML files, keyed by file group name
     * @param mixed $physicalStructureNode The physical structure node (METS) or the Manifest / Range (IIIF)
     *
     * @return string The OCR full text
     */
    public function getFromXml(string $id, array $fileLocations, $physicalStructureNode): string
    {
        if (empty($physicalStructureNode)) {
            $this->logger->warning('Invalid structure node @ID "' . $id . '"');
            return '';
        }

        $fileContent = null;
        $textFormat = '';
        // Iterate with foreach: a "while (array_shift())" loop would stop at the first falsy
        // group name (e.g. an empty entry caused by a double comma) and skip all later groups.
        foreach ($this->getFullTextFileGroups() as $fileGrpFulltext) {
            if (empty($physicalStructureNode['files'][$fileGrpFulltext])) {
                continue;
            }
            // Get full text file. A missing location is handled like a failed download
            // instead of reading an undefined array key.
            $fileContent = empty($fileLocations[$fileGrpFulltext])
                ? false
                : GeneralUtility::getUrl($fileLocations[$fileGrpFulltext]);
            if ($fileContent === false) {
                $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
                return '';
            }
            $textFormat = $this->getTextFormat($fileContent);
            // Only the first file group that provides a file is considered.
            break;
        }

        // Is this text format supported?
        // This branch is also taken when no file group provided a file at all.
        if (empty($fileContent) || empty($this->formats[$textFormat])) {
            $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
            return '';
        }

        // A registered format without a handler class yields no text.
        if (empty($this->formats[$textFormat]['class'])) {
            return '';
        }

        return $this->getRawTextFromClass($fileContent, $textFormat);
    }

    /**
     * Get raw text from class for given format.
     *
     * Instantiates the handler class registered for the format and lets it convert the XML
     * to MiniOCR. If the class does not exist or does not implement FulltextInterface,
     * a warning is logged and an empty string is returned.
     *
     * @access private
     *
     * @param string $fileContent The content of the XML file
     * @param string $textFormat The key of the format in $this->formats
     *
     * @return string The text as returned by the handler's getTextAsMiniOcr(), or '' on failure
     */
    private function getRawTextFromClass(string $fileContent, string $textFormat): string
    {
        $textMiniOcr = '';
        $class = $this->formats[$textFormat]['class'];
        // Get the raw text from class.
        if (class_exists($class)) {
            $obj = GeneralUtility::makeInstance($class);
            if ($obj instanceof FulltextInterface) {
                // Load XML from file.
                $ocrTextXml = Helper::getXmlFileAsString($fileContent);
                $textMiniOcr = $obj->getTextAsMiniOcr($ocrTextXml);
            } else {
                // Name the method that is actually called on the handler.
                $this->logger->warning('Invalid class/method "' . $class . '->getTextAsMiniOcr()" for text format "' . $textFormat . '"');
            }
        } else {
            $this->logger->warning('Class "' . $class . '" does not exist for "' . $textFormat . '" text format');
        }
        return $textMiniOcr;
    }

    /**
     * Get full text file groups from extension configuration.
     *
     * @return array The comma-separated 'fileGrpFulltext' setting split into trimmed names
     */
    private function getFullTextFileGroups(): array
    {
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('dlf', 'files');
        return GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']);
    }

    /**
     * Get format of the OCR full text
     *
     * @access private
     *
     * @param string $fileContent The content of the XML file
     *
     * @return string The upper-cased name of the XML root element, or '' if the XML could not be loaded
     */
    private function getTextFormat(string $fileContent): string
    {
        $xml = Helper::getXmlFileAsString($fileContent);

        if ($xml === false) {
            return '';
        }

        // Get the root element's name as text format.
        return strtoupper($xml->getName());
    }
}
14 changes: 10 additions & 4 deletions Classes/Common/IiifManifest.php
Original file line number Diff line number Diff line change
Expand Up @@ -755,17 +755,23 @@ public function getFullText(string $id): string
// ... and extension configuration.
$extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey);
$fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['files']['fileGrpFulltext']);
if (!empty($this->physicalStructureInfo[$id])) {

$physicalStructureNode = $this->physicalStructureInfo[$id];
if (!empty($physicalStructureNode)) {
while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) {
if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
$rawText = parent::getFullTextFromXml($id);
if (!empty($physicalStructureNode['files'][$fileGrpFulltext])) {
$rawText = GeneralUtility::makeInstance(FullTextReader::class, $this->formats)->getFromXml(
$id,
[$fileGrpFulltext => $this->getFileLocation($physicalStructureNode['files'][$fileGrpFulltext])],
$physicalStructureNode
);
break;
}
}
if ($extConf['iiif']['indexAnnotations'] == 1) {
$iiifResource = $this->iiif->getContainedResourceById($id);
// Get annotation containers
$annotationContainerIds = $this->physicalStructureInfo[$id]['annotationContainers'];
$annotationContainerIds = $physicalStructureNode['annotationContainers'];
if (!empty($annotationContainerIds)) {
$annotationTexts = $this->getAnnotationTexts($annotationContainerIds, $iiifResource->getId());
$rawText .= implode(' ', $annotationTexts);
Expand Down
23 changes: 20 additions & 3 deletions Classes/Common/MetsDocument.php
Original file line number Diff line number Diff line change
Expand Up @@ -1116,12 +1116,29 @@ function ($element) {
public function getFullText(string $id): string
{
    $fullText = '';

    // Load available text formats, ...
    $this->loadFormats();
    // ... physical structure ...
    $this->magicGetPhysicalStructure();
    // ... fileGrps and check for full text files.
    $this->magicGetFileGrps();

    if ($this->hasFulltext) {
        $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'files');
        $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']);

        // An unknown id yields an empty node, which the reader reports as invalid.
        $physicalStructureNode = $this->physicalStructureInfo[$id] ?? [];

        // Resolve the location only for file groups the node actually has a file in;
        // the reader never looks up a location for any other group.
        $fileLocations = [];
        foreach ($fileGrpsFulltext as $fileGrpFulltext) {
            if (!empty($physicalStructureNode['files'][$fileGrpFulltext])) {
                $fileLocations[$fileGrpFulltext] = $this->getFileLocation($physicalStructureNode['files'][$fileGrpFulltext]);
            }
        }

        // The reader is the single source of the full text; the file is fetched only once.
        $fullText = GeneralUtility::makeInstance(FullTextReader::class, $this->formats)->getFromXml($id, $fileLocations, $physicalStructureNode);
    }

    return $fullText;
}

Expand Down

0 comments on commit bd529ae

Please sign in to comment.