Skip to content

Commit

Permalink
Merge pull request #80 from esmero/ISSUE-79
Browse files Browse the repository at this point in the history
ISSUE-79: Use flv:exif as fallback for Image to HOCR matching
alliomeria authored Apr 20, 2023

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
2 parents 423a913 + 9348e76 commit af0172f
Showing 1 changed file with 22 additions and 2 deletions.
24 changes: 22 additions & 2 deletions src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php
Original file line number Diff line number Diff line change
@@ -348,7 +348,16 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug
// as the current Image and try to process; if not, run Tesseract
$width = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['width'] ?? NULL;
$height = $io->input->metadata['flv:identify'][$io->input->{$input_argument}]['height'] ?? NULL;
// In case identify failed, we can try with flv:exif (e.g. JP2s might not pass the identify test)
if (!($width && $height)) {
$width = $io->input->metadata['flv:exif']['ImageWidth'] ?? NULL;
$height = $io->input->metadata['flv:exif']['ImageHeight'] ?? NULL;
}

if ($width && $height) {
// Cast them to INT to make sure we are matching exactly
$width = (int)$width;
$height = (int)$height;
$width_hocr = NULL;
$height_hocr = NULL;
$ados = $this->entityTypeManager->getStorage('node')
@@ -395,6 +404,9 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug
$page_coords = explode(' ', $pagetitle);
$width_hocr = $page_coords[2] ?? $width_hocr;
$height_hocr = $page_coords[3] ?? $height_hocr;
// Cast them to INT to make sure we are matching exactly
$width_hocr = $width_hocr ? (int)$width_hocr : $width_hocr;
$height_hocr = $height_hocr ? (int)$height_hocr : $height_hocr;
// NOTE: we cannot match offset OCRs. Either the full page matches or it does not.
if (($width_hocr == $width) && ($height_hocr == $height)) {
$ocr_html = file_get_contents($text_astructure['url']);
@@ -410,12 +422,20 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug
'@sequence_id' => $sequence_number,
]);
}

else {
$this->logger->info("@sbr_processor: HOCR to miniOCR processing from attached text file with UUID @source_hocr_uuid successfull for ADO with UUID @node_uuid and File with UUID @file_uuid with sequence number @sequence_id",
[
'@sbr_processor' => $this->getPluginId(),
'@node_uuid' => $node_uuid ?? 'ABSENT',
'@file_uuid' => $file_uuid ?? 'ABSENT',
'@source_hocr_uuid' => $text_astructure["dr:uuid"] ?? 'ABSENT',
'@sequence_id' => $sequence_number,
]);
}
$output->searchapi['fulltext'] = $miniocr;
$output->plugin = $miniocr;
$io->output = $output;
$external_found = TRUE;

}
}
// If a bbox was found break, no need to process

0 comments on commit af0172f

Please sign in to comment.