Skip to content

Commit

Permalink
Add support for workflow_ocr_backend
Browse files Browse the repository at this point in the history
  • Loading branch information
R0Wi committed Jan 15, 2025
1 parent fe820e1 commit 4bf4203
Show file tree
Hide file tree
Showing 45 changed files with 4,098 additions and 156 deletions.
11 changes: 11 additions & 0 deletions lib/AppInfo/Application.php
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,12 @@
use OCA\WorkflowOcr\Helper\SidecarFileAccessor;
use OCA\WorkflowOcr\Listener\RegisterFlowOperationsListener;
use OCA\WorkflowOcr\Notification\Notifier;
use OCA\WorkflowOcr\OcrProcessors\CommandLineUtils;
use OCA\WorkflowOcr\OcrProcessors\ICommandLineUtils;
use OCA\WorkflowOcr\OcrProcessors\IOcrProcessorFactory;
use OCA\WorkflowOcr\OcrProcessors\OcrProcessorFactory;
use OCA\WorkflowOcr\OcrProcessors\Remote\Client\ApiClient;
use OCA\WorkflowOcr\OcrProcessors\Remote\Client\IApiClient;
use OCA\WorkflowOcr\Service\EventService;
use OCA\WorkflowOcr\Service\GlobalSettingsService;
use OCA\WorkflowOcr\Service\IEventService;
Expand All @@ -46,8 +50,10 @@
use OCA\WorkflowOcr\Service\OcrBackendInfoService;
use OCA\WorkflowOcr\Service\OcrService;
use OCA\WorkflowOcr\SetupChecks\OcrMyPdfCheck;
use OCA\WorkflowOcr\Wrapper\AppApiWrapper;
use OCA\WorkflowOcr\Wrapper\CommandWrapper;
use OCA\WorkflowOcr\Wrapper\Filesystem;
use OCA\WorkflowOcr\Wrapper\IAppApiWrapper;
use OCA\WorkflowOcr\Wrapper\ICommand;
use OCA\WorkflowOcr\Wrapper\IFilesystem;
use OCA\WorkflowOcr\Wrapper\IViewFactory;
Expand All @@ -63,6 +69,8 @@

class Application extends App implements IBootstrap {
public const APP_NAME = 'workflow_ocr';
public const APP_BACKEND_NAME = 'workflow_ocr_backend';
public const APP_API_APP_NAME = 'app_api';

/**
* Application constructor.
Expand All @@ -83,6 +91,9 @@ public function register(IRegistrationContext $context): void {
$context->registerServiceAlias(IEventService::class, EventService::class);
$context->registerServiceAlias(IOcrBackendInfoService::class, OcrBackendInfoService::class);
$context->registerServiceAlias(INotificationService::class, NotificationService::class);
$context->registerServiceAlias(IApiClient::class, ApiClient::class);
$context->registerServiceAlias(ICommandLineUtils::class, CommandLineUtils::class);
$context->registerServiceAlias(IAppApiWrapper::class, AppApiWrapper::class);

// BUG #43
$context->registerService(ICommand::class, function () {
Expand Down
4 changes: 2 additions & 2 deletions lib/Exception/OcrProcessorNotFoundException.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
use Exception;

class OcrProcessorNotFoundException extends Exception {
public function __construct(string $mimeType) {
$this->message = 'OCR processor for mime type ' . $mimeType . ' not found';
/**
 * Builds the exception message from the mime type and the backend flag.
 *
 * @param string $mimeType mime type that is embedded in the message
 * @param bool $useRemoteBackend flag that is embedded in the message for diagnostics
 */
public function __construct(string $mimeType, bool $useRemoteBackend) {
	// PHP converts a bool to '1' / '' when concatenating, which would render
	// "useRemoteBackend=" for false. Spell the value out so the message is unambiguous.
	$useRemoteBackendStr = $useRemoteBackend ? 'true' : 'false';
	parent::__construct('OCR processor for mime type ' . $mimeType . '(useRemoteBackend=' . $useRemoteBackendStr . ') not found');
}
}
100 changes: 100 additions & 0 deletions lib/OcrProcessors/CommandLineUtils.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
<?php

declare(strict_types=1);

/**
* @copyright Copyright (c) 2025 Robin Windey <[email protected]>
*
* @license GNU AGPL version 3 or any later version
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

namespace OCA\WorkflowOcr\OcrProcessors;

use OCA\WorkflowOcr\Helper\ISidecarFileAccessor;
use OCA\WorkflowOcr\Model\GlobalSettings;
use OCA\WorkflowOcr\Model\WorkflowSettings;
use OCA\WorkflowOcr\Service\IOcrBackendInfoService;
use Psr\Log\LoggerInterface;

/**
 * Assembles the argument string passed to ocrmypdf from the workflow
 * settings, the global settings and optional caller-supplied arguments.
 *
 * Options that only apply to local execution ('-q' and '--sidecar') are
 * left out when the backend info service reports a remote backend.
 */
class CommandLineUtils implements ICommandLineUtils {
	/**
	 * Maps the OCR mode of the workflow settings to the ocrmypdf flag.
	 *
	 * @var array<array-key, string>
	 */
	private static array $ocrModeToCmdParameterMapping = [
		WorkflowSettings::OCR_MODE_SKIP_TEXT => '--skip-text',
		WorkflowSettings::OCR_MODE_REDO_OCR => '--redo-ocr',
		WorkflowSettings::OCR_MODE_FORCE_OCR => '--force-ocr',
		WorkflowSettings::OCR_MODE_SKIP_FILE => '' // This is the ocrmypdf default behaviour
	];

	public function __construct(
		private ISidecarFileAccessor $sidecarFileAccessor,
		private IOcrBackendInfoService $ocrBackendInfoService,
		private LoggerInterface $logger,
	) {
	}

	/**
	 * Builds the space-separated argument string.
	 *
	 * @param WorkflowSettings $settings per-workflow OCR settings (mode, languages, background removal, custom args)
	 * @param GlobalSettings $globalSettings global settings; only processorCount is read here
	 * @param array $additionalCommandlineArgs extra arguments that are merged in before the custom CLI args
	 * @return string all non-empty arguments joined by a single space
	 */
	public function getCommandlineArgs(WorkflowSettings $settings, GlobalSettings $globalSettings, array $additionalCommandlineArgs = []): string {
		$isLocalExecution = !$this->ocrBackendInfoService->isRemoteBackend();

		// Default setting is quiet (only added for local execution)
		$args = $isLocalExecution ? ['-q'] : [];

		// OCR mode ('--skip-text', '--redo-ocr', '--force-ocr' or empty).
		// An unknown mode must not trigger an "Undefined array key" warning;
		// fall back to the empty flag, i.e. the ocrmypdf default behaviour.
		$ocrMode = $settings->getOcrMode();
		if (isset(self::$ocrModeToCmdParameterMapping[$ocrMode])) {
			$args[] = self::$ocrModeToCmdParameterMapping[$ocrMode];
		} else {
			$this->logger->warning('Unknown OCR mode, falling back to ocrmypdf default behaviour', ['ocrMode' => $ocrMode]);
		}

		// Language settings
		$languages = $settings->getLanguages();
		if ($languages) {
			$langStr = implode('+', $languages);
			$args[] = "--language $langStr";
		}

		// Remove background option (NOTE :: this is incompatible with redo-ocr, so
		// we have to make it exclusive against each other!)
		if ($settings->getRemoveBackground()) {
			if ($ocrMode === WorkflowSettings::OCR_MODE_REDO_OCR) {
				$this->logger->warning('--remove-background is incompatible with --redo-ocr, ignoring');
			} else {
				$args[] = '--remove-background';
			}
		}

		// Number of CPUs to be used (only passed when a positive count is configured)
		$processorCount = intval($globalSettings->processorCount);
		if ($processorCount > 0) {
			$args[] = '--jobs ' . $processorCount;
		}

		if ($isLocalExecution) {
			// Save recognized text in tempfile
			$sidecarFilePath = $this->sidecarFileAccessor->getOrCreateSidecarFile();
			if ($sidecarFilePath) {
				$args[] = '--sidecar ' . $sidecarFilePath;
			}
		}

		// Order: built-in args, caller-supplied args, custom CLI args. Empty entries are dropped.
		$resultArgs = array_filter(array_merge(
			$args,
			$additionalCommandlineArgs,
			[$this->escapeCustomCliArgs($settings->getCustomCliArgs())]
		), fn ($arg) => !empty($arg));

		return implode(' ', $resultArgs);
	}

	/**
	 * Removes the command separators ';' and '&&' from the custom CLI args.
	 *
	 * NOTE(review): only these two tokens are stripped; other shell
	 * metacharacters are passed through unchanged.
	 *
	 * @param string $customCliArgs raw custom arguments from the workflow settings
	 * @return string the arguments without ';' and '&&'
	 */
	private function escapeCustomCliArgs(string $customCliArgs): string {
		// The order matters: str_replace() processes the search array sequentially.
		// Stripping ';' first prevents inputs like '&;&' from collapsing into a
		// fresh '&&' after the '&&' pass has already run.
		return str_replace([';', '&&'], '', $customCliArgs);
	}
}
31 changes: 31 additions & 0 deletions lib/OcrProcessors/ICommandLineUtils.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<?php

declare(strict_types=1);

/**
* @copyright Copyright (c) 2025 Robin Windey <[email protected]>
*
* @license GNU AGPL version 3 or any later version
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

namespace OCA\WorkflowOcr\OcrProcessors;

use OCA\WorkflowOcr\Model\GlobalSettings;
use OCA\WorkflowOcr\Model\WorkflowSettings;

/**
 * Contract for components that turn OCR settings into a command line
 * argument string.
 */
interface ICommandLineUtils {
	/**
	 * Builds the argument string for the given settings.
	 *
	 * @param WorkflowSettings $settings settings of the workflow being processed
	 * @param GlobalSettings $globalSettings global settings
	 * @param array $additionalCommandlineArgs optional extra arguments; defaults to none
	 * @return string the assembled argument string
	 */
	public function getCommandlineArgs(
		WorkflowSettings $settings,
		GlobalSettings $globalSettings,
		array $additionalCommandlineArgs = [],
	): string;
}
7 changes: 0 additions & 7 deletions lib/OcrProcessors/IOcrProcessorFactory.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,4 @@ interface IOcrProcessorFactory {
* Creates a IOcrProcessor object for the given mimetype
*/
public function create(string $mimeType) : IOcrProcessor;

/**
* Returns true, if an OCR processor for the given mimetype
* can be constructed.
* @return bool
*/
public function canCreate(string $mimeType) : bool;
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

namespace OCA\WorkflowOcr\OcrProcessors;
namespace OCA\WorkflowOcr\OcrProcessors\Local;

use OCA\WorkflowOcr\Model\GlobalSettings;
use OCA\WorkflowOcr\Model\WorkflowSettings;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,42 +21,32 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

namespace OCA\WorkflowOcr\OcrProcessors;
namespace OCA\WorkflowOcr\OcrProcessors\Local;

use OCA\WorkflowOcr\Exception\OcrNotPossibleException;
use OCA\WorkflowOcr\Exception\OcrResultEmptyException;
use OCA\WorkflowOcr\Helper\ISidecarFileAccessor;
use OCA\WorkflowOcr\Model\GlobalSettings;
use OCA\WorkflowOcr\Model\WorkflowSettings;
use OCA\WorkflowOcr\OcrProcessors\ICommandLineUtils;
use OCA\WorkflowOcr\OcrProcessors\IOcrProcessor;
use OCA\WorkflowOcr\OcrProcessors\OcrProcessorResult;
use OCA\WorkflowOcr\Wrapper\ICommand;
use OCP\Files\File;
use Psr\Log\LoggerInterface;

abstract class OcrMyPdfBasedProcessor implements IOcrProcessor {
private static $ocrModeToCmdParameterMapping = [
WorkflowSettings::OCR_MODE_SKIP_TEXT => '--skip-text',
WorkflowSettings::OCR_MODE_REDO_OCR => '--redo-ocr',
WorkflowSettings::OCR_MODE_FORCE_OCR => '--force-ocr',
WorkflowSettings::OCR_MODE_SKIP_FILE => '' // This is the ocrmypdf default behaviour
];

/** @var ICommand */
private $command;

/** @var LoggerInterface */
private $logger;

/** @var ISidecarFileAccessor */
private $sidecarFileAccessor;

public function __construct(ICommand $command, LoggerInterface $logger, ISidecarFileAccessor $sidecarFileAccessor) {
$this->command = $command;
$this->logger = $logger;
$this->sidecarFileAccessor = $sidecarFileAccessor;
public function __construct(
private ICommand $command,
private LoggerInterface $logger,
private ISidecarFileAccessor $sidecarFileAccessor,
private ICommandLineUtils $commandLineUtils,
) {
}

public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $globalSettings): OcrProcessorResult {
$commandStr = 'ocrmypdf ' . $this->getCommandlineArgs($settings, $globalSettings) . ' - - || exit $? ; cat';
$additionalCommandlineArgs = $this->getAdditionalCommandlineArgs($settings, $globalSettings);
$commandStr = 'ocrmypdf ' . $this->commandLineUtils->getCommandlineArgs($settings, $globalSettings, $additionalCommandlineArgs) . ' - - || exit $? ; cat';

$inputFileContent = $file->getContent();

Expand Down Expand Up @@ -109,55 +99,4 @@ public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $
protected function getAdditionalCommandlineArgs(WorkflowSettings $settings, GlobalSettings $globalSettings): array {
// The base implementation contributes no extra arguments. ocrFile() forwards
// whatever is returned here to ICommandLineUtils::getCommandlineArgs().
return [];
}


private function getCommandlineArgs(WorkflowSettings $settings, GlobalSettings $globalSettings): string {
// Default setting is quiet
$args = ['-q'];

// OCR mode ('--skip-text', '--redo-ocr', '--force-ocr' or empty)
$args[] = self::$ocrModeToCmdParameterMapping[$settings->getOcrMode()];

// Language settings
if ($settings->getLanguages()) {
$langStr = implode('+', $settings->getLanguages());
$args[] = "-l $langStr";
}

// Remove background option (NOTE :: this is incompatible with redo-ocr, so
// we have to make it exclusive against each other!)
if ($settings->getRemoveBackground()) {
if ($settings->getOcrMode() === WorkflowSettings::OCR_MODE_REDO_OCR) {
$this->logger->warning('--remove-background is incompatible with --redo-ocr, ignoring');
} else {
$args[] = '--remove-background';
}
}

// Number of CPU's to be used
$processorCount = intval($globalSettings->processorCount);
if ($processorCount > 0) {
$args[] = '-j ' . $processorCount;
}

// Save recognized text in tempfile
$sidecarFilePath = $this->sidecarFileAccessor->getOrCreateSidecarFile();
if ($sidecarFilePath) {
$args[] = '--sidecar ' . $sidecarFilePath;
}

$resultArgs = array_filter(array_merge(
$args,
$this->getAdditionalCommandlineArgs($settings, $globalSettings),
[$this->escapeCustomCliArgs($settings->getCustomCliArgs())]
), fn ($arg) => !empty($arg));

return implode(' ', $resultArgs);
}

private function escapeCustomCliArgs(string $customCliArgs): string {
$customCliArgs = str_replace('&&', '', $customCliArgs);
$customCliArgs = str_replace(';', '', $customCliArgs);
return $customCliArgs;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

namespace OCA\WorkflowOcr\OcrProcessors;
namespace OCA\WorkflowOcr\OcrProcessors\Local;

/**
 * Concrete processor that inherits everything from OcrMyPdfBasedProcessor
 * and overrides nothing.
 */
class PdfOcrProcessor extends OcrMyPdfBasedProcessor {
}
Loading

0 comments on commit 4bf4203

Please sign in to comment.