Skip to content

Commit

Permalink
Add support for workflow_ocr_backend
Browse files Browse the repository at this point in the history
  • Loading branch information
R0Wi committed Jan 15, 2025
1 parent fe820e1 commit 4990c3f
Show file tree
Hide file tree
Showing 49 changed files with 4,165 additions and 158 deletions.
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ Since the actual processing of the files is done asynchronously via Nextcloud's


### Backend

This app is based on `ocrmypdf`. You can either install the CLI directly on the server running Nextcloud or use the alternative backend setup via Docker.

#### Local installation

> :warning: Since `v1.20.1` you'll have to install `OCRmyPDF`.
In the backend, [`OCRmyPDF`](https://github.com/jbarlow83/OCRmyPDF) is used for processing PDF files. Make sure you have this command-line tool installed in the appropriate version (see 'Used libraries' below).
Expand All @@ -81,6 +86,14 @@ apt-get install tesseract-ocr-deu
apt-get install tesseract-ocr-chi-sim
```

#### `workflow_ocr_backend` installation

Starting from version 30, Nextcloud added support for [AppApi](https://docs.nextcloud.com/server/latest/admin_manual/exapps_management/AppAPIAndExternalApps.html) apps. In essence, this allows external container-based applications to be integrated into the Nextcloud ecosystem. This app uses this feature to provide an alternative backend setup via Docker.

Please refer to **https://github.com/R0Wi-DEV/workflow_ocr_backend** for more information on how to set up the backend.

> :information_source: If the `workflow_ocr_backend` External App is installed, the app will automatically use it as the backend even if you installed `ocrmypdf` locally.
### Setup Checks

The app will perform some [Setup Checks](https://docs.nextcloud.com/server/latest/admin_manual/configuration_server/security_setup_warnings.html) to verify your installation. If there is any problem with your backend setup, you'll see an error printed in Nextcloud under `Administration Settings` → `Overview` → `Security & setup warnings`.
Expand Down
11 changes: 11 additions & 0 deletions lib/AppInfo/Application.php
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,12 @@
use OCA\WorkflowOcr\Helper\SidecarFileAccessor;
use OCA\WorkflowOcr\Listener\RegisterFlowOperationsListener;
use OCA\WorkflowOcr\Notification\Notifier;
use OCA\WorkflowOcr\OcrProcessors\CommandLineUtils;
use OCA\WorkflowOcr\OcrProcessors\ICommandLineUtils;
use OCA\WorkflowOcr\OcrProcessors\IOcrProcessorFactory;
use OCA\WorkflowOcr\OcrProcessors\OcrProcessorFactory;
use OCA\WorkflowOcr\OcrProcessors\Remote\Client\ApiClient;
use OCA\WorkflowOcr\OcrProcessors\Remote\Client\IApiClient;
use OCA\WorkflowOcr\Service\EventService;
use OCA\WorkflowOcr\Service\GlobalSettingsService;
use OCA\WorkflowOcr\Service\IEventService;
Expand All @@ -46,8 +50,10 @@
use OCA\WorkflowOcr\Service\OcrBackendInfoService;
use OCA\WorkflowOcr\Service\OcrService;
use OCA\WorkflowOcr\SetupChecks\OcrMyPdfCheck;
use OCA\WorkflowOcr\Wrapper\AppApiWrapper;
use OCA\WorkflowOcr\Wrapper\CommandWrapper;
use OCA\WorkflowOcr\Wrapper\Filesystem;
use OCA\WorkflowOcr\Wrapper\IAppApiWrapper;
use OCA\WorkflowOcr\Wrapper\ICommand;
use OCA\WorkflowOcr\Wrapper\IFilesystem;
use OCA\WorkflowOcr\Wrapper\IViewFactory;
Expand All @@ -63,6 +69,8 @@

class Application extends App implements IBootstrap {
public const APP_NAME = 'workflow_ocr';
public const APP_BACKEND_NAME = 'workflow_ocr_backend';
public const APP_API_APP_NAME = 'app_api';

/**
* Application constructor.
Expand All @@ -83,6 +91,9 @@ public function register(IRegistrationContext $context): void {
$context->registerServiceAlias(IEventService::class, EventService::class);
$context->registerServiceAlias(IOcrBackendInfoService::class, OcrBackendInfoService::class);
$context->registerServiceAlias(INotificationService::class, NotificationService::class);
$context->registerServiceAlias(IApiClient::class, ApiClient::class);
$context->registerServiceAlias(ICommandLineUtils::class, CommandLineUtils::class);
$context->registerServiceAlias(IAppApiWrapper::class, AppApiWrapper::class);

// BUG #43
$context->registerService(ICommand::class, function () {
Expand Down
4 changes: 2 additions & 2 deletions lib/Exception/OcrProcessorNotFoundException.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
use Exception;

class OcrProcessorNotFoundException extends Exception {
public function __construct(string $mimeType) {
$this->message = 'OCR processor for mime type ' . $mimeType . ' not found';
/**
 * Builds the exception message from the mime type and the remote-backend flag.
 *
 * @param string $mimeType mime type that is reported in the message
 * @param bool $useRemoteBackend flag that is reported in the message as `useRemoteBackend=true|false`
 */
public function __construct(string $mimeType, bool $useRemoteBackend) {
	// Render the flag explicitly: concatenating a bool yields '1' for true but an
	// empty string for false, which made the message read "useRemoteBackend=".
	$remoteBackendFlag = $useRemoteBackend ? 'true' : 'false';

	parent::__construct('OCR processor for mime type ' . $mimeType . '(useRemoteBackend=' . $remoteBackendFlag . ') not found');
}
}
2 changes: 1 addition & 1 deletion lib/Model/WorkflowSettings.php
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ private function setJson(?string $json = null) {
$this->setProperty($this->customCliArgs, $data, 'customCliArgs', fn ($value) => is_string($value));
}

private function setProperty(& $property, array $jsonData, string $key, ?callable $dataCheck = null): void {
private function setProperty(array|bool|int|string & $property, array $jsonData, string $key, ?callable $dataCheck = null): void {
if (array_key_exists($key, $jsonData) && ($dataCheck === null || $dataCheck($jsonData[$key]))) {
$property = $jsonData[$key];
}
Expand Down
100 changes: 100 additions & 0 deletions lib/OcrProcessors/CommandLineUtils.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
<?php

declare(strict_types=1);

/**
* @copyright Copyright (c) 2025 Robin Windey <[email protected]>
*
* @license GNU AGPL version 3 or any later version
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

namespace OCA\WorkflowOcr\OcrProcessors;

use OCA\WorkflowOcr\Helper\ISidecarFileAccessor;
use OCA\WorkflowOcr\Model\GlobalSettings;
use OCA\WorkflowOcr\Model\WorkflowSettings;
use OCA\WorkflowOcr\Service\IOcrBackendInfoService;
use Psr\Log\LoggerInterface;

/**
 * Builds the command line argument string for `ocrmypdf` from the workflow
 * settings, the global settings and the kind of backend reported by the
 * backend info service.
 */
class CommandLineUtils implements ICommandLineUtils {
	/**
	 * OCR mode => `ocrmypdf` command line flag.
	 */
	private const OCR_MODE_TO_CMD_PARAMETER_MAPPING = [
		WorkflowSettings::OCR_MODE_SKIP_TEXT => '--skip-text',
		WorkflowSettings::OCR_MODE_REDO_OCR => '--redo-ocr',
		WorkflowSettings::OCR_MODE_FORCE_OCR => '--force-ocr',
		WorkflowSettings::OCR_MODE_SKIP_FILE => '' // This is the ocrmypdf default behaviour
	];

	public function __construct(
		private ISidecarFileAccessor $sidecarFileAccessor,
		private IOcrBackendInfoService $ocrBackendInfoService,
		private LoggerInterface $logger,
	) {
	}

	/**
	 * Assembles all command line arguments into a single space separated string.
	 *
	 * The quiet flag (`-q`) and the `--sidecar` option are only added when the
	 * backend info service does not report a remote backend.
	 *
	 * @param WorkflowSettings $settings workflow settings (OCR mode, languages, remove background, custom CLI args)
	 * @param GlobalSettings $globalSettings global settings (processor count)
	 * @param array $additionalCommandlineArgs extra arguments which are placed before the custom CLI args
	 * @return string the arguments joined by a single space; empty entries are dropped
	 */
	public function getCommandlineArgs(WorkflowSettings $settings, GlobalSettings $globalSettings, array $additionalCommandlineArgs = []): string {
		$isLocalExecution = !$this->ocrBackendInfoService->isRemoteBackend();

		// Default setting is quiet
		$args = $isLocalExecution ? ['-q'] : [];

		// OCR mode ('--skip-text', '--redo-ocr', '--force-ocr' or empty).
		// A mode without a mapping contributes no flag instead of raising an
		// "undefined array key" warning.
		$args[] = self::OCR_MODE_TO_CMD_PARAMETER_MAPPING[$settings->getOcrMode()] ?? '';

		// Language settings
		if ($settings->getLanguages()) {
			$langStr = implode('+', $settings->getLanguages());
			$args[] = "--language $langStr";
		}

		// Remove background option (NOTE :: this is incompatible with redo-ocr, so
		// we have to make it exclusive against each other!)
		if ($settings->getRemoveBackground()) {
			if ($settings->getOcrMode() === WorkflowSettings::OCR_MODE_REDO_OCR) {
				$this->logger->warning('--remove-background is incompatible with --redo-ocr, ignoring');
			} else {
				$args[] = '--remove-background';
			}
		}

		// Number of CPU's to be used
		$processorCount = intval($globalSettings->processorCount);
		if ($processorCount > 0) {
			$args[] = '--jobs ' . $processorCount;
		}

		if ($isLocalExecution) {
			// Save recognized text in tempfile
			$sidecarFilePath = $this->sidecarFileAccessor->getOrCreateSidecarFile();
			if ($sidecarFilePath) {
				$args[] = '--sidecar ' . $sidecarFilePath;
			}
		}

		// Order: generated args, caller supplied args, custom CLI args.
		$resultArgs = array_filter(array_merge(
			$args,
			$additionalCommandlineArgs,
			[$this->escapeCustomCliArgs($settings->getCustomCliArgs())]
		), fn ($arg) => !empty($arg));

		return implode(' ', $resultArgs);
	}

	/**
	 * Removes the command separators `;` and `&&` from the custom CLI args.
	 *
	 * NOTE: this is not a full shell escaping, all other characters are left
	 * untouched.
	 *
	 * @param string $customCliArgs custom CLI args as configured in the workflow settings
	 * @return string the custom CLI args without `;` and `&&`
	 */
	private function escapeCustomCliArgs(string $customCliArgs): string {
		// The search values are processed in the given order. ';' has to be removed
		// first, otherwise an input like '&;&' collapses into '&&' after sanitizing.
		return str_replace([';', '&&'], '', $customCliArgs);
	}
}
31 changes: 31 additions & 0 deletions lib/OcrProcessors/ICommandLineUtils.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<?php

declare(strict_types=1);

/**
* @copyright Copyright (c) 2025 Robin Windey <[email protected]>
*
* @license GNU AGPL version 3 or any later version
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

namespace OCA\WorkflowOcr\OcrProcessors;

use OCA\WorkflowOcr\Model\GlobalSettings;
use OCA\WorkflowOcr\Model\WorkflowSettings;

/**
 * Contract for turning workflow and global settings into a command line
 * argument string.
 */
interface ICommandLineUtils {
	/**
	 * @param WorkflowSettings $settings
	 * @param GlobalSettings $globalSettings
	 * @param array $additionalCommandlineArgs optional extra arguments, none by default
	 * @return string the command line arguments as a single string
	 */
	public function getCommandlineArgs(
		WorkflowSettings $settings,
		GlobalSettings $globalSettings,
		array $additionalCommandlineArgs = [],
	): string;
}
7 changes: 0 additions & 7 deletions lib/OcrProcessors/IOcrProcessorFactory.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,4 @@ interface IOcrProcessorFactory {
* Creates a IOcrProcessor object for the given mimetype
*/
public function create(string $mimeType) : IOcrProcessor;

/**
* Returns true, if an OCR processor for the given mimetype
* can be constructed.
* @return bool
*/
public function canCreate(string $mimeType) : bool;
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

namespace OCA\WorkflowOcr\OcrProcessors;
namespace OCA\WorkflowOcr\OcrProcessors\Local;

use OCA\WorkflowOcr\Model\GlobalSettings;
use OCA\WorkflowOcr\Model\WorkflowSettings;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,42 +21,32 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

namespace OCA\WorkflowOcr\OcrProcessors;
namespace OCA\WorkflowOcr\OcrProcessors\Local;

use OCA\WorkflowOcr\Exception\OcrNotPossibleException;
use OCA\WorkflowOcr\Exception\OcrResultEmptyException;
use OCA\WorkflowOcr\Helper\ISidecarFileAccessor;
use OCA\WorkflowOcr\Model\GlobalSettings;
use OCA\WorkflowOcr\Model\WorkflowSettings;
use OCA\WorkflowOcr\OcrProcessors\ICommandLineUtils;
use OCA\WorkflowOcr\OcrProcessors\IOcrProcessor;
use OCA\WorkflowOcr\OcrProcessors\OcrProcessorResult;
use OCA\WorkflowOcr\Wrapper\ICommand;
use OCP\Files\File;
use Psr\Log\LoggerInterface;

abstract class OcrMyPdfBasedProcessor implements IOcrProcessor {
private static $ocrModeToCmdParameterMapping = [
WorkflowSettings::OCR_MODE_SKIP_TEXT => '--skip-text',
WorkflowSettings::OCR_MODE_REDO_OCR => '--redo-ocr',
WorkflowSettings::OCR_MODE_FORCE_OCR => '--force-ocr',
WorkflowSettings::OCR_MODE_SKIP_FILE => '' // This is the ocrmypdf default behaviour
];

/** @var ICommand */
private $command;

/** @var LoggerInterface */
private $logger;

/** @var ISidecarFileAccessor */
private $sidecarFileAccessor;

public function __construct(ICommand $command, LoggerInterface $logger, ISidecarFileAccessor $sidecarFileAccessor) {
$this->command = $command;
$this->logger = $logger;
$this->sidecarFileAccessor = $sidecarFileAccessor;
public function __construct(
private ICommand $command,
private LoggerInterface $logger,
private ISidecarFileAccessor $sidecarFileAccessor,
private ICommandLineUtils $commandLineUtils,
) {
}

public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $globalSettings): OcrProcessorResult {
$commandStr = 'ocrmypdf ' . $this->getCommandlineArgs($settings, $globalSettings) . ' - - || exit $? ; cat';
$additionalCommandlineArgs = $this->getAdditionalCommandlineArgs($settings, $globalSettings);
$commandStr = 'ocrmypdf ' . $this->commandLineUtils->getCommandlineArgs($settings, $globalSettings, $additionalCommandlineArgs) . ' - - || exit $? ; cat';

$inputFileContent = $file->getContent();

Expand Down Expand Up @@ -109,55 +99,4 @@ public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $
/**
 * Additional command line arguments contributed by the concrete processor.
 * This base implementation contributes none. The returned array is handed
 * to ICommandLineUtils::getCommandlineArgs() by ocrFile().
 *
 * @return array additional command line arguments (empty here)
 */
protected function getAdditionalCommandlineArgs(WorkflowSettings $settings, GlobalSettings $globalSettings): array {
return [];
}


private function getCommandlineArgs(WorkflowSettings $settings, GlobalSettings $globalSettings): string {
// Default setting is quiet
$args = ['-q'];

// OCR mode ('--skip-text', '--redo-ocr', '--force-ocr' or empty)
$args[] = self::$ocrModeToCmdParameterMapping[$settings->getOcrMode()];

// Language settings
if ($settings->getLanguages()) {
$langStr = implode('+', $settings->getLanguages());
$args[] = "-l $langStr";
}

// Remove background option (NOTE :: this is incompatible with redo-ocr, so
// we have to make it exclusive against each other!)
if ($settings->getRemoveBackground()) {
if ($settings->getOcrMode() === WorkflowSettings::OCR_MODE_REDO_OCR) {
$this->logger->warning('--remove-background is incompatible with --redo-ocr, ignoring');
} else {
$args[] = '--remove-background';
}
}

// Number of CPU's to be used
$processorCount = intval($globalSettings->processorCount);
if ($processorCount > 0) {
$args[] = '-j ' . $processorCount;
}

// Save recognized text in tempfile
$sidecarFilePath = $this->sidecarFileAccessor->getOrCreateSidecarFile();
if ($sidecarFilePath) {
$args[] = '--sidecar ' . $sidecarFilePath;
}

$resultArgs = array_filter(array_merge(
$args,
$this->getAdditionalCommandlineArgs($settings, $globalSettings),
[$this->escapeCustomCliArgs($settings->getCustomCliArgs())]
), fn ($arg) => !empty($arg));

return implode(' ', $resultArgs);
}

private function escapeCustomCliArgs(string $customCliArgs): string {
$customCliArgs = str_replace('&&', '', $customCliArgs);
$customCliArgs = str_replace(';', '', $customCliArgs);
return $customCliArgs;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

namespace OCA\WorkflowOcr\OcrProcessors;
namespace OCA\WorkflowOcr\OcrProcessors\Local;

/**
 * OCR processor which adds nothing on top of OcrMyPdfBasedProcessor: the
 * class body is empty, so all behavior is inherited from the parent class.
 */
class PdfOcrProcessor extends OcrMyPdfBasedProcessor {
}
Loading

0 comments on commit 4990c3f

Please sign in to comment.