From 15ac838329f3646cfe4e09f6a2fd09caa9e244ab Mon Sep 17 00:00:00 2001 From: iMac_Abdallah Date: Mon, 16 Sep 2024 21:43:07 +0200 Subject: [PATCH] OLA Integration --- README.md | 1 + default_workflow.nf | 0 goobi_operandi.sh | 12 +- kitodo_operandi.sh | 6 +- ocrd-models/qurator-gt4histocr-1/0.ckpt.h5 | Bin ocrd-models/qurator-gt4histocr-1/0.ckpt.json | 0 script_docker.sh | 0 script_native.sh | 0 upload_to_ola_hd.sh | 116 ++++++++++++++++++- 9 files changed, 121 insertions(+), 14 deletions(-) mode change 100644 => 100755 README.md mode change 100644 => 100755 default_workflow.nf mode change 100644 => 100755 goobi_operandi.sh mode change 100644 => 100755 kitodo_operandi.sh mode change 100644 => 100755 ocrd-models/qurator-gt4histocr-1/0.ckpt.h5 mode change 100644 => 100755 ocrd-models/qurator-gt4histocr-1/0.ckpt.json mode change 100644 => 100755 script_docker.sh mode change 100644 => 100755 script_native.sh mode change 100644 => 100755 upload_to_ola_hd.sh diff --git a/README.md b/README.md old mode 100644 new mode 100755 index dbe8fc5..74548f7 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ This script is made to integrate Operandi with other tools such as Goobi and Kit `script_docker.sh` is for terminal use with OCR-D docker installation. `goobi_operandi.sh` is used for Operandi-Goobi integration. `kitodo_operandi.sh` is used for Operandi-Kitodo integration. +`upload_to_ola_hd.sh` is used for OLA_HD-Kitodo-Goobi integration. 
diff --git a/default_workflow.nf b/default_workflow.nf old mode 100644 new mode 100755 diff --git a/goobi_operandi.sh b/goobi_operandi.sh old mode 100644 new mode 100755 index 419ca2e..c9940f0 --- a/goobi_operandi.sh +++ b/goobi_operandi.sh @@ -25,8 +25,8 @@ WORKFLOW="default_workflow.nf" METS_URL="" IMAGE_DIR=$(pwd)/images EXT="jpg" -CPUs=4 -RAM=8 +CPUs=8 +RAM=64 ZIP="" workflow_id="default_workflow" LOCAL_OCRD=false @@ -423,13 +423,13 @@ upload_to_ola_hd() { handle_results() { echo "Process title is $PROCESS_TITLE" unzip -o "$OCRD_RESULTS" -d "$WORKSPACE_DIR"_results - mkdir -p $PARENT_WORKSPACE/ocr/"$PROCESS_TITLE"_alto - mv -f "$WORKSPACE_DIR"_results/data/*ALTO*/* $PARENT_WORKSPACE/ocr/"$PROCESS_TITLE"_alto/ - echo "$OCRD_RESULTS" > "$PARENT_WORKSPACE/.ocrd_results_path" + mkdir -p $PARENT_WORKSPACE/ocr/ + mv -f "$WORKSPACE_DIR"_results/data/* $PARENT_WORKSPACE/ocr/ } + cleanup(){ - rm -r .nextflow* tmp/ work/ report* $PARENT_WORKSPACE/ocrd.log "$WORKSPACE_DIR"_local "$WORKSPACE_DIR"_results $WORKSPACE_DIR + rm -r .nextflow* tmp/ work/ report* $PARENT_WORKSPACE/ocrd.log "$WORKSPACE_DIR"_local "$WORKSPACE_DIR"_results $WORKSPACE_DIR $OCRD_RESULTS } diff --git a/kitodo_operandi.sh b/kitodo_operandi.sh old mode 100644 new mode 100755 index afab4d3..cac12d4 --- a/kitodo_operandi.sh +++ b/kitodo_operandi.sh @@ -420,11 +420,11 @@ upload_to_ola_hd() { # Function to handle results for kitodo handle_results() { unzip -o "$OCRD_RESULTS" -d "$WORKSPACE_DIR"_results - mv -f "$WORKSPACE_DIR"_results/data/*ALTO*/* $PARENT_WORKSPACE/ocr/alto/ - echo "$OCRD_RESULTS" > "$PARENT_WORKSPACE/.ocrd_results_path" + mv -f "$WORKSPACE_DIR"_results/data/* $PARENT_WORKSPACE/ocr/ } + cleanup(){ - rm -r .nextflow* tmp/ work/ report* $PARENT_WORKSPACE/ocrd.log "$WORKSPACE_DIR"_local "$WORKSPACE_DIR"_results $WORKSPACE_DIR + rm -r .nextflow* tmp/ work/ report* $PARENT_WORKSPACE/ocrd.log "$WORKSPACE_DIR"_local "$WORKSPACE_DIR"_results $WORKSPACE_DIR $OCRD_RESULTS } diff --git 
a/ocrd-models/qurator-gt4histocr-1/0.ckpt.h5 b/ocrd-models/qurator-gt4histocr-1/0.ckpt.h5
old mode 100644
new mode 100755
diff --git a/ocrd-models/qurator-gt4histocr-1/0.ckpt.json b/ocrd-models/qurator-gt4histocr-1/0.ckpt.json
old mode 100644
new mode 100755
diff --git a/script_docker.sh b/script_docker.sh
old mode 100644
new mode 100755
diff --git a/script_native.sh b/script_native.sh
old mode 100644
new mode 100755
diff --git a/upload_to_ola_hd.sh b/upload_to_ola_hd.sh
old mode 100644
new mode 100755
index 9d24063..3ea8200
--- a/upload_to_ola_hd.sh
+++ b/upload_to_ola_hd.sh
@@ -1,17 +1,154 @@
-#this script is done to upload ocr bagit to OLA-HD
-# OLA_USR should be added as an env variable and it refers to ola-hd username:password
-#operandi script should store the results path inside process/directory($s1)/.ocrd_results_path
-
-#ocr bagit path
-OCRD_RESULTS=$(<"$1/.ocrd_results_path")
-
-upload_to_ola_hd() {
-    echo "Uploading the results to OLA-HD..."
-    curl -X POST 141.5.99.53/api/bag -u "$OLA_USR" -H 'content-type: multipart/form-data' -F file=@"$OCRD_RESULTS"
-    if [ $? -ne 0 ]; then
-        log_error "Failed to download the results."
-        exit 1
-    fi
-}
-
-upload_to_ola_hd
+#!/bin/bash
+#
+# Upload an OCR-D BagIt zip to OLA-HD.
+#
+# Usage:
+#   upload_to_ola_hd.sh -z <results.ocrd.zip> [-s <server>] [-o <user:password>]
+#   upload_to_ola_hd.sh -m <mets_url> [-w <workspace_dir>] [-s <server>] [-o <user:password>]
+#
+# With -z the given zip is uploaded as-is. Without -z, the METS given with -m
+# is cloned into the workspace directory, bagged and validated through the
+# ocrd/all:maximum docker image, and the resulting zip is uploaded.
+#
+# OLA_USR holds the OLA-HD credentials as "username:password". It is read from
+# the environment unless -o is given.
+#
+# After a successful upload the workspace directory and the uploaded zip are
+# deleted.
+
+SCRIPT_PATH="$(dirname "$(realpath "$0")")"
+# Log files and ocrd.log are addressed relative to the script directory.
+cd "$SCRIPT_PATH" || exit 1
+SERVER_ADDR=141.5.99.53
+CURRENT_TIME=$(date +"%m%d%Y_%H%M%S")
+WORKSPACE_DIR="$PWD/ws_$CURRENT_TIME"
+RESULTS_AVAILABLE=false
+ERROR_LOG="error_log.txt"
+LOG_FILE="log_file.txt"
+METS_PATH_URL=""
+OCRD_RESULTS=""
+
+# Parse the options. The option string also accepts -f -u -i -c -r -n -e -l,
+# which this script does not act on; they are accepted and ignored.
+while getopts ":s:f:m:u:w:i:c:r:n:elz:o:" opt; do
+    case "$opt" in
+        s) SERVER_ADDR="$OPTARG" ;;
+        m) METS_PATH_URL="$OPTARG" ;;
+        w) WORKSPACE_DIR="$OPTARG" ;;
+        z)
+            OCRD_RESULTS="$OPTARG"
+            RESULTS_AVAILABLE=true
+            ;;
+        o) OLA_USR="$OPTARG" ;;
+        \?)
+            echo "Invalid option: -$OPTARG" >&2
+            exit 1
+            ;;
+        :)
+            echo "Option -$OPTARG requires an argument." >&2
+            exit 1
+            ;;
+    esac
+done
+
+# Log an informational message to stdout and to $LOG_FILE, prefixed with a
+# timestamp and the workspace path.
+log_info() {
+    local line
+    line="[INFO] $(date '+%Y-%m-%d %H:%M:%S') - (Upload to OLA) Workspace: $WORKSPACE_DIR - $1"
+    printf '%s\n' "$line"
+    printf '%s\n' "$line" >> "$LOG_FILE"
+}
+
+# Log an error message to stderr, $ERROR_LOG and $LOG_FILE, prefixed with a
+# timestamp and the workspace path.
+log_error() {
+    local line
+    line="[ERROR] $(date '+%Y-%m-%d %H:%M:%S') - (Upload to OLA) Workspace: $WORKSPACE_DIR - $1"
+    printf '%s\n' "$line" >&2
+    printf '%s\n' "$line" >> "$ERROR_LOG"
+    printf '%s\n' "$line" >> "$LOG_FILE"
+}
+
+# Run the given command inside the ocrd/all:maximum container. The parent of
+# the workspace is mounted at /data, so the workspace is /data/$PROCESS_TITLE
+# inside the container.
+docker_wrapper() {
+    docker run --rm -u "$(id -u)" \
+        -v "$SCRIPT_PATH/tmp:/tmp" \
+        -v "$SCRIPT_PATH/ocrd-models:/ocrd-models" \
+        -v "$PARENT_WORKSPACE:/data" \
+        -- ocrd/all:maximum "$@"
+}
+
+# POST the zip at $OCRD_RESULTS to the OLA-HD bag endpoint; exit 1 on failure.
+upload_to_ola_hd() {
+    log_info "Uploading the results to OLA-HD..."
+    # --fail makes curl exit non-zero on HTTP error responses; without it a
+    # rejected upload would be treated as a success and cleaned up.
+    if ! curl --fail -X POST "$SERVER_ADDR/api/bag" -u "$OLA_USR" \
+        -H 'content-type: multipart/form-data' -F file=@"$OCRD_RESULTS"; then
+        log_error "Failed to upload the results."
+        exit 1
+    fi
+}
+
+# Clone the METS at $METS_PATH_URL into a new workspace; exit 1 on failure.
+create_workspace() {
+    log_info "Creating workspace..."
+    if ! docker_wrapper ocrd workspace -d "/data/$PROCESS_TITLE" clone "$METS_PATH_URL"; then
+        log_error "Failed to create the workspace."
+        exit 1
+    fi
+}
+
+# Pack the workspace into an OCR-D zip; exit 1 on failure.
+generate_ocrd_zip() {
+    log_info "Generating an OCR-D zip..."
+    if ! docker_wrapper ocrd zip bag -i "$PROCESS_TITLE" -d "/data/$PROCESS_TITLE"; then
+        log_error "Failed to generate the OCR-D zip."
+        exit 1
+    fi
+}
+
+# Validate the generated OCR-D zip; exit 1 on failure.
+validate_ocrd_zip() {
+    log_info "Validating the OCR-D zip..."
+    if ! docker_wrapper ocrd zip validate "/data/$PROCESS_TITLE.ocrd.zip"; then
+        log_error "Validation failed. The OCR-D zip is not valid."
+        exit 1
+    fi
+}
+
+# Remove the workspace, ocrd.log, the uploaded zip and the container tmp dir.
+cleanup() {
+    # -f: not all of these exist when a ready-made zip was passed with -z.
+    # ${VAR:?} aborts rather than letting an empty value reach rm -r.
+    rm -rf -- "${WORKSPACE_DIR:?}" ocrd.log "${OCRD_RESULTS:?}" "${SCRIPT_PATH:?}/tmp"
+}
+
+main() {
+    PROCESS_TITLE=$(basename "$WORKSPACE_DIR")
+    PARENT_WORKSPACE=$(dirname "$WORKSPACE_DIR")
+
+    if [[ -z "${OLA_USR:-}" ]]; then
+        log_error "OLA-HD credentials are not given (set OLA_USR or use -o)."
+        exit 1
+    fi
+
+    if [[ "$RESULTS_AVAILABLE" == false ]]; then
+        if [[ -z "$METS_PATH_URL" ]]; then
+            log_error "METS URL is not given..."
+            exit 1
+        fi
+        create_workspace
+        generate_ocrd_zip
+        validate_ocrd_zip
+        OCRD_RESULTS="$WORKSPACE_DIR.ocrd.zip"
+    fi
+
+    upload_to_ola_hd
+    cleanup
+}
+
+main