diff --git a/README.md b/README.md index 136a1c8cb..075b68bd4 100644 --- a/README.md +++ b/README.md @@ -54,22 +54,34 @@ From the `./ocr/` directory, there are some Powershell ( ;-) ) scripts to recrea 1. `cd ./ocr` -1. Make sure you have Tesseract installed. `brew install tesseract` on OSX. +1. There are three places to run OCR: on the deployed demo site pdf-discovery-demo.dev.o19s.com, on your locally deployed Docker service, or directly on your computer. -1. Check the `./tika-properties/.../TesseractOCRConfig.properties` file, make sure it points to your Tesseract setup. +1. Look at the file `./extract.ps1` to see which of the above three options is actually being used to run the extraction. + +1. If you are running on your local computer, first make sure you have Tesseract installed. `brew install tesseract` on OSX. + +1. Then check the `./tika-properties/.../TesseractOCRConfig.properties` file, make sure it points to your Tesseract setup. 1. Run the extraction process, creating the working docs in the `/extracts` directory from the PDF's in `/files`. ``` -pwsh extract-directory.ps1 ./files +pwsh extract-directory.ps1 ./files3 ./extracts3 ``` 1. Create Solr documents. ``` -pwsh create-solr-docs.ps1 ./extracts ./files ./docs_for_solr/ +pwsh create-solr-docs.ps1 ./extracts3 ./files3 ./docs_for_solr3/ ``` +1. Load Solr documents into Solr. + +Run the load script: + +``` +./init/load_sample_files.sh ./docs_for_solr3 http://localhost:8983/solr/documents/update +``` + ### Interested in manually extracting content from Tika Server? 
From the `./ocr/` directory run: diff --git a/docker-compose.yml b/docker-compose.yml index 471ae1eaf..8a011e068 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -26,7 +26,7 @@ services: - ./volumes/solr_backup:/solr_backup links: - zookeeper - command: "bash -c '/opt/solr/bin/solr start -f -z zookeeper:2181 -Dbootstrap_confdir=/solr-config/index/conf'" + command: "bash -c './post-hooks.sh & /opt/solr/bin/solr start -f -z zookeeper:2181 -Denable.packages=true -Dbootstrap_confdir=/solr-config/index/conf'" solr-proxy: build: ./solr-proxy diff --git a/ocr/extract-directory.ps1 b/ocr/extract-directory.ps1 index 514cf2b64..7b6074c1e 100644 --- a/ocr/extract-directory.ps1 +++ b/ocr/extract-directory.ps1 @@ -17,6 +17,6 @@ $pdf_files = Get-ChildItem -Path $source_directory –Recurse | Where-Object {$_ foreach ($pdf_file in $pdf_files) { Write-Host $pdf_file - Invoke-Expression "./extract.ps1 $pdf_file $extracts_directory" + Invoke-Expression "./extract2.ps1 $pdf_file $extracts_directory" } diff --git a/ocr/extract.ps1 b/ocr/extract.ps1 index 4b157c750..c0c9869c2 100644 --- a/ocr/extract.ps1 +++ b/ocr/extract.ps1 @@ -28,8 +28,9 @@ if(!(Test-Path($extract_file_json))){ Write-Host "About to Tika Extract PDF file $pdf_file" - $result = curl -T $pdf_file http://pdf-discovery-demo.dev.o19s.com:9998/rmeta --header "X-Tika-OCRLanguage: eng" --header "X-Tika-PDFOcrStrategy: ocr_and_text_extraction" --header "X-Tika-OCRoutputType: hocr" - #$result = curl -T $pdf_file http://localhost:9998/rmeta --header "X-Tika-OCRLanguage: eng" --header "X-Tika-PDFOcrStrategy: ocr_and_text_extraction" --header "X-Tika-OCRoutputType: hocr" + # Specify where we are OCR'ing the data + #$result = curl -T $pdf_file http://pdf-discovery-demo.dev.o19s.com:9998/rmeta --header "X-Tika-OCRLanguage: eng" --header "X-Tika-PDFOcrStrategy: ocr_and_text_extraction" --header "X-Tika-OCRoutputType: hocr" + $result = curl -T $pdf_file http://localhost:9998/rmeta --header "X-Tika-OCRLanguage: eng" --header 
"X-Tika-PDFOcrStrategy: ocr_and_text_extraction" --header "X-Tika-OCRoutputType: hocr" #$result = java -cp tika-app-1.24.1.jar org.apache.tika.cli.TikaCLI --config=tika-config.xml --xmp --jsonRecursive --extract --pretty-print -x $pdf_file Set-Content -Path $extract_file_json -Value $result diff --git a/ocr/init/init.sh b/ocr/init/init.sh index 960756b77..c60267913 100755 --- a/ocr/init/init.sh +++ b/ocr/init/init.sh @@ -5,10 +5,13 @@ # echo "Waiting on MySQL init..." # sleep 5 #done -#echo "Sleeping 15" -#sleep 15 +echo "Sleeping 15" +sleep 15 ./wait-for-solr.sh --max-attempts 10 --wait-seconds 4 --solr-url http://solr:8983 +echo "Sleeping 30 more" +sleep 30 + echo "Uploading security.json to ZK" java -jar ./jackhanna-0.0.4-SNAPSHOT.jar zookeeper:2181 putfile --file security.json --zkFile /security.json diff --git a/solr/Dockerfile b/solr/Dockerfile index ce5ac3cd9..3b15cdbb1 100644 --- a/solr/Dockerfile +++ b/solr/Dockerfile @@ -1,4 +1,4 @@ -FROM solr:8.11.1 +FROM solr:8.11.4 # Add Tesseract USER root @@ -12,8 +12,9 @@ RUN mkdir -p /home/solr/ # Cache dir for fonts from Tika. 
RUN chown -R 8983:8983 /home/solr # Add Solr customizations -ADD lib/*.jar /opt/solr/server/solr-webapp/webapp/WEB-INF/lib/ ADD web.xml /opt/solr/server/solr-webapp/webapp/WEB-INF +ADD post-hooks.sh /opt/solr +RUN chmod +x /opt/solr/post-hooks.sh COPY solr-home /solr-config ADD set_heap.sh /docker-entrypoint-initdb.db diff --git a/solr/lib/offset-hl-formatter-1.0.1-solr7.1.0-SNAPSHOT.jar b/solr/lib/offset-hl-formatter-1.0.1-solr7.1.0-SNAPSHOT.jar deleted file mode 100644 index be56dc4af..000000000 Binary files a/solr/lib/offset-hl-formatter-1.0.1-solr7.1.0-SNAPSHOT.jar and /dev/null differ diff --git a/solr/lib/solr-payloads-1.0.3-solr7.1.0-SNAPSHOT.jar b/solr/lib/solr-payloads-1.0.3-solr7.1.0-SNAPSHOT.jar deleted file mode 100644 index 3cb00a97e..000000000 Binary files a/solr/lib/solr-payloads-1.0.3-solr7.1.0-SNAPSHOT.jar and /dev/null differ diff --git a/solr/post-hooks.sh b/solr/post-hooks.sh new file mode 100644 index 000000000..b22994610 --- /dev/null +++ b/solr/post-hooks.sh @@ -0,0 +1,6 @@ +wait-for-solr.sh --max-attempts 25 --wait-seconds 4 --solr-url http://solr:8983 + +bin/solr package add-repo osc https://raw.githubusercontent.com/o19s/payload-component/master/repo +bin/solr package install solr-payloads:1.1.4 + + diff --git a/solr/solr-home/index/conf/schema.xml b/solr/solr-home/index/conf/schema.xml index 435338e42..8da52ad9a 100644 --- a/solr/solr-home/index/conf/schema.xml +++ b/solr/solr-home/index/conf/schema.xml @@ -67,13 +67,13 @@ - + - + - + diff --git a/solr/solr-home/index/conf/solrconfig.xml b/solr/solr-home/index/conf/solrconfig.xml index de9505fa6..f5f81e5e7 100644 --- a/solr/solr-home/index/conf/solrconfig.xml +++ b/solr/solr-home/index/conf/solrconfig.xml @@ -134,7 +134,7 @@ - + + class="solr-payloads:com.o19s.hl.OffsetFormatter">