diff --git a/README.md b/README.md
index 136a1c8cb..075b68bd4 100644
--- a/README.md
+++ b/README.md
@@ -54,22 +54,34 @@ From the `./ocr/` directory, there are some Powershell ( ;-) ) scripts to recrea
1. `cd ./ocr`
-1. Make sure you have Tesseract installed. `brew install tesseract` on OSX.
+1. OCR can run in one of three places: on the deployed demo site pdf-discovery-demo.dev.o19s.com, on your local Docker-deployed service, or directly on your computer.
-1. Check the `./tika-properties/.../TesseractOCRConfig.properties` file, make sure it points to your Tesseract setup.
+1. Look at the file `./extract.ps1` to see which of the above three options the extraction is actually using.
+
+1. If you are running on your local computer, first make sure you have Tesseract installed. `brew install tesseract` on OSX.
+
+1. Then check the `./tika-properties/.../TesseractOCRConfig.properties` file, make sure it points to your Tesseract setup.
1. Run the extraction process, creating the working docs in the `/extracts` directory from the PDF's in `/files`.
```
-pwsh extract-directory.ps1 ./files
+pwsh extract-directory.ps1 ./files3 ./extracts3
```
1. Create Solr documents.
```
-pwsh create-solr-docs.ps1 ./extracts ./files ./docs_for_solr/
+pwsh create-solr-docs.ps1 ./extracts3 ./files3 ./docs_for_solr3/
```
+1. Load Solr documents into Solr.
+
+Run the load script:
+
+```
+./init/load_sample_files.sh ./docs_for_solr3 http://localhost:8983/solr/documents/update
+```
+
### Interested in manually extracting content from Tika Server?
From the `./ocr/` directory run:
diff --git a/docker-compose.yml b/docker-compose.yml
index 471ae1eaf..8a011e068 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -26,7 +26,7 @@ services:
- ./volumes/solr_backup:/solr_backup
links:
- zookeeper
- command: "bash -c '/opt/solr/bin/solr start -f -z zookeeper:2181 -Dbootstrap_confdir=/solr-config/index/conf'"
+ command: "bash -c './post-hooks.sh & /opt/solr/bin/solr start -f -z zookeeper:2181 -Denable.packages=true -Dbootstrap_confdir=/solr-config/index/conf'"
solr-proxy:
build: ./solr-proxy
diff --git a/ocr/extract-directory.ps1 b/ocr/extract-directory.ps1
index 514cf2b64..7b6074c1e 100644
--- a/ocr/extract-directory.ps1
+++ b/ocr/extract-directory.ps1
@@ -17,6 +17,6 @@ $pdf_files = Get-ChildItem -Path $source_directory –Recurse | Where-Object {$_
foreach ($pdf_file in $pdf_files) {
Write-Host $pdf_file
- Invoke-Expression "./extract.ps1 $pdf_file $extracts_directory"
+ Invoke-Expression "./extract2.ps1 $pdf_file $extracts_directory"
}
diff --git a/ocr/extract.ps1 b/ocr/extract.ps1
index 4b157c750..c0c9869c2 100644
--- a/ocr/extract.ps1
+++ b/ocr/extract.ps1
@@ -28,8 +28,9 @@ if(!(Test-Path($extract_file_json))){
Write-Host "About to Tika Extract PDF file $pdf_file"
- $result = curl -T $pdf_file http://pdf-discovery-demo.dev.o19s.com:9998/rmeta --header "X-Tika-OCRLanguage: eng" --header "X-Tika-PDFOcrStrategy: ocr_and_text_extraction" --header "X-Tika-OCRoutputType: hocr"
- #$result = curl -T $pdf_file http://localhost:9998/rmeta --header "X-Tika-OCRLanguage: eng" --header "X-Tika-PDFOcrStrategy: ocr_and_text_extraction" --header "X-Tika-OCRoutputType: hocr"
+ # Specify where we are OCR'ing the data
+ #$result = curl -T $pdf_file http://pdf-discovery-demo.dev.o19s.com:9998/rmeta --header "X-Tika-OCRLanguage: eng" --header "X-Tika-PDFOcrStrategy: ocr_and_text_extraction" --header "X-Tika-OCRoutputType: hocr"
+ $result = curl -T $pdf_file http://localhost:9998/rmeta --header "X-Tika-OCRLanguage: eng" --header "X-Tika-PDFOcrStrategy: ocr_and_text_extraction" --header "X-Tika-OCRoutputType: hocr"
#$result = java -cp tika-app-1.24.1.jar org.apache.tika.cli.TikaCLI --config=tika-config.xml --xmp --jsonRecursive --extract --pretty-print -x $pdf_file
Set-Content -Path $extract_file_json -Value $result
diff --git a/ocr/init/init.sh b/ocr/init/init.sh
index 960756b77..c60267913 100755
--- a/ocr/init/init.sh
+++ b/ocr/init/init.sh
@@ -5,10 +5,13 @@
# echo "Waiting on MySQL init..."
# sleep 5
#done
-#echo "Sleeping 15"
-#sleep 15
+echo "Sleeping 15"
+sleep 15
./wait-for-solr.sh --max-attempts 10 --wait-seconds 4 --solr-url http://solr:8983
+echo "Sleeping 30 more"
+sleep 30
+
echo "Uploading security.json to ZK"
java -jar ./jackhanna-0.0.4-SNAPSHOT.jar zookeeper:2181 putfile --file security.json --zkFile /security.json
diff --git a/solr/Dockerfile b/solr/Dockerfile
index ce5ac3cd9..3b15cdbb1 100644
--- a/solr/Dockerfile
+++ b/solr/Dockerfile
@@ -1,4 +1,4 @@
-FROM solr:8.11.1
+FROM solr:8.11.4
# Add Tesseract
USER root
@@ -12,8 +12,9 @@ RUN mkdir -p /home/solr/ # Cache dir for fonts from Tika.
RUN chown -R 8983:8983 /home/solr
# Add Solr customizations
-ADD lib/*.jar /opt/solr/server/solr-webapp/webapp/WEB-INF/lib/
ADD web.xml /opt/solr/server/solr-webapp/webapp/WEB-INF
+ADD post-hooks.sh /opt/solr
+RUN chmod +x /opt/solr/post-hooks.sh
COPY solr-home /solr-config
ADD set_heap.sh /docker-entrypoint-initdb.db
diff --git a/solr/lib/offset-hl-formatter-1.0.1-solr7.1.0-SNAPSHOT.jar b/solr/lib/offset-hl-formatter-1.0.1-solr7.1.0-SNAPSHOT.jar
deleted file mode 100644
index be56dc4af..000000000
Binary files a/solr/lib/offset-hl-formatter-1.0.1-solr7.1.0-SNAPSHOT.jar and /dev/null differ
diff --git a/solr/lib/solr-payloads-1.0.3-solr7.1.0-SNAPSHOT.jar b/solr/lib/solr-payloads-1.0.3-solr7.1.0-SNAPSHOT.jar
deleted file mode 100644
index 3cb00a97e..000000000
Binary files a/solr/lib/solr-payloads-1.0.3-solr7.1.0-SNAPSHOT.jar and /dev/null differ
diff --git a/solr/post-hooks.sh b/solr/post-hooks.sh
new file mode 100644
index 000000000..b22994610
--- /dev/null
+++ b/solr/post-hooks.sh
@@ -0,0 +1,6 @@
+wait-for-solr.sh --max-attempts 25 --wait-seconds 4 --solr-url http://solr:8983
+
+bin/solr package add-repo osc https://raw.githubusercontent.com/o19s/payload-component/master/repo
+bin/solr package install solr-payloads:1.1.4
+
+
diff --git a/solr/solr-home/index/conf/schema.xml b/solr/solr-home/index/conf/schema.xml
index 435338e42..8da52ad9a 100644
--- a/solr/solr-home/index/conf/schema.xml
+++ b/solr/solr-home/index/conf/schema.xml
@@ -67,13 +67,13 @@
-
+
-
+
-
+
diff --git a/solr/solr-home/index/conf/solrconfig.xml b/solr/solr-home/index/conf/solrconfig.xml
index de9505fa6..f5f81e5e7 100644
--- a/solr/solr-home/index/conf/solrconfig.xml
+++ b/solr/solr-home/index/conf/solrconfig.xml
@@ -134,7 +134,7 @@
-
+
+ class="solr-payloads:com.o19s.hl.OffsetFormatter">