From f44df3927658a2ddd7ec9068dc6fd842acc0263e Mon Sep 17 00:00:00 2001 From: Ivan Mrsulja Date: Wed, 11 Dec 2024 19:44:37 +0100 Subject: [PATCH 01/21] Added initial version of dspace harvest via oaipmh. --- .../changenamespace-all.config.xml | 20 ++ .../diff-additions.config.xml | 91 ++++++++ .../diff-subtractions.config.xml | 90 ++++++++ .../harvested-data.model.xml | 152 +++++++++++++ .../match-authorship.conf.xml | 31 +++ .../match-main.conf.xml | 31 +++ .../match-pub.conf.xml | 31 +++ .../oaifetch-dc.datamap.xsl | 206 ++++++++++++++++++ .../example-oaifetch-dspace/oaifetch.conf.xml | 20 ++ .../previous-harvest.model.xml | 10 + .../raw-records.config.xml | 10 + .../example-oaifetch-dspace/run-oaifetch.sh | 159 ++++++++++++++ .../score-author.conf.xml | 22 ++ .../score-authorship.conf.xml | 25 +++ .../score-data.model.xml | 152 +++++++++++++ .../score-pub.conf.xml | 65 ++++++ .../score-publisher.conf.xml | 31 +++ .../score-vcard-name.conf.xml | 37 ++++ .../score-vcard.conf.xml | 31 +++ .../translated-records.config.xml | 10 + .../truncate.config.xml | 12 + .../example-oaifetch-dspace/usage.txt | 10 + .../example-oaifetch-dspace/vivo.model.xml | 106 +++++++++ .../xsltranslator.config.xml | 20 ++ .../org/vivoweb/harvester/util/FileAide.java | 29 ++- 25 files changed, 1397 insertions(+), 4 deletions(-) create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/changenamespace-all.config.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/diff-additions.config.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/diff-subtractions.config.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/harvested-data.model.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/match-authorship.conf.xml 
create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/match-main.conf.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/match-pub.conf.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch-dc.datamap.xsl create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch.conf.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/previous-harvest.model.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/raw-records.config.xml create mode 100755 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.sh create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-author.conf.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-authorship.conf.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-data.model.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-pub.conf.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-publisher.conf.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-vcard-name.conf.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-vcard.conf.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/translated-records.config.xml create mode 100644 
example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/truncate.config.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/usage.txt create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/vivo.model.xml create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/xsltranslator.config.xml diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/changenamespace-all.config.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/changenamespace-all.config.xml new file mode 100644 index 000000000..6cdcf2450 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/changenamespace-all.config.xml @@ -0,0 +1,20 @@ + + + + + harvested-data.model.xml + + + vivo.model.xml + + + http://vivo.example.com/harvest + + http://localhost:8080/vivo-1.15/individual/ + INFO + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/diff-additions.config.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/diff-additions.config.xml new file mode 100644 index 000000000..f104c564c --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/diff-additions.config.xml @@ -0,0 +1,91 @@ + + + + + + + + harvested-data.model.xml + + + + + previous-harvest.model.xml + + + + + xmlrdf=data/vivo-additions.rdf.xml + xmlrdf=RDF/XML + ntriple=data/vivo-ntriple-additions.xml + ntriple=N-TRIPLE + + + + + + + + + + + + + + INFO + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/diff-subtractions.config.xml 
b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/diff-subtractions.config.xml new file mode 100644 index 000000000..ac7202f0a --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/diff-subtractions.config.xml @@ -0,0 +1,90 @@ + + + + + + + + previous-harvest.model.xml + + + + + harvested-data.model.xml + + + + + xmlrdf=data/vivo-subtractions.rdf.xml + ntriple=data/vivo-ntriple-subtractions.xml + ntriple=N-TRIPLE + + + + + + + + + + + + + + INFO + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/harvested-data.model.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/harvested-data.model.xml new file mode 100644 index 000000000..3619908ed --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/harvested-data.model.xml @@ -0,0 +1,152 @@ + + + + + + + + + + + + + tdb + data/harvested-data/ + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/match-authorship.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/match-authorship.conf.xml new file mode 100644 index 000000000..044b9f505 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/match-authorship.conf.xml @@ -0,0 +1,31 @@ + + + + + harvested-data.model.xml + + + score-data.model.xml + + + true + + + 0.7 + INFO + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/match-main.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/match-main.conf.xml new file mode 100644 index 000000000..044b9f505 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/match-main.conf.xml @@ 
-0,0 +1,31 @@ + + + + + harvested-data.model.xml + + + score-data.model.xml + + + true + + + 0.7 + INFO + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/match-pub.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/match-pub.conf.xml new file mode 100644 index 000000000..044b9f505 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/match-pub.conf.xml @@ -0,0 +1,31 @@ + + + + + harvested-data.model.xml + + + score-data.model.xml + + + true + + + 0.7 + INFO + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch-dc.datamap.xsl b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch-dc.datamap.xsl new file mode 100644 index 000000000..06980c35e --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch-dc.datamap.xsl @@ -0,0 +1,206 @@ + + + + + + http://vivo.example.com/harvest/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Authorship for + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Authorship for + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch.conf.xml new file mode 100644 index 000000000..f4bae937d --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch.conf.xml @@ -0,0 +1,20 @@ + + + + +https://demo.dspace.org/server/oai/request + + +2020-10-01T00:00:00Z 
+2024-12-31T00:00:00Z +oai_dc +raw-records.config.xml +INFO + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/previous-harvest.model.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/previous-harvest.model.xml new file mode 100644 index 000000000..6542cb903 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/previous-harvest.model.xml @@ -0,0 +1,10 @@ + + + + tdb + previous-harvest + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/raw-records.config.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/raw-records.config.xml new file mode 100644 index 000000000..5203936d2 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/raw-records.config.xml @@ -0,0 +1,10 @@ + + + + org.vivoweb.harvester.util.repo.TextFileRecordHandler + data/raw-records + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.sh b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.sh new file mode 100755 index 000000000..6207d3090 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.sh @@ -0,0 +1,159 @@ +#!/bin/bash + +#Copyright (c) 2010-2011 VIVO Harvester Team. For full list of contributors, please see the AUTHORS file provided. +#All rights reserved. 
+#This program and the accompanying materials are made available under the terms of the new BSD license which accompanies this distribution, and is available at http://www.opensource.org/licenses/bsd-license.html + +# set to the directory where the harvester was installed or unpacked +# HARVESTER_INSTALL_DIR is set to the location of the installed harvester +# If the deb file was used to install the harvester then the +# directory should be set to /usr/share/vivo/harvester which is the +# current location associated with the deb installation. +# Since it is also possible the harvester was installed by +# uncompressing the tar.gz the setting is available to be changed +# and should agree with the installation location +#export HARVESTER_INSTALL_DIR=/usr/local/src/VIVO-Harvester +HARVESTER_INSTALL_DIR=$(pwd)/../../../../../../VIVO-Harvester +export HARVEST_NAME=example-oaifetch-dspace +export DATE=`date +%Y-%m-%d'T'%T` + +# Add harvester binaries to path for execution +# The tools within this script refer to binaries supplied within the harvester +# Since they can be located in another directory their path should be +# included within the classpath and the path environment variables. +export PATH=$PATH:$HARVESTER_INSTALL_DIR/bin +export CLASSPATH=$CLASSPATH:$HARVESTER_INSTALL_DIR/bin/harvester.jar:$HARVESTER_INSTALL_DIR/bin/dependency/* +export CLASSPATH=$CLASSPATH:$HARVESTER_INSTALL_DIR/build/harvester.jar:$HARVESTER_INSTALL_DIR/build/dependency/* + +# Exit on first error +# The -e flag prevents the script from continuing even though a tool fails. +# Continuing after a tool failure is undesirable since the harvested +# data could be rendered corrupted and incompatible. +set -e + +# Supply the location of the detailed log file which is generated during the script. +# If there is an issue with a harvest, this file proves invaluable in finding +# a solution to the problem. It has become common practice in addressing a problem +# to request this file. 
The passwords and usernames are filtered out of this file +# to prevent these logs from containing sensitive information. +echo "Full Logging in $HARVEST_NAME.$DATE.log" +if [ ! -d logs ]; then + mkdir logs +fi +cd logs +touch $HARVEST_NAME.$DATE.log +ln -sf $HARVEST_NAME.$DATE.log $HARVEST_NAME.latest.log +cd .. + +#clear old data +# For a fresh harvest, the removal of the previous information maintains data integrity. +# If you are continuing a partial run or wish to use the old and already retrieved +# data, you will want to comment out this line since it could prevent you from having +# the required harvest data. +rm -rf data + +# Execute Fetch +# This stage of the script is where the information is gathered together into one local +# place to facilitate the further steps of the harvest. The data is stored locally +# in a format based off of the source. The format is a form of RDF but not in the VIVO ontology + +java $HARVESTER_JAVA_OPTS org.vivoweb.harvester.fetch.OAIFetch -X oaifetch.conf.xml +# Execute Translate +# This is the part of the script where the input data is transformed into valid RDF +# Translate will apply an xslt file to the fetched data which will result in the data +# becoming valid RDF in the VIVO ontology +harvester-xsltranslator -X xsltranslator.config.xml + +# Execute Transfer to import from record handler into local temp model +# From this stage on the script places the data into a Jena model. A model is a +# data storage structure similar to a database, but in RDF. +# The harvester tool Transfer is used to move/add/remove/dump data in models. 
+# For this call on the transfer tool: +# -s refers to the source translated records file, which was just produced by the translator step +# -o refers to the destination model for harvested data +# -d means that this call will also produce a text dump file in the specified location +harvester-transfer -w INFO -s translated-records.config.xml -o harvested-data.model.xml -d data/harvested-data/imported-records.rdf.xml + +#Score on publications +# Compare names of publications in VIVO with names of publications from the OAI-PMH harvest and assign a +# score value indicating how likely they are the same publication. +#harvester-score -X score-pub.conf.xml + +#Match publications +# Use the score values from the previous score to rename publications which we deem are the +# same, so that they match the URI of the publication in VIVO. +#harvester-match -X match-pub.conf.xml + +#Score on authors, organizations, geographic locations, journals, hyperlinks, and +# date-time intervals +# Same as above, but for authors, organizations, geographic locations, journals, hyperlinks, +# and date-time intervals. +#harvester-score -X score-author.conf.xml +#harvester-score -X score-publisher.conf.xml + +#harvester-score -X score-interval.conf.xml +#harvester-score -X score-datetime.conf.xml + +#Match +# Rename matches scored above. +#harvester-match -X match-main.conf.xml + +#Score on authorships +# Same as above, but for authorships. +#harvester-score -X score-authorship.conf.xml + +#Match +# Rename matching authorships. +#harvester-match -X match-authorship.conf.xml + +#Truncate Score Data model +#harvester-jenaconnect -X truncate.config.xml + + +# Execute ChangeNamespace to get unmatched publications into current namespace +# This is where the new people from the harvest are given uris within the namespace of Vivo +# If there is an issue with uris being in another namespace after import, make sure this step +# was completed for those uris. 
+ +#echo "changenamespace" +#harvester-changenamespace -X changenamespace-all.config.xml + + +# Dump harvested data model for testing +# harvester-jenaconnect -X harvested-dump.config.xml + +# Perform an update +# The harvester maintains copies of previous harvests in order to perform the same harvest twice +# but only add the new statements, while removing the old statements that are no longer +# contained in the input data. This is done in several steps of finding the old statements, +# then the new statements, and then applying them to the Vivo main model. + +# Find Subtractions +# When making the previous harvest model agree with the current harvest, the statements that exist in +# the previous harvest but not in the current harvest need to be identified for removal. +harvester-diff -X diff-subtractions.config.xml + +# Find Additions +# When making the previous harvest model agree with the current harvest, the statements that exist in +# the current harvest but not in the previous harvest need to be identified for addition. +harvester-diff -X diff-additions.config.xml + +# Apply Subtractions to Previous model +harvester-transfer -w info -o previous-harvest.model.xml -r data/vivo-subtractions.rdf.xml -m +# Apply Additions to Previous model +harvester-transfer -w info -o previous-harvest.model.xml -r data/vivo-additions.rdf.xml + +# Now that the changes have been applied to the previous harvest and the harvested data in vivo +# agree with the previous harvest, the changes are now applied to the vivo model. 
+# Apply Subtractions to VIVO model +harvester-transfer -w info -o vivo.model.xml -r data/vivo-subtractions.rdf.xml -m +# Apply Additions to VIVO model +harvester-transfer -w info -o vivo.model.xml -r data/vivo-additions.rdf.xml + +#Output some counts +PUBS=`cat data/vivo-additions.rdf.xml | grep oai | wc -l` +AUTHORS=`cat data/vivo-additions.rdf.xml | grep 'http://xmlns.com/foaf/0.1/Person' | wc -l` +AUTHORSHIPS=`cat data/vivo-additions.rdf.xml | grep Authorship | wc -l` +echo "Imported $PUBS publications, $AUTHORS authors, and $AUTHORSHIPS authorships" + +echo 'Harvest completed successfully' diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-author.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-author.conf.xml new file mode 100644 index 000000000..9b2647663 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-author.conf.xml @@ -0,0 +1,22 @@ + + + + harvested-data.model.xml + vivo.model.xml + score-data.model.xml + fName=org.vivoweb.harvester.score.algorithm.NormalizedLevenshteinDifference + fName=http://xmlns.com/foaf/0.1/firstName + fName=0.3 + fName=http://xmlns.com/foaf/0.1/firstName + lName=org.vivoweb.harvester.score.algorithm.NormalizedLevenshteinDifference + lName=http://xmlns.com/foaf/0.1/lastName + lName=0.5 + lName=http://xmlns.com/foaf/0.1/lastName + http://vivo.example.com/harvest/author/ + INFO + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-authorship.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-authorship.conf.xml new file mode 100644 index 000000000..e4330f91b --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-authorship.conf.xml @@ -0,0 +1,25 @@ + + + + harvested-data.model.xml + 
vivo.model.xml + score-data.model.xml + + authpub=org.vivoweb.harvester.score.algorithm.EqualityTest + authpub=http://vivoweb.org/ontology/core#linkedInformationResource + authpub=0.5 + authpub=http://vivoweb.org/ontology/core#linkedInformationResource + + authauth=org.vivoweb.harvester.score.algorithm.EqualityTest + authauth=http://vivoweb.org/ontology/core#linkedAuthor + authauth=0.5 + authauth=http://vivoweb.org/ontology/core#linkedAuthor + + http://vivo.example.com/harvest/authorship/ + INFO + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-data.model.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-data.model.xml new file mode 100644 index 000000000..2b7c3b0b4 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-data.model.xml @@ -0,0 +1,152 @@ + + + + + + + + + + + + + tdb + data/score-data/ + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-pub.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-pub.conf.xml new file mode 100644 index 000000000..90b40902f --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-pub.conf.xml @@ -0,0 +1,65 @@ + + + + + harvested-data.model.xml + + + vivo.model.xml + + + score-data.model.xml + + + title=org.vivoweb.harvester.score.algorithm.EqualityTest + + + title=http://www.w3.org/2000/01/rdf-schema#label + + + title=1.0 + + + title=http://www.w3.org/2000/01/rdf-schema#label + + + http://vivo.example.com/harvest/oai/ + INFO + + + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-publisher.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-publisher.conf.xml new file mode 
100644 index 000000000..608ba66b5 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-publisher.conf.xml @@ -0,0 +1,31 @@ + + + + DEBUG + + harvested-data.model.xml + + + vivo.model.xml + + + score-data.model.xml + + + + http://vivo.example.com/harvest/publisher/ + + + + + vcard=org.vivoweb.harvester.score.algorithm.EqualityTest + vcard=1.0 + vcard=http://www.w3.org/2000/01/rdf-schema#label + vcard=http://www.w3.org/2000/01/rdf-schema#label + + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-vcard-name.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-vcard-name.conf.xml new file mode 100644 index 000000000..be5e39376 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-vcard-name.conf.xml @@ -0,0 +1,37 @@ + + + + INFO + + harvested-data.model.xml + + + vivo.model.xml + + + score-data.model.xml + + + + http://vivo.example.com/harvest/vcardName/ + + + + + givenName=org.vivoweb.harvester.score.algorithm.EqualityTest + givenName=0.5 + givenName=http://www.w3.org/2006/vcard/ns#givenName + givenName=http://www.w3.org/2006/vcard/ns#givenName + + + familyName=org.vivoweb.harvester.score.algorithm.EqualityTest + familyName=0.5 + familyName=http://www.w3.org/2006/vcard/ns#familyName + familyName=http://www.w3.org/2006/vcard/ns#familyName + + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-vcard.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-vcard.conf.xml new file mode 100644 index 000000000..aa6fab919 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-vcard.conf.xml @@ -0,0 +1,31 @@ + + + + DEBUG + + harvested-data.model.xml + + + vivo.model.xml + + + 
score-data.model.xml + + + + http://vivo.example.com/harvest/vcard/ + + + + + vcard=org.vivoweb.harvester.score.algorithm.EqualityTest + vcard=1.0 + vcard=http://www.w3.org/2000/01/rdf-schema#label + vcard=http://www.w3.org/2000/01/rdf-schema#label + + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/translated-records.config.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/translated-records.config.xml new file mode 100644 index 000000000..76dc28b22 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/translated-records.config.xml @@ -0,0 +1,10 @@ + + + + org.vivoweb.harvester.util.repo.TextFileRecordHandler + data/translated-records + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/truncate.config.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/truncate.config.xml new file mode 100644 index 000000000..f4a89f4a1 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/truncate.config.xml @@ -0,0 +1,12 @@ + + + + + score-data.model.xml + true + INFO + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/usage.txt b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/usage.txt new file mode 100644 index 000000000..c0ad663a6 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/usage.txt @@ -0,0 +1,10 @@ +This example demonstrates fetching records from an OAI-PMH provider which exposes content using mets and +eventually loading the resulting RDF into a VIVO instance. + +You will need to: + +- Examine the run-oaifetch.sh script and change the location of the VIVO Harvesters if necessary. 
+- Modify the vivo.model.xml file to provide parameters for accessing your VIVO web application. + + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/vivo.model.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/vivo.model.xml new file mode 100644 index 000000000..218a6f514 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/vivo.model.xml @@ -0,0 +1,106 @@ + + + + + + tdb + /home/ivanmrsulja/Desktop/posao/vivo/vivo_home/tdbContentModels + http://vitro.mannlib.cornell.edu/default/vitro-kb-2 + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/xsltranslator.config.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/xsltranslator.config.xml new file mode 100644 index 000000000..c201831b6 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/xsltranslator.config.xml @@ -0,0 +1,20 @@ + + + + INFO + + raw-records.config.xml + + + translated-records.config.xml + + + oaifetch-dc.datamap.xsl + + true + + diff --git a/src/main/java/org/vivoweb/harvester/util/FileAide.java b/src/main/java/org/vivoweb/harvester/util/FileAide.java index 1a3b85807..095983652 100644 --- a/src/main/java/org/vivoweb/harvester/util/FileAide.java +++ b/src/main/java/org/vivoweb/harvester/util/FileAide.java @@ -24,6 +24,7 @@ import org.apache.commons.vfs.VFS; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.vivoweb.harvester.util.repo.TextFileRecordHandler; /** * Assists in common tasks using Files @@ -300,15 +301,35 @@ public static OutputStream getOutputStream(String path, boolean append) throws I * @throws IOException error resolving path */ public static Set getNonHiddenChildren(String path) throws IOException { - Set allFileListing = new HashSet(); - for(FileObject file : 
getFileObject(path).findFiles(Selectors.SELECT_CHILDREN)) { - if(!file.isHidden() && (file.getType() == FileType.FILE)) { + Set allFileListing = new HashSet<>(); + + // Process sub-directories + for (FileObject file : getFileObject(path).findFiles(Selectors.SELECT_CHILDREN)) { + if (!file.isHidden() && file.getType() == FileType.FOLDER) { + // Move all files in children directories to the parent folder + for (FileObject childFile : file.findFiles(Selectors.SELECT_CHILDREN)) { + if (!childFile.isHidden() && childFile.getType() == FileType.FILE) { + FileObject parentFolder = file.getParent(); + if (parentFolder != null) { + FileObject targetFile = parentFolder.resolveFile(childFile.getName().getBaseName()); + childFile.moveTo(targetFile); + } + } + } + } + } + + // Process files directly under the given path + for (FileObject file : getFileObject(path).findFiles(Selectors.SELECT_CHILDREN)) { + if (!file.isHidden() && file.getType() == FileType.FILE) { allFileListing.add(file.getName().getBaseName()); } } + return allFileListing; } - + + /** * Get an inputstream from the first file under the given path with a matching fileName * @param path the path to search under From 00ba9e2815268799a078d062120b8ea393d9a062 Mon Sep 17 00:00:00 2001 From: Ivan Mrsulja Date: Thu, 12 Dec 2024 14:14:14 +0100 Subject: [PATCH 02/21] Refactored ETL process, everything that is crucial works now. 
--- ...etch.conf.xml => dspace-oaifetch.conf.xml} | 0 .../oaifetch-dc.datamap.xsl | 193 +++++++++++------- .../example-oaifetch-dspace/run-oaifetch.bat | 93 +++++++++ .../example-oaifetch-dspace/run-oaifetch.sh | 6 +- .../score-author.conf.xml | 8 +- .../score-authorship.conf.xml | 2 +- .../score-vcard-name.conf.xml | 2 +- .../score-vcard.conf.xml | 2 +- .../example-oaifetch-dspace/usage.txt | 15 +- 9 files changed, 231 insertions(+), 90 deletions(-) rename example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/{oaifetch.conf.xml => dspace-oaifetch.conf.xml} (100%) create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.bat diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml similarity index 100% rename from example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch.conf.xml rename to example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch-dc.datamap.xsl b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch-dc.datamap.xsl index 06980c35e..6181695ed 100644 --- a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch-dc.datamap.xsl +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch-dc.datamap.xsl @@ -24,20 +24,34 @@ xmlns:stringhash="java:org.vivoweb.harvester.util.xslt.StringHash" extension-element-prefixes = "stringhash" > - - + + + + http://vivo.example.com/harvest/ + + + - - + + + + + + + + + @@ -48,74 +62,10 @@ - - - - - - - - - - - - - - - 
- Authorship for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Authorship for - - - - - - - - - - - - - - - - - + @@ -130,7 +80,7 @@ - + @@ -149,6 +99,10 @@ + + + + @@ -160,7 +114,7 @@ - + @@ -203,4 +157,101 @@ + + + + + + + + + + + + + + + + + + + Authorship for + + + + + + + + + + + + + + + + + + + + + + + + + + vCard for: + + + + + + + + vCard name for: + + + + + + + + + + + + + + + + + + + + + + + Authorship for + + + + + + + + + + + + + + + + + + + + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.bat b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.bat new file mode 100644 index 000000000..2370cc231 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.bat @@ -0,0 +1,93 @@ +@echo off + +IF exist data ( + rmdir /s /q data +) + +IF exist logs ( + rmdir /s /q logs +) + +REM set to the directory where the harvester was installed or unpacked +REM HARVESTER_INSTALL_DIR is set to the location of the installed harvester +REM If the deb file was used to install the harvester then the +REM directory should be set to /usr/share/vivo/harvester which is the +REM current location associated with the deb installation. 
+REM Since it is also possible the harvester was installed by +REM uncompressing the tar.gz the setting is available to be changed +REM and should agree with the installation location +set HARVESTER_INSTALL_DIR= +set HARVEST_NAME=DSpace-OAI-Fetch +FOR %%A IN (%Date:/=%) DO SET Today=%%A + +REM set the CLASSPATH and HARVESTER_JAVA_OPTS to be used by all commands +set CLASSPATH=%HARVESTER_INSTALL_DIR%/build/harvester.jar;%HARVESTER_INSTALL_DIR%/build/dependency/* +set HARVESTER_JAVA_OPTS=-Xms1024M -Xmx2048M + +REM Execute Fetch +REM This stage of the script is where the information is gathered together into one local +REM place to facilitate the further steps of the harvest. The data is stored locally +REM in a format based off of the source. The format is a form of RDF but not in the VIVO ontology +echo Fetch from OpenAlex +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.fetch.JSONFetch -X dspace-oaifetch.config.xml + if %errorlevel% neq 0 exit /b %errorlevel% + +REM Execute Translate +REM This is the part of the script where the input data is transformed into valid RDF +REM Translate will apply an xslt file to the fetched data which will result in the data +REM becoming valid RDF in the VIVO ontology +echo Translate data to valid RDF +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.translate.XSLTranslator -X xsltranslator.config.xml + if %errorlevel% neq 0 exit /b %errorlevel% + +REM Execute Transfer to import from record handler into local temp model +REM From this stage on the script places the data into a Jena model. A model is a +REM data storage structure similar to a database, but in RDF. +REM The harvester tool Transfer is used to move/add/remove/dump data in models. 
+echo Transfer RDF into temporary triple store +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.transfer.Transfer -X transfer.config.xml + if %errorlevel% neq 0 exit /b %errorlevel% + +REM Perform an update +REM The harvester maintains copies of previous harvests in order to perform the same harvest twice +REM but only add the new statements, while removing the old statements that are no longer +REM contained in the input data. This is done in several steps of finding the old statements, +REM then the new statements, and then applying them to the Vivo main model. + +REM Find Subtractions +REM When making the previous harvest model agree with the current harvest, the statements that exist in +REM the previous harvest but not in the current harvest need to be identified for removal. +echo Find Subtractions +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.diff.Diff -X diff-subtractions.config.xml + if %errorlevel% neq 0 exit /b %errorlevel% + +REM Find Additions +REM When making the previous harvest model agree with the current harvest, the statements that exist in +REM the current harvest but not in the previous harvest need to be identified for addition. 
+echo Find Additions +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.diff.Diff -X diff-additions.config.xml + if %errorlevel% neq 0 exit /b %errorlevel% + +REM Apply Subtractions to Previous model +echo Apply Subtractions to Previous model +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.transfer.Transfer -w INFO -o previous-harvest.model.xml -r data/vivo-subtractions.rdf.xml -m + if %errorlevel% neq 0 exit /b %errorlevel% + +REM Apply Additions to Previous model +echo Apply Additions to Previous model +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.transfer.Transfer -w INFO -o previous-harvest.model.xml -r data/vivo-additions.rdf.xml + if %errorlevel% neq 0 exit /b %errorlevel% + +REM Now that the changes have been applied to the previous harvest and the harvested data in vivo +REM agree with the previous harvest, the changes are now applied to the vivo model. +REM Apply Subtractions to VIVO model +echo Apply Subtractions to VIVO model +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.transfer.Transfer -w INFO -o vivo.model.xml -r data/vivo-subtractions.rdf.xml -m + if %errorlevel% neq 0 exit /b %errorlevel% + +REM Apply Additions to VIVO model +echo Apply Additions to VIVO model +@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.transfer.Transfer -w INFO -o vivo.model.xml -r data/vivo-additions.rdf.xml + if %errorlevel% neq 0 exit /b %errorlevel% + +echo Harvest completed successfully \ No newline at end of file diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.sh b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.sh index 6207d3090..ba50b154f 100755 --- a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.sh +++ 
b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.sh @@ -12,9 +12,9 @@ # Since it is also possible the harvester was installed by # uncompressing the tar.gz the setting is available to be changed # and should agree with the installation location -#export HARVESTER_INSTALL_DIR=/usr/local/src/VIVO-Harvester +#export HARVESTER_INSTALL_DIR= HARVESTER_INSTALL_DIR=$(pwd)/../../../../../../VIVO-Harvester -export HARVEST_NAME=example-oaifetch-dspace +export HARVEST_NAME=DSpace-OAI-fetch export DATE=`date +%Y-%m-%d'T'%T` # Add harvester binaries to path for execution @@ -56,8 +56,8 @@ rm -rf data # This stage of the script is where the information is gathered together into one local # place to facilitate the further steps of the harvest. The data is stored locally # in a format based off of the source. The format is a form of RDF but not in the VIVO ontology +java $HARVESTER_JAVA_OPTS org.vivoweb.harvester.fetch.OAIFetch -X dspace-oaifetch.conf.xml -java $HARVESTER_JAVA_OPTS org.vivoweb.harvester.fetch.OAIFetch -X oaifetch.conf.xml # Execute Translate # This is the part of the script where the input data is transformed into valid RDF # Translate will apply an xslt file to the fetched data which will result in the data diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-author.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-author.conf.xml index 9b2647663..d43358151 100644 --- a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-author.conf.xml +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-author.conf.xml @@ -9,13 +9,9 @@ vivo.model.xml score-data.model.xml fName=org.vivoweb.harvester.score.algorithm.NormalizedLevenshteinDifference - fName=http://xmlns.com/foaf/0.1/firstName + 
fName=http://www.w3.org/2000/01/rdf-schema#label fName=0.3 - fName=http://xmlns.com/foaf/0.1/firstName - lName=org.vivoweb.harvester.score.algorithm.NormalizedLevenshteinDifference - lName=http://xmlns.com/foaf/0.1/lastName - lName=0.5 - lName=http://xmlns.com/foaf/0.1/lastName + fName=http://www.w3.org/2000/01/rdf-schema#label http://vivo.example.com/harvest/author/ INFO diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-authorship.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-authorship.conf.xml index e4330f91b..4b21b0451 100644 --- a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-authorship.conf.xml +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-authorship.conf.xml @@ -19,7 +19,7 @@ authauth=0.5 authauth=http://vivoweb.org/ontology/core#linkedAuthor - http://vivo.example.com/harvest/authorship/ + http://vivo.example.com/harvest/authorship_author/ INFO diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-vcard-name.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-vcard-name.conf.xml index be5e39376..7852f9eca 100644 --- a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-vcard-name.conf.xml +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-vcard-name.conf.xml @@ -17,7 +17,7 @@ - http://vivo.example.com/harvest/vcardName/ + http://vivo.example.com/harvest/vcardName_ diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-vcard.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-vcard.conf.xml index aa6fab919..629512762 100644 --- 
a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-vcard.conf.xml +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/score-vcard.conf.xml @@ -17,7 +17,7 @@ - http://vivo.example.com/harvest/vcard/ + http://vivo.example.com/harvest/vcard_ diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/usage.txt b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/usage.txt index c0ad663a6..e804eebb8 100644 --- a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/usage.txt +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/usage.txt @@ -1,10 +1,11 @@ -This example will demonstrate fetching oai-phm provider which exposes content using mets and -eventuall load RDF into a VIVO instance. +This example will demonstrate fetching DSpace oai-phm provider which exposes content +using DublinCore metadata format and loading it into a VIVO instance. You will need to: -- Examine the run-oaifetch.sh script and change the location of the VIVO Harvesters if necessary. -- Modify the vivo.model.xml file to provide parameters for accessing your VIVO web application. - - - +- Examine the run-oaifetch.sh (or .bat) script and type the location of the VIVO Harvester installation directory. +- You can uncomment the score and match functions if you want deduplication to be performed (it is turned off by default) +- Modify the vivo.model.xml file to provide parameters for accessing your VIVO web application. 
+- Shut down your VIVO instance (in order to free the TDB lock) +- Run run-oaifetch.sh (or .bat) +- Restart your VIVO instance and reindex the search indexes From d6ae4062ad0fdb11c28610784a4118d05bb77a46 Mon Sep 17 00:00:00 2001 From: Ivan Mrsulja Date: Thu, 12 Dec 2024 16:48:24 +0100 Subject: [PATCH 03/21] Refactored java code in order to make it more readable. --- .../org/vivoweb/harvester/util/FileAide.java | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/vivoweb/harvester/util/FileAide.java b/src/main/java/org/vivoweb/harvester/util/FileAide.java index 095983652..dd23c73c8 100644 --- a/src/main/java/org/vivoweb/harvester/util/FileAide.java +++ b/src/main/java/org/vivoweb/harvester/util/FileAide.java @@ -19,6 +19,7 @@ import org.apache.commons.vfs.AllFileSelector; import org.apache.commons.vfs.FileContent; import org.apache.commons.vfs.FileObject; +import org.apache.commons.vfs.FileSystemException; import org.apache.commons.vfs.FileType; import org.apache.commons.vfs.Selectors; import org.apache.commons.vfs.VFS; @@ -305,15 +306,17 @@ public static Set getNonHiddenChildren(String path) throws IOException { // Process sub-directories for (FileObject file : getFileObject(path).findFiles(Selectors.SELECT_CHILDREN)) { - if (!file.isHidden() && file.getType() == FileType.FOLDER) { - // Move all files in children directories to the parent folder - for (FileObject childFile : file.findFiles(Selectors.SELECT_CHILDREN)) { - if (!childFile.isHidden() && childFile.getType() == FileType.FILE) { - FileObject parentFolder = file.getParent(); - if (parentFolder != null) { - FileObject targetFile = parentFolder.resolveFile(childFile.getName().getBaseName()); - childFile.moveTo(targetFile); - } + if (file.isHidden() || file.getType() != FileType.FOLDER) { + continue; + } + // Move all files in children directories to the parent folder + for (FileObject childFile : file.findFiles(Selectors.SELECT_CHILDREN)) { + if 
(isValidFile(childFile)) { + FileObject parentFolder = file.getParent(); + if (parentFolder != null) { + FileObject targetFile = + parentFolder.resolveFile(childFile.getName().getBaseName()); + childFile.moveTo(targetFile); } } } @@ -321,7 +324,7 @@ public static Set getNonHiddenChildren(String path) throws IOException { // Process files directly under the given path for (FileObject file : getFileObject(path).findFiles(Selectors.SELECT_CHILDREN)) { - if (!file.isHidden() && file.getType() == FileType.FILE) { + if (isValidFile(file)) { allFileListing.add(file.getName().getBaseName()); } } @@ -329,6 +332,9 @@ public static Set getNonHiddenChildren(String path) throws IOException { return allFileListing; } + private static boolean isValidFile(FileObject file) throws FileSystemException { + return !file.isHidden() && file.getType() == FileType.FILE; + } /** * Get an inputstream from the first file under the given path with a matching fileName From 0c3490d8c6bc354bacb304f1163ffc8f8d43878b Mon Sep 17 00:00:00 2001 From: Ivan Mrsulja Date: Fri, 13 Dec 2024 13:07:43 +0100 Subject: [PATCH 04/21] Added the possibility of loading through sparql api. Added support for all default dspace document types. Added support for TTL, N3 and RDF/XML format in sparqlapi loader. 
--- .../dspace-oaifetch.conf.xml | 2 +- .../oaifetch-dc.datamap.xsl | 42 ++++++- ...n-oaifetch.bat => run-dspace-oaifetch.bat} | 34 ++++-- ...run-oaifetch.sh => run-dspace-oaifetch.sh} | 26 +++-- .../sparqlupdate.conf.xml | 19 +++ .../example-oaifetch-dspace/usage.txt | 6 +- .../harvester/services/SparqlUpdate.java | 108 +++++++++--------- 7 files changed, 164 insertions(+), 73 deletions(-) rename example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/{run-oaifetch.bat => run-dspace-oaifetch.bat} (81%) rename example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/{run-oaifetch.sh => run-dspace-oaifetch.sh} (91%) create mode 100644 example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/sparqlupdate.conf.xml diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml index f4bae937d..5859d4251 100644 --- a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml @@ -6,7 +6,7 @@ --> -https://demo.dspace.org/server/oai/request +http://ir.cut.ac.za/oai/request - + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.bat b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-dspace-oaifetch.bat similarity index 81% rename from example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.bat rename to 
example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-dspace-oaifetch.bat index 2370cc231..e51aad4ee 100644 --- a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.bat +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-dspace-oaifetch.bat @@ -8,6 +8,20 @@ IF exist logs ( rmdir /s /q logs ) +set LOAD_METHOD=sparql + +if not "%1"=="" ( + if "%1"=="tdb" ( + set LOAD_METHOD=tdb + ) else if "%1"=="sparql" ( + set LOAD_METHOD=sparql + ) else ( + echo Invalid argument: %1 + echo Usage: %~nx0 [tdb|sparql] + exit /b 1 + ) +) + REM set to the directory where the harvester was installed or unpacked REM HARVESTER_INSTALL_DIR is set to the location of the installed harvester REM If the deb file was used to install the harvester then the @@ -80,14 +94,18 @@ echo Apply Additions to Previous model REM Now that the changes have been applied to the previous harvest and the harvested data in vivo REM agree with the previous harvest, the changes are now applied to the vivo model. 
-REM Apply Subtractions to VIVO model -echo Apply Subtractions to VIVO model -@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.transfer.Transfer -w INFO -o vivo.model.xml -r data/vivo-subtractions.rdf.xml -m - if %errorlevel% neq 0 exit /b %errorlevel% +if "%LOAD_METHOD%"=="tdb" ( + REM Apply Subtractions to VIVO model + echo Apply Subtractions to VIVO model + @java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.transfer.Transfer -w INFO -o vivo.model.xml -r data/vivo-subtractions.rdf.xml -m + if %errorlevel% neq 0 exit /b %errorlevel% -REM Apply Additions to VIVO model -echo Apply Additions to VIVO model -@java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.transfer.Transfer -w INFO -o vivo.model.xml -r data/vivo-additions.rdf.xml - if %errorlevel% neq 0 exit /b %errorlevel% + REM Apply Additions to VIVO model + echo Apply Additions to VIVO model + @java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.transfer.Transfer -w INFO -o vivo.model.xml -r data/vivo-additions.rdf.xml + if %errorlevel% neq 0 exit /b %errorlevel% +) else ( + @java %HARVESTER_JAVA_OPTS% -cp %CLASSPATH% org.vivoweb.harvester.services.SparqlUpdate -X sparqlupdate.conf.xml +) echo Harvest completed successfully \ No newline at end of file diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.sh b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-dspace-oaifetch.sh similarity index 91% rename from example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.sh rename to example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-dspace-oaifetch.sh index ba50b154f..d5a9fced7 100755 --- a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-oaifetch.sh +++ 
b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/run-dspace-oaifetch.sh @@ -1,5 +1,11 @@ #!/bin/bash +if [[ "$1" != "" && "$1" != "tdb" && "$1" != "sparql" ]]; then + echo "Invalid argument: $1" + echo "Usage: $0 [tdb|sparql]" + exit 1 +fi + #Copyright (c) 2010-2011 VIVO Harvester Team. For full list of contributors, please see the AUTHORS file provided. #All rights reserved. #This program and the accompanying materials are made available under the terms of the new BSD license which accompanies this distribution, and is available at http://www.opensource.org/licenses/bsd-license.html @@ -60,7 +66,7 @@ java $HARVESTER_JAVA_OPTS org.vivoweb.harvester.fetch.OAIFetch -X dspace-oaifetc # Execute Translate # This is the part of the script where the input data is transformed into valid RDF -# Translate will apply an xslt file to the fetched data which will result in the data +# Translate will apply an xslt file to the fetched data which will result in the data # becoming valid RDF in the VIVO ontology harvester-xsltranslator -X xsltranslator.config.xml @@ -71,7 +77,7 @@ harvester-xsltranslator -X xsltranslator.config.xml # For this call on the transfer tool: # -s refers to the source translated records file, which was just produced by the translator step # -o refers to the destination model for harvested data -# -d means that this call will also produce a text dump file in the specified location +# -d means that this call will also produce a text dump file in the specified location harvester-transfer -w INFO -s translated-records.config.xml -o harvested-data.model.xml -d data/harvested-data/imported-records.rdf.xml #Score on publications @@ -81,7 +87,7 @@ harvester-transfer -w INFO -s translated-records.config.xml -o harvested-data.mo #Match publications # Use the score values from the previous score to rename publications which we deem are the -# same, so that they match the URI of the publication in VIVO. 
+# same, so that they match the URI of the publication in VIVO. #harvester-match -X match-pub.conf.xml #Score on authors, organizations, geographic locations, journals, hyperlinks, and @@ -145,10 +151,16 @@ harvester-transfer -w info -o previous-harvest.model.xml -r data/vivo-additions. # Now that the changes have been applied to the previous harvest and the harvested data in vivo # agree with the previous harvest, the changes are now applied to the vivo model. -# Apply Subtractions to VIVO model -harvester-transfer -w info -o vivo.model.xml -r data/vivo-subtractions.rdf.xml -m -# Apply Additions to VIVO model -harvester-transfer -w info -o vivo.model.xml -r data/vivo-additions.rdf.xml +if [[ "$1" == "tdb" ]]; then + # Apply Subtractions to VIVO model + harvester-transfer -w info -o vivo.model.xml -r data/vivo-subtractions.rdf.xml -m + # Apply Additions to VIVO model + harvester-transfer -w info -o vivo.model.xml -r data/vivo-additions.rdf.xml +fi + +if [[ "$1" == "" || "$1" == "sparql" ]]; then + java $HARVESTER_JAVA_OPTS org.vivoweb.harvester.services.SparqlUpdate -X sparqlupdate.conf.xml +fi #Output some counts PUBS=`cat data/vivo-additions.rdf.xml | grep oai | wc -l` diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/sparqlupdate.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/sparqlupdate.conf.xml new file mode 100644 index 000000000..e18f9c5b6 --- /dev/null +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/sparqlupdate.conf.xml @@ -0,0 +1,19 @@ + + + + INFO + + data/vivo-additions.rdf.xml + add + + + http://localhost:8080/vivo/api/sparqlUpdate + http://vitro.mannlib.cornell.edu/default/vitro-kb-2 + vivo_root@mydomain.edu + admin123 + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/usage.txt 
b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/usage.txt index e804eebb8..046b709bf 100644 --- a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/usage.txt +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/usage.txt @@ -6,6 +6,6 @@ You will need to: - Examine the run-oaifetch.sh (or .bat) script and type the location of the VIVO Harvester installation directory. - You can uncomment the score and match functions if you want deduplication to be performed (it is turned off by default) - Modify the vivo.model.xml file to provide parameters for accessing your VIVO web application. -- Shut down your VIVO instance (in order to free the TDB lock) -- Run run-oaifetch.sh (or .bat) -- Restart your VIVO instance and reindex the search indexes +- Shut down your VIVO instance (if you are performing load using the TDB, in order to free the TDB lock) +- Run `run-oaifetch.sh [tdb/sparql]` (use .bat script if you are on WindowsOS. It is recommended that you perform the first harvest, especially for large datasets, using TDB, as SPARQL API may have trouble loading very large files. 
All subsequent re-harvests can be performed with SPARQL API without the need for restarting your VIVO instance) +- Restart your VIVO instance (in case you loaded data using the TDB) and reindex the search indexes diff --git a/src/main/java/org/vivoweb/harvester/services/SparqlUpdate.java b/src/main/java/org/vivoweb/harvester/services/SparqlUpdate.java index 3edeabaa0..5cb104a88 100644 --- a/src/main/java/org/vivoweb/harvester/services/SparqlUpdate.java +++ b/src/main/java/org/vivoweb/harvester/services/SparqlUpdate.java @@ -7,35 +7,29 @@ import java.io.ByteArrayOutputStream; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.file.Files; import java.util.ArrayList; import java.util.List; -import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; -import org.apache.http.Header; import org.apache.http.HttpEntity; -import org.apache.http.HttpHeaders; import org.apache.http.NameValuePair; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpPost; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; -import org.apache.http.message.BasicHeader; import org.apache.http.message.BasicNameValuePair; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.ModelFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.vivoweb.harvester.util.FileAide; import org.vivoweb.harvester.util.InitLog; import org.vivoweb.harvester.util.args.ArgDef; import org.vivoweb.harvester.util.args.ArgList; import org.vivoweb.harvester.util.args.ArgParser; import org.vivoweb.harvester.util.args.UsageException; -import org.vivoweb.harvester.util.repo.JenaConnect; -import org.vivoweb.harvester.util.repo.MemJenaConnect; -import org.vivoweb.harvester.util.repo.RecordHandler; /** * Execute Sparql update in Jena 
model in an instance of VIVO @@ -157,50 +151,62 @@ private SparqlUpdate(ArgList argList) throws IOException { * @throws IOException error */ private void execute() throws IOException { - StringBuffer updateBuffer = new StringBuffer(); - if (this.type.equals("add")) { - updateBuffer.append("INSERT DATA {"); - } else { - updateBuffer.append("DELETE DATA {"); - } - updateBuffer.append("GRAPH <"+ this.model + "> {"); + StringBuffer updateBuffer = new StringBuffer(); + if (this.type.equals("add")) { + updateBuffer.append("INSERT DATA {"); + } else { + updateBuffer.append("DELETE DATA {"); + } + updateBuffer.append("GRAPH <").append(this.model).append("> {"); - //String rdfString = FileAide.getTextContent(this.inRDF); - String rdfString = FileUtils.readFileToString(new File(this.inRDF), "UTF-8"); - updateBuffer.append(rdfString); - updateBuffer.append(" }"); - updateBuffer.append("}"); - System.out.println(updateBuffer.toString()); - - CloseableHttpClient httpclient = HttpClients.createDefault(); - try { - HttpPost httpPost = new HttpPost(this.url); - - List nvps = new ArrayList (); - nvps.add(new BasicNameValuePair("email", this.username)); - nvps.add(new BasicNameValuePair("password", this.password)); - nvps.add(new BasicNameValuePair("update", updateBuffer.toString())); - httpPost.setEntity(new UrlEncodedFormEntity(nvps)); - CloseableHttpResponse response = httpclient.execute(httpPost); - try { - System.out.println(response.getStatusLine()); - HttpEntity entity = response.getEntity(); - InputStream is = entity.getContent(); - try { - IOUtils.copy(is, System.out); - } finally { - is.close(); - } - - } finally { - response.close(); - } - - } finally { - httpclient.close(); - } + //String rdfString = FileAide.getTextContent(this.inRDF); + Model model = ModelFactory.createDefaultModel(); + deduceRdfFormatAndParseData(model, new File(this.inRDF)); + + try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { + model.write(out, "N-TRIPLES"); + 
updateBuffer.append(out.toString("UTF-8")); + } + updateBuffer.append(" }"); + updateBuffer.append("}"); + System.out.println(updateBuffer); + + try (CloseableHttpClient httpclient = HttpClients.createDefault()) { + HttpPost httpPost = new HttpPost(this.url); + + List nvps = new ArrayList<>(); + nvps.add(new BasicNameValuePair("email", this.username)); + nvps.add(new BasicNameValuePair("password", this.password)); + nvps.add(new BasicNameValuePair("update", updateBuffer.toString())); + httpPost.setEntity(new UrlEncodedFormEntity(nvps)); + try (CloseableHttpResponse response = httpclient.execute(httpPost)) { + System.out.println(response.getStatusLine()); + HttpEntity entity = response.getEntity(); + try (InputStream is = entity.getContent()) { + IOUtils.copy(is, System.out); + } + + } + + } } - + + private void deduceRdfFormatAndParseData(Model model, File rdfFile) throws IOException { + String[] supportedFormats = new String[]{"RDF/XML", "TURTLE", "N3"}; + for (String format : supportedFormats) { + try (InputStream in = Files.newInputStream(rdfFile.toPath())) { + try { + model.read(in, null, format); + return; + } catch (Exception e) { + // pass + } + } + } + + throw new IOException("Unable to deduce RDF format for file: " + rdfFile.getName()); + } + /** * Get the ArgParser for this task * @return the ArgParser From b00e51f8533252b0f09d7f1c82c0e1ceee091899 Mon Sep 17 00:00:00 2001 From: Ivan Mrsulja Date: Tue, 17 Dec 2024 16:57:10 +0100 Subject: [PATCH 05/21] Fixed ID sanitization bug. 
--- .../example-oaifetch-dspace/dspace-oaifetch.conf.xml | 2 +- .../harvester/util/repo/TextFileRecordHandler.java | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml index 5859d4251..f4bae937d 100644 --- a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml @@ -6,7 +6,7 @@ --> -http://ir.cut.ac.za/oai/request +https://demo.dspace.org/server/oai/request -https://demo.dspace.org/server/oai/request +https://dspace.mit.edu/oai/request -2020-10-01T00:00:00Z +2024-12-29T00:00:00Z 2024-12-31T00:00:00Z oai_dc raw-records.config.xml diff --git a/src/main/java/org/vivoweb/harvester/util/repo/JenaConnect.java b/src/main/java/org/vivoweb/harvester/util/repo/JenaConnect.java index 37770d13b..dc6d34a4b 100644 --- a/src/main/java/org/vivoweb/harvester/util/repo/JenaConnect.java +++ b/src/main/java/org/vivoweb/harvester/util/repo/JenaConnect.java @@ -18,17 +18,6 @@ import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.vivoweb.harvester.util.InitLog; -import org.vivoweb.harvester.util.FileAide; -import org.vivoweb.harvester.util.args.ArgDef; -import org.vivoweb.harvester.util.args.ArgList; -import org.vivoweb.harvester.util.args.ArgParser; -import org.vivoweb.harvester.util.args.UsageException; -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; import org.apache.jena.graph.GraphEvents; import org.apache.jena.query.Dataset; import 
org.apache.jena.query.Query; @@ -39,8 +28,6 @@ import org.apache.jena.query.ResultSet; import org.apache.jena.query.ResultSetFactory; import org.apache.jena.query.ResultSetFormatter; -import org.apache.jena.sparql.resultset.ResultsFormat; -//import org.apache.jena.sparql.resultset.ResultSetFormat; import org.apache.jena.query.Syntax; import org.apache.jena.rdf.model.Model; import org.apache.jena.rdf.model.Property; @@ -48,9 +35,20 @@ import org.apache.jena.rdf.model.RDFWriter; import org.apache.jena.rdf.model.Resource; import org.apache.jena.shared.Lock; - -import org.apache.jena.update.UpdateAction; +import org.apache.jena.sparql.resultset.ResultsFormat; +import org.apache.jena.update.UpdateAction; import org.apache.jena.update.UpdateFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.vivoweb.harvester.util.FileAide; +import org.vivoweb.harvester.util.InitLog; +import org.vivoweb.harvester.util.args.ArgDef; +import org.vivoweb.harvester.util.args.ArgList; +import org.vivoweb.harvester.util.args.ArgParser; +import org.vivoweb.harvester.util.args.UsageException; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; /** * Connection Helper for Jena Models @@ -425,8 +423,14 @@ public int loadRdfFromRH(RecordHandler rh, String namespace, String language) { if (namespace != null) { log.trace("using namespace '"+namespace+"'"); } - ByteArrayInputStream bais = new ByteArrayInputStream(r.getData().getBytes( - StandardCharsets.UTF_8)); + + String charReferenceRegex = "(?<=^|[^&])(&#(?:[0-9]+|x[0-9a-fA-F]+);)"; + String fullyEscapedContent = r.getData() + .replaceAll(charReferenceRegex, "&$1"); + + ByteArrayInputStream bais = new ByteArrayInputStream( + fullyEscapedContent.getBytes(StandardCharsets.UTF_8) + ); getJenaModel().read(bais, namespace, language); try { bais.close(); From e4445b556ac38fecf952f88fd457d67efb196299 Mon Sep 17 00:00:00 2001 From: Ivan Mrsulja Date: Fri, 3 Jan 2025 
15:19:07 +0100 Subject: [PATCH 17/21] Refined fix for unescaped XML encoded entities. --- .../example-oaifetch-dspace/dspace-oaifetch.conf.xml | 4 ++-- src/main/java/org/vivoweb/harvester/fetch/OAIFetch.java | 4 +++- .../java/org/vivoweb/harvester/util/repo/JenaConnect.java | 6 +----- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml index 71d5404bc..ae2fcbb2d 100644 --- a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml @@ -12,8 +12,8 @@ aei.pitt.edu/cgi/oai2 www4.fao.org:8080/oaicat/OAIHandler --> -2024-12-29T00:00:00Z -2024-12-31T00:00:00Z +2023-05-15T00:00:00Z +2023-05-17T00:00:00Z oai_dc raw-records.config.xml INFO diff --git a/src/main/java/org/vivoweb/harvester/fetch/OAIFetch.java b/src/main/java/org/vivoweb/harvester/fetch/OAIFetch.java index 924aaba62..5e1f11ff9 100644 --- a/src/main/java/org/vivoweb/harvester/fetch/OAIFetch.java +++ b/src/main/java/org/vivoweb/harvester/fetch/OAIFetch.java @@ -193,7 +193,9 @@ public void execute() throws IOException { if (! 
StringUtils.equalsIgnoreCase(strArray[1], "deleted")) { log.trace("Adding record: " + strArray[0]); - this.rhOutput.addRecord(strArray[0], strArray[1], this.getClass()); + String charReferenceRegex = "(?<=^|[^&])(&#(?:[0-9]+|x[0-9a-fA-F]+);)"; + String fullyEscapedData = strArray[1].replaceAll(charReferenceRegex, "&$1").replace("&&#", "&#"); + this.rhOutput.addRecord(strArray[0], fullyEscapedData, this.getClass()); } } diff --git a/src/main/java/org/vivoweb/harvester/util/repo/JenaConnect.java b/src/main/java/org/vivoweb/harvester/util/repo/JenaConnect.java index dc6d34a4b..13c728698 100644 --- a/src/main/java/org/vivoweb/harvester/util/repo/JenaConnect.java +++ b/src/main/java/org/vivoweb/harvester/util/repo/JenaConnect.java @@ -424,12 +424,8 @@ public int loadRdfFromRH(RecordHandler rh, String namespace, String language) { log.trace("using namespace '"+namespace+"'"); } - String charReferenceRegex = "(?<=^|[^&])(&#(?:[0-9]+|x[0-9a-fA-F]+);)"; - String fullyEscapedContent = r.getData() - .replaceAll(charReferenceRegex, "&$1"); - ByteArrayInputStream bais = new ByteArrayInputStream( - fullyEscapedContent.getBytes(StandardCharsets.UTF_8) + r.getData().getBytes(StandardCharsets.UTF_8) ); getJenaModel().read(bais, namespace, language); try { From c6088d059e29ae74fdc915f5a7dae222a4ab14e3 Mon Sep 17 00:00:00 2001 From: Ivan Mrsulja Date: Wed, 8 Jan 2025 10:00:42 +0100 Subject: [PATCH 18/21] Fixed SPARQL update encoding problems. 
--- .../org/vivoweb/harvester/services/SparqlUpdate.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/vivoweb/harvester/services/SparqlUpdate.java b/src/main/java/org/vivoweb/harvester/services/SparqlUpdate.java index 5cb104a88..204d46f88 100644 --- a/src/main/java/org/vivoweb/harvester/services/SparqlUpdate.java +++ b/src/main/java/org/vivoweb/harvester/services/SparqlUpdate.java @@ -9,6 +9,8 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.util.ArrayList; import java.util.List; @@ -169,7 +171,6 @@ private void execute() throws IOException { } updateBuffer.append(" }"); updateBuffer.append("}"); - System.out.println(updateBuffer); try (CloseableHttpClient httpclient = HttpClients.createDefault()) { HttpPost httpPost = new HttpPost(this.url); @@ -178,25 +179,24 @@ private void execute() throws IOException { nvps.add(new BasicNameValuePair("email", this.username)); nvps.add(new BasicNameValuePair("password", this.password)); nvps.add(new BasicNameValuePair("update", updateBuffer.toString())); - httpPost.setEntity(new UrlEncodedFormEntity(nvps)); + httpPost.setEntity(new UrlEncodedFormEntity(nvps, "UTF-8")); try (CloseableHttpResponse response = httpclient.execute(httpPost)) { System.out.println(response.getStatusLine()); HttpEntity entity = response.getEntity(); try (InputStream is = entity.getContent()) { IOUtils.copy(is, System.out); } - } - } } private void deduceRdfFormatAndParseData(Model model, File rdfFile) throws IOException { String[] supportedFormats = new String[]{"RDF/XML", "TURTLE", "N3"}; for (String format : supportedFormats) { - try (InputStream in = Files.newInputStream(rdfFile.toPath())) { + try (InputStream in = Files.newInputStream(rdfFile.toPath()); + InputStreamReader reader = new InputStreamReader(in, StandardCharsets.UTF_8)) { try { - 
model.read(in, null, format); + model.read(reader, null, format); return; } catch (Exception e) { // pass From 0bfec998f3b63aa4ecddcddfb847c05d1f43d7d8 Mon Sep 17 00:00:00 2001 From: Ivan Mrsulja Date: Wed, 8 Jan 2025 14:59:53 +0100 Subject: [PATCH 19/21] Aligned JENA version to the one used in current VIVO. Fixed RDF authorship and type bugs when performing TDB import. Fixed SPARQL update encoding issue. --- .../dspace-oaifetch.conf.xml | 1 + .../oaifetch-dc.datamap.xsl | 47 ++++++++++++++++++- .../example-oaifetch-dspace/usage.txt | 13 ++++- pom.xml | 11 +++-- .../harvester/qualify/RenameResources.java | 26 +++++----- 5 files changed, 76 insertions(+), 22 deletions(-) diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml index ae2fcbb2d..411d264f7 100644 --- a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/dspace-oaifetch.conf.xml @@ -15,6 +15,7 @@ 2023-05-15T00:00:00Z 2023-05-17T00:00:00Z oai_dc + raw-records.config.xml INFO diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch-dc.datamap.xsl b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch-dc.datamap.xsl index 5cbe2593d..d4586fd9c 100644 --- a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch-dc.datamap.xsl +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/oaifetch-dc.datamap.xsl @@ -41,6 +41,12 @@ + + + + + + @@ -94,50 +100,62 @@ + + + + + + + + + + + + @@ -151,8 +169,18 @@ + + + + + + + + + + @@ -213,7 +241,7 @@ Authorship for - + @@ -224,8 +252,16 @@ + + + + + + + + 
@@ -275,8 +311,17 @@ + + + + + + + + + diff --git a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/usage.txt b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/usage.txt index dd621e3ef..d0be69719 100644 --- a/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/usage.txt +++ b/example-scripts/bash-scripts/full-harvest-examples/1.15-examples/example-oaifetch-dspace/usage.txt @@ -3,9 +3,20 @@ using DublinCore metadata format and loading it into a VIVO instance. You will need to: +- Modify the dspace-oaifetch.conf.xml configuration file to specify harvest endpoint, time period, set, etc. - Examine the run-oaifetch.sh (or .bat) script and type the location of the VIVO Harvester installation directory. - You can uncomment the score and match functions if you want deduplication to be performed (it is turned off by default) -- Modify the vivo.model.xml file to provide parameters for accessing your VIVO web application. +- Modify the vivo.model.xml file to provide parameters for accessing your VIVO web application and tdbContentModels directory. - Shut down your VIVO instance (if you are performing load using the TDB, in order to free the TDB lock) - Run `run-dspace-oaifetch.sh [tdb/sparql]` (use .bat script if you are on WindowsOS. It is recommended that you perform the first harvest, especially for large datasets, using TDB, as SPARQL API may have trouble loading very large files. All subsequent re-harvests can be performed with SPARQL API without the need for restarting your VIVO instance) - Restart your VIVO instance (in case you loaded data using the TDB) and reindex the search indexes + +Note: When performing TDB import, if it's your first time running VIVO, it is mandatory that you first start up VIVO and wait for +the default entities (ABAC, locations, initial concepts...) to initialize. 
Only after first initialization is it safe to perform TDB import. +Failure to do this may result in losing all imported data when the application is restarted. + +Note: If you are running an older VIVO version, you should navigate to pom.xml and update jena.version to that of your VIVO instance. That way, you will +avoid any compatibility issues when performing TDB import. Failure to do this may result in a simple "Error: null" message when trying to transfer your local TDB content models +to the actual tdbContentModels directory. + +Note: It is recommended that when you are performing your initial harvest, you use TDB import. Every other time SPARQL API import should be your go-to method. diff --git a/pom.xml b/pom.xml index 4890197ee..42411e705 100644 --- a/pom.xml +++ b/pom.xml @@ -28,6 +28,7 @@ UTF-8 + 3.16.0 @@ -422,7 +423,7 @@ org.apache.jena jena-core pom - 3.4.0 + ${jena.version} @@ -439,7 +440,7 @@ org.apache.jena jena-arq - 3.4.0 + ${jena.version} slf4j-log4j12 @@ -454,7 +455,7 @@ org.apache.jena jena-iri - 3.4.0 + ${jena.version} slf4j-log4j12 @@ -470,7 +471,7 @@ org.apache.jena jena-sdb - 3.4.0 + ${jena.version} @@ -487,7 +488,7 @@ org.apache.jena jena-tdb - 3.4.0 + ${jena.version} slf4j-api diff --git a/src/main/java/org/vivoweb/harvester/qualify/RenameResources.java b/src/main/java/org/vivoweb/harvester/qualify/RenameResources.java index 37666f03f..c2007f573 100644 --- a/src/main/java/org/vivoweb/harvester/qualify/RenameResources.java +++ b/src/main/java/org/vivoweb/harvester/qualify/RenameResources.java @@ -8,6 +8,14 @@ import java.io.IOException; import java.util.HashSet; import java.util.Set; +import org.apache.jena.graph.Graph; +import org.apache.jena.graph.GraphUtil; +import org.apache.jena.graph.Node; +import org.apache.jena.graph.Triple; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.Resource; +import org.apache.jena.reasoner.InfGraph; +import org.apache.jena.util.iterator.ExtendedIterator; import org.slf4j.Logger; 
import org.slf4j.LoggerFactory; import org.vivoweb.harvester.util.InitLog; @@ -18,17 +26,6 @@ import org.vivoweb.harvester.util.args.UsageException; import org.vivoweb.harvester.util.repo.JenaConnect; -//import org.apache.jena.graph.BulkUpdateHandler; -import org.apache.jena.graph.Graph; -import org.apache.jena.graph.GraphUtil; -import org.apache.jena.graph.Node; -import org.apache.jena.graph.Triple; -import org.apache.jena.rdf.model.Model; -import org.apache.jena.rdf.model.Resource; -import org.apache.jena.reasoner.InfGraph; -import org.apache.jena.util.iterator.ExtendedIterator; -import org.apache.jena.util.iterator.Filter; - /** * Changes the namespace for all matching uris * @author Christopher Haines (hainesc@ctrip.ufl.edu) @@ -142,16 +139,15 @@ public static Resource renameResource(final Resource old, final String uri) { // combine these iterators ExtendedIterator combinedTriples = subjectTriples.andThen(objectTriples); // Filter reflexive triples, which are found twice in each find method, thus, we need to make sure to keep only one. - ExtendedIterator filteredTriples = combinedTriples.filterKeep(new Filter() { - @Override - public boolean accept(final Triple o) { + ExtendedIterator filteredTriples = combinedTriples.filterKeep( + (final Triple o) -> { if(o.getSubject().equals(o.getObject())) { reflexiveTriples.add(o); return false; } return true; } - }); + ); // create a new resource node to replace old final Resource newRes = model.createResource(uri); From 6a6538994c0f0709211a8047808e159b2ddb007e Mon Sep 17 00:00:00 2001 From: Ivan Mrsulja Date: Fri, 10 Jan 2025 09:58:32 +0100 Subject: [PATCH 20/21] Switched to using final compiled regex patterns for slight performance boost. 
--- .../org/vivoweb/harvester/fetch/OAIFetch.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/vivoweb/harvester/fetch/OAIFetch.java b/src/main/java/org/vivoweb/harvester/fetch/OAIFetch.java index 5e1f11ff9..2e996d3ff 100644 --- a/src/main/java/org/vivoweb/harvester/fetch/OAIFetch.java +++ b/src/main/java/org/vivoweb/harvester/fetch/OAIFetch.java @@ -9,11 +9,10 @@ import java.net.HttpURLConnection; import java.net.URL; import java.text.ParseException; -import java.text.ParsePosition; import java.text.SimpleDateFormat; import java.util.Date; -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.transform.TransformerException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import org.dlese.dpc.oai.harvester.HarvestMessageHandler; import org.dlese.dpc.oai.harvester.Harvester; @@ -30,8 +29,6 @@ import org.vivoweb.harvester.util.repo.RecordHandler; import org.vivoweb.harvester.util.repo.RecordStreamOrigin; import org.vivoweb.harvester.util.repo.XMLRecordOutputStream; -import org.xml.sax.SAXException; -import ORG.oclc.oai.harvester2.app.RawWrite; /** * Class for harvesting from OAI Data Sources @@ -79,7 +76,10 @@ public class OAIFetch implements RecordStreamOrigin { * the base for each instance's xmlRos */ private static XMLRecordOutputStream xmlRosBase = new XMLRecordOutputStream(new String[]{"record"}, "", "", ".*?(.*?).*?", null); - + + private static final Pattern CHAR_REFERENCE_PATTERN = + Pattern.compile("(?<=^|[^&])(&#(?:[0-9]+|x[0-9a-fA-F]+);)"); + /** * Constuctor * @param address The website address of the repository, without http:// @@ -193,8 +193,8 @@ public void execute() throws IOException { if (! 
StringUtils.equalsIgnoreCase(strArray[1], "deleted")) { log.trace("Adding record: " + strArray[0]); - String charReferenceRegex = "(?<=^|[^&])(&#(?:[0-9]+|x[0-9a-fA-F]+);)"; - String fullyEscapedData = strArray[1].replaceAll(charReferenceRegex, "&$1").replace("&&#", "&#"); + Matcher matcher = CHAR_REFERENCE_PATTERN.matcher(strArray[1]); + String fullyEscapedData = matcher.replaceAll("&$1").replace("&&#", "&#"); this.rhOutput.addRecord(strArray[0], fullyEscapedData, this.getClass()); } } From ffbfc717ba8de0847e07390c30d83dda495e9aa0 Mon Sep 17 00:00:00 2001 From: Ivan Mrsulja Date: Fri, 10 Jan 2025 09:59:30 +0100 Subject: [PATCH 21/21] Added javadocs. --- src/main/java/org/vivoweb/harvester/fetch/OAIFetch.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/main/java/org/vivoweb/harvester/fetch/OAIFetch.java b/src/main/java/org/vivoweb/harvester/fetch/OAIFetch.java index 2e996d3ff..b206bdd31 100644 --- a/src/main/java/org/vivoweb/harvester/fetch/OAIFetch.java +++ b/src/main/java/org/vivoweb/harvester/fetch/OAIFetch.java @@ -77,6 +77,11 @@ public class OAIFetch implements RecordStreamOrigin { */ private static XMLRecordOutputStream xmlRosBase = new XMLRecordOutputStream(new String[]{"record"}, "", "", ".*?(.*?).*?", null); + /** + * Pattern to match character references in a string that are not already escaped. + * This ensures that numeric or hexadecimal character references (e.g., &#123; or &#x7B;) + * are detected when not preceded by an ampersand (&) to avoid double-escaping. + */ private static final Pattern CHAR_REFERENCE_PATTERN = Pattern.compile("(?<=^|[^&])(&#(?:[0-9]+|x[0-9a-fA-F]+);)");