Skip to content

Commit

Permalink
Optimized batch_import_marc (#2995)
Browse files Browse the repository at this point in the history
- Allows multiple files to be passed to import-marc.sh (Linux + Windows)
- Utilizes multiple file loading to improve performance of batch-import-marc.sh (Linux only)

---------

Co-authored-by: Demian Katz <[email protected]>
  • Loading branch information
damien-git and demiankatz authored Jul 21, 2023
1 parent e8f95e0 commit a2b1e19
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 22 deletions.
5 changes: 5 additions & 0 deletions harvest/batch-import-marc.bat
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ if "%1"=="-d" goto dswitch
if "%1"=="-h" goto helpmessage
if "%1"=="-m" goto mswitch
if "%1"=="-p" goto pswitch
if "%1"=="-x" goto xswitch
if "%1"=="-z" goto zswitch
goto switchloopend
:dswitch
Expand All @@ -59,6 +60,10 @@ set PROPERTIES_FILE=%2
shift
shift
goto switchloop
:xswitch
rem The -x (batch size) switch is recognised only so that it can be rejected
rem with an explanatory message; the script then jumps to :end without importing.
echo The -x switch is not currently supported under Windows.
echo See https://vufind.org/jira/browse/VUFIND-1626 for more details.
goto end
:zswitch
set LOGGING=0
shift
Expand Down
42 changes: 34 additions & 8 deletions harvest/batch-import-marc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ then
HARVEST_DIR="$VUFIND_HOME/harvest"
fi

# Default batch size: keep a non-empty MAX_BATCH_COUNT inherited from the
# environment, otherwise (unset or empty) fall back to 10.
: "${MAX_BATCH_COUNT:=10}"

BASEPATH_UNDER_HARVEST=true
LOGGING=true
MOVE_DATA=true
Expand All @@ -41,18 +46,20 @@ Options:
-h: Print this message
-m: Do not move the data files after importing.
-p: Used specified SolrMarc configuration properties file
-x: Maximum number of files to send in batches to import-marc.sh (default is MAX_BATCH_COUNT or 10)
-z: No logging.
EOF
}

# Option string: the leading ':' selects silent error reporting (a missing
# argument is reported through the ':' case below); -p and -x each REQUIRE an
# argument, so each must be followed by its own colon.
while getopts ":dhmp:x:z" OPT
do
case $OPT in
d) BASEPATH_UNDER_HARVEST=false;;
h) usage;
exit 0;;
m) MOVE_DATA=false;;
p) PROPERTIES_FILE="$OPTARG"; export PROPERTIES_FILE;;
x) MAX_BATCH_COUNT="$OPTARG";;
z) LOGGING=false;;
:)
echo "argument to '-$OPTARG' is missing" >&2
Expand All @@ -70,6 +77,13 @@ then
exit 1
fi

# Check MAX_BATCH_COUNT is a positive integer. The value may come from the
# environment or from -x, so it is arbitrary text at this point; the pattern
# rejects the empty string, zero, signs and leading zeros.
if ! [[ "$MAX_BATCH_COUNT" =~ ^[1-9][0-9]*$ ]]
then
  # Diagnostics go to stderr, like the other option errors in this script.
  echo "MAX_BATCH_COUNT (option -x) is not a positive integer: \"$MAX_BATCH_COUNT\"" >&2
  exit 1
fi

# Set up BASEPATH and check if the path is valid:
if [ $BASEPATH_UNDER_HARVEST == false ]
then
Expand Down Expand Up @@ -101,28 +115,40 @@ fi

# The log() function can be redefined to suit a variety of logging needs
# Positional parameters must be consistent:
# $1 = name of the file being imported
# $1 = name of the first file being imported
if [ "$LOGGING" == false ]
then
  # Logging disabled: drain stdin so the writer never blocks, keep nothing.
  function log {
    cat - > /dev/null
  }
else
  # Copy stdin into a log file under $BASEPATH/log, replacing any previous
  # log of the same name.
  # Arguments: the file(s) being imported in this batch.
  #   one file:      log is named <basename of $1>.log
  #   several files: log is named <basename of $1>_and_more.log and starts
  #                  with a header listing every file in the batch
  function log {
    local files="$*"
    local logfile
    if [ $# -eq 1 ]
    then
      logfile="$BASEPATH/log/$(basename "$1").log"
      # Truncate so the append below starts from an empty file.
      : > "$logfile"
    else
      logfile="$BASEPATH/log/$(basename "$1")_and_more.log"
      printf 'This log is for the following files: \n%s\n\n' "$files" > "$logfile"
    fi
    # -u: unbuffered, so the log can be followed while the import runs.
    cat -u - >> "$logfile"
  }
fi

# Process all the files in the target directory:
# Import one batch of files with a single import-marc.sh invocation and, when
# the import succeeds and MOVE_DATA is true, move every file of the batch into
# $BASEPATH/processed.
# Arguments: the paths of the files in the batch
import_batch() {
  local file
  # Logging output handled by log() function
  # PROPERTIES_FILE passed via environment
  if "$VUFIND_HOME/import-marc.sh" "$@" 2> >(log "$@") && [ "$MOVE_DATA" == true ]
  then
    for file in "$@"
    do
      mv -- "$file" "$BASEPATH/processed/${file##*/}"
    done
  fi
}

# Process all the files in the target directory, at most MAX_BATCH_COUNT per
# batch. Names are read NUL-delimited into an array so that whitespace or glob
# characters in a file name cannot split or merge batch entries.
batch=()
while IFS= read -r -d '' file
do
  batch+=("$file")
  if [ "${#batch[@]}" -ge "$MAX_BATCH_COUNT" ]
  then
    import_batch "${batch[@]}"
    batch=()
  fi
done < <(find "$BASEPATH" -maxdepth 1 \( -iname "*.xml" -o -iname "*.mrc" -o -iname "*.marc" \) -type f -print0)

# Import whatever is left in the final, partially filled batch.
if [ "${#batch[@]}" -gt 0 ]
then
  import_batch "${batch[@]}"
fi
16 changes: 12 additions & 4 deletions import-marc.bat
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ if "%1"=="-p" (
)

rem #####################################################
rem # Make sure we have the expected number of arguments
rem # Print usage when called with no arguments
rem #####################################################
if not "!%1!"=="!!" goto argfound
echo Usage: %THISFILE% [-p c:\path\to\import.properties] c:\path\to\marc.mrc
echo Usage: %THISFILE% [-p c:\path\to\import.properties] c:\path\to\marc.mrc ...
goto end
:argfound

Expand Down Expand Up @@ -115,11 +115,19 @@ rem # Set Command Options
rem ##################################################
for %%a in (%VUFIND_HOME%\import\solrmarc_core_*.jar) do set JAR_FILE=%%a

rem ##################################################
rem # Collect all filenames from command line
rem ##################################################
rem Start from an empty list so that a value of ALL_FILENAMES inherited from
rem the calling environment cannot leak into the import command line.
set ALL_FILENAMES=
rem Each pass appends the current first argument (preceded by a space) and
rem shifts; the loop ends when no arguments remain.
:collectfilenamesloop
set ALL_FILENAMES=%ALL_FILENAMES% %1
shift
if not "!%1!"=="!!" goto :collectfilenamesloop

rem #####################################################
rem # Execute Importer
rem #####################################################
set RUN_CMD=%JAVA% %INDEX_OPTIONS% -Duser.timezone=UTC -Dlog4j.configuration="file:///%LOG4J_CONFIG%" %EXTRA_SOLRMARC_SETTINGS% -jar %JAR_FILE% %PROPERTIES_FILE% -solrj %VUFIND_HOME%\solr\vendor\server\solr-webapp\webapp\WEB-INF\lib -lib_local %VUFIND_HOME%\import\lib_local;%VUFIND_HOME%\solr\vendor\modules\analysis-extras\lib %1
echo Now Importing %1 ...
set RUN_CMD=%JAVA% %INDEX_OPTIONS% -Duser.timezone=UTC -Dlog4j.configuration="file:///%LOG4J_CONFIG%" %EXTRA_SOLRMARC_SETTINGS% -jar %JAR_FILE% %PROPERTIES_FILE% -solrj %VUFIND_HOME%\solr\vendor\server\solr-webapp\webapp\WEB-INF\lib -lib_local %VUFIND_HOME%\import\lib_local;%VUFIND_HOME%\solr\vendor\modules\analysis-extras\lib%ALL_FILENAMES%
echo Now Importing%ALL_FILENAMES% ...
echo %RUN_CMD%
%RUN_CMD%
exit /b %ERRORLEVEL%
Expand Down
23 changes: 13 additions & 10 deletions import-marc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,13 @@ done
shift $(($OPTIND - 1))

#####################################################
# Make sure we have the expected number of arguments
# Print usage when called with no argument
#####################################################
E_BADARGS=65
EXPECTED_ARGS=1

if [ $# -ne $EXPECTED_ARGS ]
if [ $# -eq 0 ]
then
echo " Usage: `basename $0` [-p ./path/to/import.properties] ./path/to/marc.mrc"
echo " Usage: `basename $0` [-p ./path/to/import.properties] ./path/to/marc.mrc ..."
exit $E_BADARGS
fi

Expand Down Expand Up @@ -129,11 +128,15 @@ then
fi

#####################################################
# Normalize target file path to absolute path
# Normalize file paths to absolute paths
#####################################################
MARC_PATH=`dirname $1`
MARC_PATH=`cd $MARC_PATH && pwd`
MARC_FILE=`basename $1`
# Build NORMALIZED_PATHS: every file argument rewritten as
# <absolute directory>/<file name>, each entry preceded by a single space.
# Arguments are iterated as "$@" and quoted throughout so that an argument is
# never re-split or glob-expanded while it is being normalized.
# NOTE(review): the result is still one space-joined string, so a path that
# contains whitespace will be split again wherever it is expanded unquoted.
NORMALIZED_PATHS=""
for f in "$@"; do
  MARC_PATH=$(dirname "$f")
  MARC_PATH=$(cd "$MARC_PATH" && pwd)
  MARC_FILE=$(basename "$f")
  NORMALIZED_PATHS="${NORMALIZED_PATHS} $MARC_PATH/$MARC_FILE"
done

#####################################################
# Set up SolrJ symlinks for performance (searching
Expand All @@ -158,8 +161,8 @@ fi
# Execute Importer
#####################################################

RUN_CMD="$JAVA $INDEX_OPTIONS -Duser.timezone=UTC -Dlog4j.configuration=file://$LOG4J_CONFIG $EXTRA_SOLRMARC_SETTINGS -jar $JAR_FILE $PROPERTIES_FILE -solrj $SOLRJ_DIR -lib_local "$VUFIND_HOME/import/lib_local\;$VUFIND_HOME/solr/vendor/modules/analysis-extras/lib" $MARC_PATH/$MARC_FILE"
echo "Now Importing $1 ..."
RUN_CMD="$JAVA $INDEX_OPTIONS -Duser.timezone=UTC -Dlog4j.configuration=file://$LOG4J_CONFIG $EXTRA_SOLRMARC_SETTINGS -jar $JAR_FILE $PROPERTIES_FILE -solrj $SOLRJ_DIR -lib_local "$VUFIND_HOME/import/lib_local\;$VUFIND_HOME/solr/vendor/modules/analysis-extras/lib" $NORMALIZED_PATHS"
echo "Now Importing $NORMALIZED_PATHS ..."
# solrmarc writes log messages to stderr, write RUN_CMD to the same place
echo "`date '+%h %d, %H:%M:%S'` $RUN_CMD" >&2
exec $RUN_CMD

0 comments on commit a2b1e19

Please sign in to comment.