From e014efde5b02e2e0ff68bc379525582d02b1c523 Mon Sep 17 00:00:00 2001 From: Michael Hucka Date: Fri, 16 Apr 2021 17:48:14 -0700 Subject: [PATCH 1/6] Return the count of articles archived by microarchiver --- scripts/helpers.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/helpers.sh b/scripts/helpers.sh index 527db05..8dc9a50 100644 --- a/scripts/helpers.sh +++ b/scripts/helpers.sh @@ -60,11 +60,13 @@ EOF fi # Run microarchiver with arguments and save output in $log - microarchiver $@ >> $log 2>&1 + count=$(microarchiver $@ 2>&1 | tee -a $log | grep "Total articles" | cut -d ' ' -f3; exit ${PIPESTATUS[0]}) - # Was it a successful run? If not, send mail & quit. + # If microarchiver itself succeeded, return num. of articles, else mail & quit status=$? - if (($status > 0 && $status < 100)); then + if (($status == 0)); then + echo $count + else case "$status" in 1) cause="No network detected" ;; 2) cause="The user interrupted program execution" ;; From c8362eeb3cb40141273bbccc540a22c8bc4dba12 Mon Sep 17 00:00:00 2001 From: Michael Hucka Date: Fri, 16 Apr 2021 17:48:32 -0700 Subject: [PATCH 2/6] Make the script email both the CSV and HTML format reports --- scripts/archive-in-portico | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) mode change 100755 => 100644 scripts/archive-in-portico diff --git a/scripts/archive-in-portico b/scripts/archive-in-portico old mode 100755 new mode 100644 index cdedb59..517b2fb --- a/scripts/archive-in-portico +++ b/scripts/archive-in-portico @@ -45,7 +45,7 @@ today=$(date +%Y-%m-%d) datestampfile=$PORTICO_OUTPUT/last-run-date failurefile=$PORTICO_OUTPUT/last-failures -# Today's run will be written in a subdirctory. Note the subdirectory name +# Today's run will be written in a subdirectory. Note the subdirectory name # includes the current time, not just today's date, because otherwise we # would overwrite the previous data if we ran run multiple times per day.
now=$(date +%Y-%m-%d-%H%M) @@ -91,26 +91,30 @@ fi # Run microarchiver separately on past failures, leaving the results unpackaged # so that we can add to them the results of today's run. +rerun_count=0 if [[ -f $failurefile ]]; then echo "=== Running microarchiver on past failures ===" >> $log echo "" >> $log # Note the use of -Z to prevent zip'ing the final results. - run_microarchiver -s portico -Z -C -a $failurefile -o $outputdir -r $report -@ $debuglog + run_microarchiver -s portico -Z -C -a $failurefile -o $outputdir -r $report \ + -f csv,html -t Past_failures_retried -@ $debuglog echo "" >> $log + mv $outputdir/report.html $outputdir/rerun-report.html fi echo "=== Running microarchiver for new articles ===" >> $log echo "" >> $log -thisreport=$outputdir/latest-report.csv -thisdebuglog=$outputdir/latest-debug.log +this_report=$outputdir/latest-report.csv +this_debuglog=$outputdir/latest-debug.log # This will add new articles to any existing ones from the past failures # code above, and this time will zip up the final result. -run_microarchiver -s portico -C -d $afterdate -o $outputdir -r $thisreport -@ $thisdebuglog +run_microarchiver -s portico -C -d $afterdate -o $outputdir -r $this_report \ + -f csv,html -t $today -@ $this_debuglog # Combine separate report files, leave that, & delete the intermediate files. -tail -n +2 $thisreport >> $report -tail $thisdebuglog >> $debuglog -rm -f $thisdebuglog $thisreport +tail -n +2 $this_report >> $report +tail $this_debuglog >> $debuglog +rm -f $this_debuglog $this_report # Did we have any failures? If so, note them for next time.
grep -i "missing," $outputdir/*report.csv | cut -f2 -d',' > $failurefile @@ -136,13 +140,13 @@ echo $today > $datestampfile grep -F "Total articles" $log | \ sed 's/Total //g;1 s/articles/Past failures retried/;2 s/articles/New &/' | \ - mail -s "Portico archiving results for $today" -a $report -a $log $EMAIL_SUCCESS + mail -s "Portico archiving results for $today" \ + -a $outputdir/latest-report.html -a $outputdir/rerun-report.html \ + -a $log $EMAIL_SUCCESS # Post the report to Slack ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ run_slack chat send --channel $SLACK_CHANNEL --color "#00ff00" \ - --title "microarchiver successfully completed Portico upload" \ + --title "Portico run for micropublications.org completed." \ --text "There were $(wc -l < $failurefile) articles skipped." -run_slack file upload --channels $SLACK_CHANNEL --file $report \ - --comment "Here is the record of what was uploaded:" From 549a68f28ef1ccd52f1f9f1c9dab1a49adef705b Mon Sep 17 00:00:00 2001 From: mhucka Date: Tue, 1 Jun 2021 11:21:07 -0700 Subject: [PATCH 3/6] Increase default network timeout duration The file for 10.17912/micropub.biology.000402 was taking too long to download and exceeded the 20 second default timeout, so I increased it. 
--- microarchiver/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/microarchiver/network.py b/microarchiver/network.py index db5bf36..9c8ca88 100644 --- a/microarchiver/network.py +++ b/microarchiver/network.py @@ -251,7 +251,7 @@ def addurl(text): return (text + ' for {}').format(url) try: - req = timed_request('get', url, stream = True) + req = timed_request('get', url, stream = True, timeout = 60) except requests.exceptions.ConnectionError as ex: if recursing >= _MAX_RECURSIVE_CALLS: raise NetworkFailure(addurl('Too many connection errors')) From 53cd222f24415d1988e768418c213bb23a1f1c55 Mon Sep 17 00:00:00 2001 From: mhucka Date: Tue, 1 Jun 2021 11:21:27 -0700 Subject: [PATCH 4/6] Stop Pillow complaining about large files --- microarchiver/__main__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/microarchiver/__main__.py b/microarchiver/__main__.py index 1de2c53..23f3009 100644 --- a/microarchiver/__main__.py +++ b/microarchiver/__main__.py @@ -39,6 +39,10 @@ # "OSError: image file is truncated (10 bytes not processed)" ImageFile.LOAD_TRUNCATED_IMAGES = True +# This is to prevent Pillow from warning "DecompressionBombWarning: Image size +# (100153418 pixels) exceeds limit of 89478485 pixels ..." +Image.MAX_IMAGE_PIXELS = None + import microarchiver from microarchiver import print_version from .exceptions import * From 7ab73ad99907b8d3991bbb6a6e6448911a6270c3 Mon Sep 17 00:00:00 2001 From: mhucka Date: Tue, 1 Jun 2021 11:25:24 -0700 Subject: [PATCH 5/6] Version bump --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index dd1e114..557565e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -12,7 +12,7 @@ [metadata] name = microarchiver -version = 1.12.0 +version = 1.12.1 summary = Archives articles from microPublication.org description = Create archives of articles from microPublication.org. 
author = Michael Hucka, Tom Morrell From 93bb0f097688d40f1514277333d0022571731109 Mon Sep 17 00:00:00 2001 From: mhucka Date: Tue, 1 Jun 2021 11:25:30 -0700 Subject: [PATCH 6/6] Summarize recent changes --- CHANGES.md | 93 +++++++++++++++++++++--------------------------------- 1 file changed, 36 insertions(+), 57 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 2f392f2..ea28ee4 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,88 +1,81 @@ -Change log for microarchiver -============================ +# Change log for microarchiver -Version 1.12.0 --------------- +## Version 1.12.1 + +This release fixes a couple of issues: + +* The default network timeout was too short to get large PDF files from micropublication.org. Fixed by tripling the timeout duration (from 20 to 60 seconds). +* Converting very large images exceeded the default pixel-count limit in the Python Pillow package, which caused a `DecompressionBombWarning`. Fixed by disabling the size check. + + +## Version 1.12.0 Reports can now be written in _both_ CSV and HTML formats. -Version 1.11.0 --------------- +## Version 1.11.0 -* Add support for producing reports in HTML format. * Add support for specifying the title of the report. * Fix incorrect count of articles in ZIP file comments. -Version 1.10.7 --------------- +## Version 1.10.7 * Test for more signs of failure in `upload-to-pmc.sh`. * Make some very tiny tweaks to the format of logs. -Version 1.10.6 --------------- +## Version 1.10.6 * Assume the use of Python `virtualenv` to lock in a specific Python environment. * Fix a bug in one of the workflow scripts in which the lack of a mail message body caused the mail command to hang indefinitely. -Version 1.10.5 --------------- +## Version 1.10.5 * Add new helper function to run `curl` in the upload script for PMC. * Fix inconsistency in the PMC upload script, wherein the user and password variables were not the same name as the cron variables actually used. -Version 1.10.4 --------------- +## Version 1.10.4 * Fix bug in date handling in workflow scripts.
The value of the `--after-date` argument to `microarchiver` was set to the date it ran, which caused it to miss articles published on the date that it ran. The value of the date should have been modified to include the day it last ran so that the date comparison was correct. (Thanks to Nick Stiffler for catching and reporting the problem.) -Version 1.10.3 --------------- +## Version 1.10.3 (Mistaken release -- ignore this.) -Version 1.10.2 --------------- +## Version 1.10.2 * Update the workflow scripts and associated crontab template. -Version 1.10.1 --------------- +## Version 1.10.1 * Fix behavior when DataCite has no data for an article: `microarchiver` was _meant_ to flag the article and keep going, but instead it treated it as a fatal error. * Fix some documentation errors about the numeric codes returned by `microarchiver`. * Minor other improvements. -Version 1.10.0 --------------- +## Version 1.10.0 This version changes the behavior of the `-@` command-line option, such that exceptions encountered when running with the `-@` option do _not_ cause `microarchiver` to drop into an interactive debugger. The old behavior turned out to be unhelpful in practice, and moreover, it mixed two behaviors into one command-line flag. The latter was problematic when running `microarchiver` from scripts. -Version 1.9.4 --------------- +## Version 1.9.4 This version removes an unnecessary dependency on wxPython. A GUI interface was never completed for Microarchiver, and while the starting code is still in the code base in case we try to build a GUI, it doesn't have to be hooked in at this point. Removing the internal references to the GUI code allows the wxPython requirement to be removed, which in turn simplifies and speeds up installation. -Version 1.9.3 --------------- +## Version 1.9.3 * Add missing Python package requirement to requirements.txt. * Simplify PMC upload script. -Version 1.9.2 --------------- +## Version 1.9.2 * Fix broken logos and images in README.md.
* Replace local version of `debug.py` with the use of [Sidetrack](https://github.com/caltechlibrary/sidetrack). @@ -90,14 +83,12 @@ Version 1.9.2 * Minor internal changes. -Version 1.9.1 ------------- +## Version 1.9.1 * Fix [issue #2](https://github.com/caltechlibrary/microarchiver/issues/2): volume number in file names is incorrectly determined -Version 1.9.0 ------------- +## Version 1.9.0 * Support output for PMC using new command-line option `-s`. * Rename the JATS XML file after the pattern _issn_-_volume_-_doi_.xml, to make it more compatible with output generated for PMC. @@ -107,14 +98,12 @@ Version 1.9.0 * Some internal code changes. -Version 1.8.0 ------------- +## Version 1.8.0 * Instead of quitting with an error if the file given to `-a` is empty, `microarchiver` will now just print a warning. -Version 1.7.0 ------------- +## Version 1.7.0 * Store JATS XML for each article, as well as any image referenced in the JATS data. Images are converted to uncompressed TIFF before being stored. * Perform JATS validation for each article by default. @@ -127,70 +116,60 @@ Version 1.7.0 * Fix miscellaneous bugs. -Version 1.6.3 ------------- +## Version 1.6.3 * Catch and handle no-content errors more gracefully. * Detect mangled XML returned by micropublication.org and handle it more gracefully. -Version 1.6.2 ------------- +## Version 1.6.2 * Fix crasher in writing comment into zip file because of reference to no-longer-existing package attribute. -Version 1.6.1 ------------- +## Version 1.6.1 * Fix broken handling of debug trace output destination. * Update `README.md` to describe changes to debug flag. -Version 1.6.0 ------------- +## Version 1.6.0 * Change the debug flag `-@` to accept an argument for where to send the debug output trace. The behavior change of `-@` is not backward compatible. * Put metadata in `setup.cfg` and change how Microarchiver gets the metadata internally.
-Version 1.5.1 ------------- +## Version 1.5.1 * Fix bug in propagating network failures up to the top of main. * Fix case of variable being shadowed inside a block. -Version 1.5.0 ------------- +## Version 1.5.0 * Added new `-g` option to print the raw XML article list from the server. * Did very minor internal code refactoring. -Version 1.4.0 ------------- +## Version 1.4.0 * Added new `scripts` subdirectory with script for use with cron. * Fixed behavior: if there are no articles to archive, don't create the output directory either. -Version 1.3.0 ------------- +## Version 1.3.0 * Now if there are no articles to archive, it won't create a zip file. -Version 1.2.0 ------------- +## Version 1.2.0 * Improved installation instructions. * Changed debug flag from `-Z` to `-@`. * Internal code changes for message printing & colorization. -Version 1.1.0 ------------- +## Version 1.1.0 * **Backwards incompatible change**: command-line arguments have been significantly changed in terms of names and shortcut letters. * Addition of new `-d` command-line argument, for getting only articles published after a certain date.