From 1cd593c4bc8ae123be065f274735a6293249fde5 Mon Sep 17 00:00:00 2001 From: Michael Hucka Date: Fri, 21 Dec 2018 11:56:25 -0800 Subject: [PATCH 1/9] WIP add --lastmod option --- eprints2bags/__main__.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/eprints2bags/__main__.py b/eprints2bags/__main__.py index 60cfdc2..e09b355 100755 --- a/eprints2bags/__main__.py +++ b/eprints2bags/__main__.py @@ -78,6 +78,7 @@ base_name = ('use base name "B" for subdirectory names', 'option', 'b'), final_fmt = ('create single-file archive of bag in format "F"', 'option', 'f'), id_list = ('list of records to get (can be a file name)', 'option', 'i'), + lastmod = ('only get records modified since the given date', 'option', 'l'), missing_ok = ('do not count missing records as an error', 'flag', 'm'), output_dir = ('write output to directory "O"', 'option', 'o'), password = ('EPrints server user password', 'option', 'p'), @@ -93,9 +94,10 @@ ) def main(api_url = 'A', base_name = 'B', final_fmt = 'F', id_list = 'I', - missing_ok = False, output_dir = 'O', user = 'U', password = 'P', - quiet = False, delay = 100, no_bags = False, no_color = False, - no_keyring = False, reset_keys = False, version = False, debug = False): + lastmod = 'L', missing_ok = False, output_dir = 'O', + user = 'U', password = 'P', quiet = False, delay = 100, + no_bags = False, no_color = False, no_keyring = False, + reset_keys = False, version = False, debug = False): '''eprints2bags bags up EPrints content as BagIt bags. This program contacts an EPrints REST server whose network API is accessible @@ -113,8 +115,21 @@ def main(api_url = 'A', base_name = 'B', final_fmt = 'F', id_list = 'I', ..., 100 inclusive), or some combination thereof. In those cases, the records written will be limited to those numbered. 
-By default, if a record requested or implied by the arguments to -i is -missing from the EPrints server, this will count as an error and stop +If the -l option (or /l on Windows) is given, the records will be additionally +filtered to return only those whose last-modified date/time stamp is no older +than the given date/time description. Valid descriptors are those accepted +by the Python dateparser library. Make sure to enclose descriptions within +single or double quotes. Examples: + + eprints2bags -l "2 weeks ago" -a .... + eprints2bags -l "2014-08-29" -a .... + eprints2bags -l "12 Dec 2014" -a .... + eprints2bags -l "July 4, 2013" -a .... + +Last-mod filtering is applied after any -i option is processed. + +By default, if a record requested or implied by the arguments to -i and/or -l +is missing from the EPrints server, this will count as an error and stop execution of the program. If the option -m (or /m on Windows) is given, missing records will be ignored. From e8fa5294373e32813644794eef45eff92fb608d4 Mon Sep 17 00:00:00 2001 From: Michael Hucka Date: Tue, 8 Jan 2019 15:58:01 -0800 Subject: [PATCH 2/9] Require dateparser library --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 70be9fd..df9830d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ bagit>=1.7.0 colorama>=0.3.0 +dateparser>=0.7.0 humanize>=0.5.1 keyring>=12.2.0 keyrings.alt>=3.1 From 4687ad1ca8c332b696f93e29d3bdcee9729d53e1 Mon Sep 17 00:00:00 2001 From: Michael Hucka Date: Tue, 8 Jan 2019 19:38:15 -0800 Subject: [PATCH 3/9] Add helper for parsing date/time strings --- eprints2bags/data_helpers.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/eprints2bags/data_helpers.py b/eprints2bags/data_helpers.py index ccd4982..c1db6b6 100644 --- a/eprints2bags/data_helpers.py +++ b/eprints2bags/data_helpers.py @@ -2,6 +2,9 @@ data_helpers: data manipulation utilities ''' +import dateparser +import datetime + # 
Based on http://stackoverflow.com/a/10824484/743730 def flatten(iterable): '''Flatten a list produced by an iterable. Non-recursive.''' @@ -39,3 +42,9 @@ def expand_range(text): return [*map(str, range(int(range_list[0]), int(range_list[1]) + 1))] else: return text + + +def parse_datetime(string): + '''Parse a human-written time/date string using dateparser's parse() +function with predefined settings.''' + return dateparser.parse(string, settings = {'RETURN_AS_TIMEZONE_AWARE': True}) From d1bb8b8637bf30dcbad1d160000feaea9d833be0 Mon Sep 17 00:00:00 2001 From: Michael Hucka Date: Tue, 8 Jan 2019 19:43:22 -0800 Subject: [PATCH 4/9] Add new --lastmod command-line argument --- eprints2bags/__main__.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/eprints2bags/__main__.py b/eprints2bags/__main__.py index 6cc9e9b..07d511a 100755 --- a/eprints2bags/__main__.py +++ b/eprints2bags/__main__.py @@ -50,7 +50,7 @@ import eprints2bags from eprints2bags.constants import ON_WINDOWS, KEYRING_PREFIX -from eprints2bags.data_helpers import flatten, expand_range +from eprints2bags.data_helpers import flatten, expand_range, parse_datetime from eprints2bags.debug import set_debug, log from eprints2bags.messages import msg, color, MessageHandler from eprints2bags.network import network_available, download_files, url_host @@ -70,6 +70,10 @@ _BAG_CHECKSUMS = ["sha256", "sha512", "md5"] '''List of checksum types written with the BagIt bags.''' +_LASTMOD_PRINT_FORMAT = '%b %d %Y %H:%M:%S %Z' +'''Format in which lastmod date is printed back to the user. The value is used +with datetime.strftime().''' + # Main program. # ...................................................................... 
@@ -79,7 +83,7 @@ base_name = ('use base name "B" for subdirectory names', 'option', 'b'), final_fmt = ('create single-file archive of bag in format "F"', 'option', 'f'), id_list = ('list of records to get (can be a file name)', 'option', 'i'), - lastmod = ('only get records modified since the given date', 'option', 'l'), + lastmod = ('only get records modified after given date/time', 'option', 'l'), missing_ok = ('do not count missing records as an error', 'flag', 'm'), output_dir = ('write output to directory "O"', 'option', 'o'), password = ('EPrints server user password', 'option', 'p'), @@ -249,6 +253,16 @@ def main(api_url = 'A', base_name = 'B', final_fmt = 'F', id_list = 'I', else: wanted = list(parsed_id_list(id_list)) + if lastmod == 'L': + lastmod = None + else: + try: + lastmod = parse_datetime(lastmod) + lastmod_str = lastmod.strftime(_LASTMOD_PRINT_FORMAT) + if __debug__: log('Parsed lastmod as {}', lastmod_str) + except Exception as ex: + exit(say.fatal_text('Unable to parse lastmod value: {}', str(ex))) + if output_dir == 'O': output_dir = os.getcwd() if not path.isabs(output_dir): @@ -274,7 +288,7 @@ def main(api_url = 'A', base_name = 'B', final_fmt = 'F', id_list = 'I', if not user or not password: user, password = credentials(api_url, user, password, use_keyring, reset_keys) if not wanted: - if __debug__: log('Fetching records list from {}'.format(api_url)) + say.info('Fetching records list from {}', api_url) wanted = eprints_records_list(api_url, user, password) fs = fs_type(output_dir) if __debug__: log('Destination file system is {}', fs) @@ -282,8 +296,10 @@ def main(api_url = 'A', base_name = 'B', final_fmt = 'F', id_list = 'I', text = '{} is too many folders for the file system at "{}".' 
exit(say.fatal_text(text.format(intcomma(num_wanted), output_dir))) - say.info('Beginning to process {} EPrints {}.', intcomma(len(wanted)), + say.info('Beginning to process {} EPrints {}', intcomma(len(wanted)), 'entries' if len(wanted) > 1 else 'entry') + if lastmod: + say.info('Will only keep records modified after {}', lastmod_str) say.info('Output will be written under directory "{}"', output_dir) make_dir(output_dir) @@ -296,6 +312,10 @@ def main(api_url = 'A', base_name = 'B', final_fmt = 'F', id_list = 'I', xml = eprints_xml(number, api_url, user, password, missing_ok, say) if xml == None: continue + if lastmod and eprints_lastmod(xml) < lastmod: + say.info("{} hasn't been modified since {} -- skipping", + number, lastmod_str) + continue # Good so far. Create the directory and write the XML out. record_dir = path.join(output_dir, name_prefix + str(number)) From 6eb46c58ecbddd936342a3aca7e42b8a0fff3295 Mon Sep 17 00:00:00 2001 From: Michael Hucka Date: Tue, 8 Jan 2019 19:44:09 -0800 Subject: [PATCH 5/9] Edit the zip archive file comment for greater precision --- eprints2bags/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eprints2bags/__main__.py b/eprints2bags/__main__.py index 07d511a..da96679 100755 --- a/eprints2bags/__main__.py +++ b/eprints2bags/__main__.py @@ -467,7 +467,7 @@ def file_comments(bag): text += 'About this archive file:\n' text += '\n' text += 'This is an archive of a file directory organized in BagIt v1.0 format.\n' - text += 'The bag contains the contents from the EPrints record located at\n' + text += 'The data in the bag are the contents of the EPrints record located at\n' text += bag.info['External-Identifier'] text += '\n\n' text += 'The software used to create this archive file was:\n' From 023acff84034e763f471b6cfbb7c4c418068df54 Mon Sep 17 00:00:00 2001 From: Michael Hucka Date: Tue, 8 Jan 2019 19:44:17 -0800 Subject: [PATCH 6/9] Add eprints_lastmod() --- eprints2bags/eprints.py | 6 ++++++ 1 file 
changed, 6 insertions(+) diff --git a/eprints2bags/eprints.py b/eprints2bags/eprints.py index d9bbfa2..c991f1f 100644 --- a/eprints2bags/eprints.py +++ b/eprints2bags/eprints.py @@ -22,6 +22,7 @@ import shutil import eprints2bags +from eprints2bags.data_helpers import parse_datetime from eprints2bags.debug import log from eprints2bags.exceptions import * from eprints2bags.network import net @@ -111,6 +112,11 @@ def eprints_xml(number, base_url, user, password, missing_ok, say): return etree.fromstring(response.content) +def eprints_lastmod(xml): + lastmod_elem = xml.find('.//{' + _EPRINTS_XMLNS + '}lastmod') + return parse_datetime(lastmod_elem.text) + + def eprints_documents(xml): files = [] # Ignore documents that are derived versions of original docs. These are From 89e8e20da2c107ced0f97dd502de9921de707c48 Mon Sep 17 00:00:00 2001 From: Michael Hucka Date: Tue, 8 Jan 2019 19:44:32 -0800 Subject: [PATCH 7/9] Explain new --lastmod command-line option --- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 6f57f53..561613d 100644 --- a/README.md +++ b/README.md @@ -9,13 +9,14 @@ A program for downloading records from an EPrints server and creating [BagIt](ht [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg?style=flat-square)](https://choosealicense.com/licenses/bsd-3-clause) [![Python](https://img.shields.io/badge/Python-3.5+-brightgreen.svg?style=flat-square)](http://shields.io) -[![Latest release](https://img.shields.io/badge/Latest_release-1.5.0-b44e88.svg?style=flat-square)](http://shields.io) -[![DOI](http://img.shields.io/badge/DOI-10.22002%20%2f%20D1.1150-blue.svg?style=flat-square)](https://data.caltech.edu/badge/records/1150) +[![Latest release](https://img.shields.io/badge/Latest_release-1.6.0-b44e88.svg?style=flat-square)](http://shields.io) + 🏁 Log of recent changes ----------------------- -_Version 1.5.0_: `eprints2bags` now determines which derived files to ignore 
for a given record by looking at the `<relation>` element for each document, and checking if the relationship is `isVolatileVersionOf`. This makes it possible to ignore thumbnail images no matter what format or file name they have. It also now stores user login & password information on a per-server basis, instead of (as previously) using a single login & password for all servers, and accepts empty user names and passwords in case an EPrints server does not need them. +_Version 1.6.0_: New command-line option `--lastmod` (`-l` for short) allows you to specify a date/time stamp, to return only those records whose last-modified date/time stamp is no older than the given description. Valid descriptors are those accepted by the Python dateparser library. Example: `eprints2bags --lastmod "yesterday at noon" -a ...`. The file [CHANGES](CHANGES.md) contains a more complete change log that includes information about previous releases. @@ -147,7 +148,8 @@ The following table summarizes all the command line options available. 
(Note: on | `-a`_A_ | `--api-url`_A_ | Use _A_ as the server's REST API URL | | ⚑ | | `-b`_B_ | `--base-name`_B_ | Name the records with the template _B_-n | Use only the record id number, n | | | `-f`_F_ | `--final-fmt`_F_ | Create single-file archive in format _F_ | Uncompressed ZIP archive | | -| `-i`_L_ | `--id-list`_L_ | List of records to get (can be a file name) | Fetch all records from the server | | +| `-i`_I_ | `--id-list`_I_ | List of records to get (can be a file name) | Fetch all records from the server | | +| `-l`_L_ | `--lastmod`_L_ | Filter by last-modified date/time | Don't filter by date/time | | | `-m` | `--missing-ok` | Don't count missing records as an error | Stop if missing record encountered | | | `-o`_O_ | `--output-dir`_O_ | Write outputs in the directory _O_ | Write in the current directory | | | `-u`_U_ | `--user`_U_ | User name for EPrints server login | | From f85478f966f304edffa4e29a25a1451bb3d85469 Mon Sep 17 00:00:00 2001 From: Michael Hucka Date: Tue, 8 Jan 2019 19:46:49 -0800 Subject: [PATCH 8/9] Update for latest changes --- CHANGES.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 560b16f..cf76a3e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,10 +1,13 @@ Change log for eprints2bags =========================== -Version 1.5.1 +Version 1.6.0 ------------- +* Add new `--lastmod` command-line option * Fix failure to parse combinations of ranges passed as arguments to `-i` option +* Slightly change the comment block written to a zip archive to make it more specific +* Update help strings and text in README Version 1.5.0 From a19c1d70d1c8e8218d76cd3f50282719ccbb515d Mon Sep 17 00:00:00 2001 From: Michael Hucka Date: Tue, 8 Jan 2019 19:47:07 -0800 Subject: [PATCH 9/9] Update for 1.6.0 --- codemeta.json | 4 ++-- eprints2bags/__version__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/codemeta.json b/codemeta.json index 006c53d..83a4381 100644 --- 
a/codemeta.json +++ b/codemeta.json @@ -6,7 +6,7 @@ "codeRepository": "https://github.com/caltechlibrary/eprints2bags", "issueTracker": "https://github.com/caltechlibrary/eprints2bags/issues", "license": "https://github.com/caltechlibrary/eprints2bags/blob/master/LICENSE", - "version": "1.5.0", + "version": "1.6.0", "author": [ { "@type": "Person", @@ -17,7 +17,7 @@ "@id": "https://orcid.org/0000-0001-9105-5960" }], "developmentStatus": "active", - "downloadUrl": "https://github.com/caltechlibrary/eprints2bags/archive/1.5.0.zip", + "downloadUrl": "https://github.com/caltechlibrary/eprints2bags/archive/1.6.0.zip", "keywords": [ "EPrints", "BagIt", diff --git a/eprints2bags/__version__.py b/eprints2bags/__version__.py index b2e4dad..cb7e7c9 100644 --- a/eprints2bags/__version__.py +++ b/eprints2bags/__version__.py @@ -6,7 +6,7 @@ # @website https://github.com/caltechlibrary/eprints2bags # ============================================================================= -__version__ = '1.5.1' +__version__ = '1.6.0' __title__ = 'eprints2bags' __name__ = 'eprints2bags' __description__ = '''Package up EPrints materials as BagIt bags.'''