Skip to content

Commit

Permalink
Merge branch 'add-lastmod'
Browse files Browse the repository at this point in the history
  • Loading branch information
mhucka committed Jan 9, 2019
2 parents 7161f4f + a19c1d7 commit 247b05d
Show file tree
Hide file tree
Showing 8 changed files with 73 additions and 17 deletions.
5 changes: 4 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
Change log for eprints2bags
===========================

Version 1.5.1
Version 1.6.0
-------------

* Add new `--lastmod` command-line option
* Fix failure to parse combinations of ranges passed as arguments to `-i` option
* Slightly change the comment block written to a zip archive to make it more specific
* Update help strings and text in README


Version 1.5.0
Expand Down
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@ A program for downloading records from an EPrints server and creating [BagIt](ht

[![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg?style=flat-square)](https://choosealicense.com/licenses/bsd-3-clause)
[![Python](https://img.shields.io/badge/Python-3.5+-brightgreen.svg?style=flat-square)](http://shields.io)
[![Latest release](https://img.shields.io/badge/Latest_release-1.5.0-b44e88.svg?style=flat-square)](http://shields.io)
[![DOI](http://img.shields.io/badge/DOI-10.22002%20%2f%20D1.1150-blue.svg?style=flat-square)](https://data.caltech.edu/badge/records/1150)
[![Latest release](https://img.shields.io/badge/Latest_release-1.6.0-b44e88.svg?style=flat-square)](http://shields.io)
<!--
[![DOI](http://img.shields.io/badge/DOI-10.22002%20%2f%20D1.1150-blue.svg?style=flat-square)](https://data.caltech.edu/badge/records/1150) -->

🏁 Log of recent changes
-----------------------

_Version 1.5.0_: `eprints2bags` now determines which derived files to ignore for a given record by looking at the `<relation>` element for each document, and checking if the relationship is `isVolatileVersionOf`. This makes it possible to ignore thumbnail images no matter what format or file name they have. It also now stores user login & password information on a per-server basis, instead of (as previously) using a single login & password for all servers, and accepts empty user names and passwords in case an EPrints server does not need them.
_Version 1.6.0_: New command-line option `--lastmod` (`-l` for short) allows you to specify a date/time description, to return only those records whose last-modified date/time stamp is no older than the given date/time. Valid descriptions are those accepted by the Python dateparser library. Example: `eprints2bags --lastmod "yesterday at noon" -a ...`.

The file [CHANGES](CHANGES.md) contains a more complete change log that includes information about previous releases.

Expand Down Expand Up @@ -147,7 +148,8 @@ The following table summarizes all the command line options available. (Note: on
| `-a`_A_ | `--api-url`_A_ | Use _A_ as the server's REST API URL | ||
| `-b`_B_ | `--base-name`_B_ | Name the records with the template _B_-n | Use only the record id number, n | |
| `-f`_F_ | `--final-fmt`_F_ | Create single-file archive in format _F_ | Uncompressed ZIP archive | |
| `-i`_L_ | `--id-list`_L_ | List of records to get (can be a file name) | Fetch all records from the server | |
| `-i`_I_ | `--id-list`_I_ | List of records to get (can be a file name) | Fetch all records from the server | |
| `-l`_L_ | `--lastmod`_L_ | Filter by last-modified date/time | Don't filter by date/time | |
| `-m` | `--missing-ok` | Don't count missing records as an error | Stop if missing record encountered | |
| `-o`_O_ | `--output-dir`_O_ | Write outputs in the directory _O_ | Write in the current directory | |
| `-u`_U_ | `--user`_U_ | User name for EPrints server login | |
Expand Down
4 changes: 2 additions & 2 deletions codemeta.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"codeRepository": "https://github.com/caltechlibrary/eprints2bags",
"issueTracker": "https://github.com/caltechlibrary/eprints2bags/issues",
"license": "https://github.com/caltechlibrary/eprints2bags/blob/master/LICENSE",
"version": "1.5.0",
"version": "1.6.0",
"author": [
{
"@type": "Person",
Expand All @@ -17,7 +17,7 @@
"@id": "https://orcid.org/0000-0001-9105-5960"
}],
"developmentStatus": "active",
"downloadUrl": "https://github.com/caltechlibrary/eprints2bags/archive/1.5.0.zip",
"downloadUrl": "https://github.com/caltechlibrary/eprints2bags/archive/1.6.0.zip",
"keywords": [
"EPrints",
"BagIt",
Expand Down
53 changes: 44 additions & 9 deletions eprints2bags/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@

import eprints2bags
from eprints2bags.constants import ON_WINDOWS, KEYRING_PREFIX
from eprints2bags.data_helpers import flatten, expand_range
from eprints2bags.data_helpers import flatten, expand_range, parse_datetime
from eprints2bags.debug import set_debug, log
from eprints2bags.messages import msg, color, MessageHandler
from eprints2bags.network import network_available, download_files, url_host
Expand All @@ -70,6 +70,10 @@
_BAG_CHECKSUMS = ["sha256", "sha512", "md5"]
'''List of checksum types written with the BagIt bags.'''

_LASTMOD_PRINT_FORMAT = '%b %d %Y %H:%M:%S %Z'
'''Format in which lastmod date is printed back to the user. The value is used
with datetime.strftime().'''


# Main program.
# ......................................................................
Expand All @@ -79,6 +83,7 @@
base_name = ('use base name "B" for subdirectory names', 'option', 'b'),
final_fmt = ('create single-file archive of bag in format "F"', 'option', 'f'),
id_list = ('list of records to get (can be a file name)', 'option', 'i'),
lastmod = ('only get records modified after given date/time', 'option', 'l'),
missing_ok = ('do not count missing records as an error', 'flag', 'm'),
output_dir = ('write output to directory "O"', 'option', 'o'),
password = ('EPrints server user password', 'option', 'p'),
Expand All @@ -94,9 +99,10 @@
)

def main(api_url = 'A', base_name = 'B', final_fmt = 'F', id_list = 'I',
missing_ok = False, output_dir = 'O', user = 'U', password = 'P',
quiet = False, delay = 100, no_bags = False, no_color = False,
no_keyring = False, reset_keys = False, version = False, debug = False):
lastmod = 'L', missing_ok = False, output_dir = 'O',
user = 'U', password = 'P', quiet = False, delay = 100,
no_bags = False, no_color = False, no_keyring = False,
reset_keys = False, version = False, debug = False):
'''eprints2bags bags up EPrints content as BagIt bags.
This program contacts an EPrints REST server whose network API is accessible
Expand All @@ -114,8 +120,21 @@ def main(api_url = 'A', base_name = 'B', final_fmt = 'F', id_list = 'I',
..., 100 inclusive), or some combination thereof. In those cases, the
records written will be limited to those numbered.
By default, if a record requested or implied by the arguments to -i is
missing from the EPrints server, this will count as an error and stop
If the -l option (or /l on Windows) is given, the records will be additionally
filtered to return only those whose last-modified date/time stamp is no older
than the given date/time description. Valid descriptors are those accepted
by the Python dateparser library. Make sure to enclose descriptions within
single or double quotes. Examples:
eprints2bags -l "2 weeks ago" -a ....
eprints2bags -l "2014-08-29" -a ....
eprints2bags -l "12 Dec 2014" -a ....
eprints2bags -l "July 4, 2013" -a ....
Last-mod filtering is applied after any -i option is processed.
By default, if a record requested or implied by the arguments to -i and/or -l
is missing from the EPrints server, this will count as an error and stop
execution of the program. If the option -m (or /m on Windows) is given,
missing records will be ignored.
Expand Down Expand Up @@ -234,6 +253,16 @@ def main(api_url = 'A', base_name = 'B', final_fmt = 'F', id_list = 'I',
else:
wanted = list(parsed_id_list(id_list))

if lastmod == 'L':
lastmod = None
else:
try:
lastmod = parse_datetime(lastmod)
lastmod_str = lastmod.strftime(_LASTMOD_PRINT_FORMAT)
if __debug__: log('Parsed lastmod as {}', lastmod_str)
except Exception as ex:
exit(say.fatal_text('Unable to parse lastmod value: {}', str(ex)))

if output_dir == 'O':
output_dir = os.getcwd()
if not path.isabs(output_dir):
Expand All @@ -259,16 +288,18 @@ def main(api_url = 'A', base_name = 'B', final_fmt = 'F', id_list = 'I',
if not user or not password:
user, password = credentials(api_url, user, password, use_keyring, reset_keys)
if not wanted:
if __debug__: log('Fetching records list from {}'.format(api_url))
say.info('Fetching records list from {}', api_url)
wanted = eprints_records_list(api_url, user, password)
fs = fs_type(output_dir)
if __debug__: log('Destination file system is {}', fs)
if fs in KNOWN_SUBDIR_LIMITS and len(wanted) > KNOWN_SUBDIR_LIMITS[fs]:
text = '{} is too many folders for the file system at "{}".'
exit(say.fatal_text(text.format(intcomma(num_wanted), output_dir)))

say.info('Beginning to process {} EPrints {}.', intcomma(len(wanted)),
say.info('Beginning to process {} EPrints {}', intcomma(len(wanted)),
'entries' if len(wanted) > 1 else 'entry')
if lastmod:
say.info('Will only keep records modified after {}', lastmod_str)
say.info('Output will be written under directory "{}"', output_dir)
make_dir(output_dir)

Expand All @@ -281,6 +312,10 @@ def main(api_url = 'A', base_name = 'B', final_fmt = 'F', id_list = 'I',
xml = eprints_xml(number, api_url, user, password, missing_ok, say)
if xml == None:
continue
if lastmod and eprints_lastmod(xml) < lastmod:
say.info("{} hasn't been modified since {} -- skipping",
number, lastmod_str)
continue

# Good so far. Create the directory and write the XML out.
record_dir = path.join(output_dir, name_prefix + str(number))
Expand Down Expand Up @@ -432,7 +467,7 @@ def file_comments(bag):
text += 'About this archive file:\n'
text += '\n'
text += 'This is an archive of a file directory organized in BagIt v1.0 format.\n'
text += 'The bag contains the contents from the EPrints record located at\n'
text += 'The data in the bag are the contents of the EPrints record located at\n'
text += bag.info['External-Identifier']
text += '\n\n'
text += 'The software used to create this archive file was:\n'
Expand Down
2 changes: 1 addition & 1 deletion eprints2bags/__version__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# @website https://github.com/caltechlibrary/eprints2bags
# =============================================================================

__version__ = '1.5.1'
__version__ = '1.6.0'
__title__ = 'eprints2bags'
__name__ = 'eprints2bags'
__description__ = '''Package up EPrints materials as BagIt bags.'''
Expand Down
9 changes: 9 additions & 0 deletions eprints2bags/data_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
data_helpers: data manipulation utilities
'''

import dateparser
import datetime

# Based on http://stackoverflow.com/a/10824484/743730
def flatten(iterable):
'''Flatten a list produced by an iterable. Non-recursive.'''
Expand Down Expand Up @@ -39,3 +42,9 @@ def expand_range(text):
return [*map(str, range(int(range_list[0]), int(range_list[1]) + 1))]
else:
return text


def parse_datetime(string):
    '''Parse a human-written time/date string using dateparser's parse()
    function with predefined settings.

    The parse is requested with RETURN_AS_TIMEZONE_AWARE enabled, so that
    results can be compared with one another without mixing naive and aware
    datetime values.

    Returns the datetime object produced by dateparser.

    Raises ValueError if dateparser cannot interpret the string.
    '''
    result = dateparser.parse(string, settings = {'RETURN_AS_TIMEZONE_AWARE': True})
    # dateparser.parse() signals failure by returning None rather than by
    # raising.  Passing that None back to callers only defers the failure to
    # an unrelated AttributeError/TypeError, so report the bad input here.
    if result is None:
        raise ValueError('unrecognized date/time description: {!r}'.format(string))
    return result
6 changes: 6 additions & 0 deletions eprints2bags/eprints.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import shutil

import eprints2bags
from eprints2bags.data_helpers import parse_datetime
from eprints2bags.debug import log
from eprints2bags.exceptions import *
from eprints2bags.network import net
Expand Down Expand Up @@ -111,6 +112,11 @@ def eprints_xml(number, base_url, user, password, missing_ok, say):
return etree.fromstring(response.content)


def eprints_lastmod(xml):
    '''Return the last-modified date/time of an EPrints record.

    Searches "xml" for the first descendant <lastmod> element in the EPrints
    XML namespace and passes that element's text to parse_datetime(), whose
    result is returned unchanged.
    '''
    # NOTE(review): presumably "xml" is the element tree returned by
    # eprints_xml(); if no <lastmod> element is found, find() would give None
    # and the .text access below would raise AttributeError -- confirm that
    # every record carries this element.
    lastmod_path = './/{' + _EPRINTS_XMLNS + '}lastmod'
    element = xml.find(lastmod_path)
    timestamp_text = element.text
    return parse_datetime(timestamp_text)


def eprints_documents(xml):
files = []
# Ignore documents that are derived versions of original docs. These are
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
bagit>=1.7.0
colorama>=0.3.0
dateparser>=0.7.0
humanize>=0.5.1
keyring>=12.2.0
keyrings.alt>=3.1
Expand Down

0 comments on commit 247b05d

Please sign in to comment.