From e073a1041ee34071c50f7cae4928c24385d48f4c Mon Sep 17 00:00:00 2001 From: Sean Kelly Date: Tue, 7 Apr 2020 17:20:43 -0500 Subject: [PATCH 1/2] Resolutions for #13 and #21 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Resolve #21 with a new driver program `aipsip` that generates both the AIP and uses it to make the SIP as well, leaving all in the current working directory (along with two—count 'em, *two*—PDS labels for the price of one!). - Updates the Python `setuptools` metadata to generate the new `aipsip` (helps with #21). - Refactors logging and command-line argument setup (also for #21). - Unifies logging between `aipgen` and `sipgen` with the new `aipsip` so that there are `--debug` and `--quiet` options; without either you get a nominal amount of "hand-holding" of output. - Resolve #13 so that instead of billions of redundant XML parsing and XPath lookups we use a local `sqlite3` database and LRU caching. - Factor out XML parsing from `aipgen` and `sipgen` so we can apply caching. - Clear up logging messages so we can know what's calling what. - Create a temp DB in `sipgen` and populate it with mappings from lidvids to XML files for rapid lookups - But see also #25 for other uses of that DB. - Add standardized `--version` arguments for all three programs. With these changes, running `sipgen` on my Mac¹ can process a 272GiB `insight_cameras` export in 1:03. On `pdsimg-int1`², it handles the 1.5TiB `insight_cameras` dataset in under 4 hours. 
Footnotes: - ¹2.4 GHz 8-core Intel Core i9, SSD - ²2.3 GHz 8-core Intel Xeon Gold 6140, unknown drive --- setup.py | 3 +- src/pds/aipgen/aip.py | 35 +++--- src/pds/aipgen/constants.py | 10 ++ src/pds/aipgen/main.py | 97 ++++++++++++++++ src/pds/aipgen/sip.py | 226 ++++++++++++++++++------------------ src/pds/aipgen/utils.py | 44 ++++++- 6 files changed, 275 insertions(+), 140 deletions(-) create mode 100644 src/pds/aipgen/main.py diff --git a/setup.py b/setup.py index cec9397..acdf95f 100644 --- a/setup.py +++ b/setup.py @@ -64,7 +64,8 @@ entry_points={ 'console_scripts': [ 'sipgen=pds.aipgen.sip:main', - 'aipgen=pds.aipgen.aip:main' + 'aipgen=pds.aipgen.aip:main', + 'aipsip=pds.aipgen.main:main' ] }, namespace_packages=['pds'], diff --git a/src/pds/aipgen/aip.py b/src/pds/aipgen/aip.py index c9e8212..e18c0d1 100644 --- a/src/pds/aipgen/aip.py +++ b/src/pds/aipgen/aip.py @@ -31,7 +31,7 @@ from .constants import PDS_NS_URI, XML_SCHEMA_INSTANCE_NS_URI, PDS_SCHEMA_URL, XML_MODEL_PI, INFORMATION_MODEL_VERSION -from .utils import getPrimariesAndOtherInfo, getMD5 +from .utils import getPrimariesAndOtherInfo, getMD5, parseXML, addLoggingArguments from datetime import datetime from lxml import etree import argparse, logging, sys, os, os.path, hashlib @@ -41,6 +41,7 @@ # --------- # For ``--help``: +_version = '0.0.0' _description = '''Generate an Archive Information Package or AIP. An AIP consists of three files: ➀ a "checksum manifest" which contains MD5 hashes of *all* files in a product; ➁ a "transfer manifest" which lists the "lidvids" for files within each XML label mentioned in a product; and @@ -53,12 +54,8 @@ # Comment to insert near the top of an AIP XML label _iaComment = 'Parse name from bundle logical_identifier, e.g. 
urn:nasa:pds:ladee_mission_bundle would be ladee_mission_bundle' - -# Logging -# ------- - +# Logging: _logger = logging.getLogger(__name__) -logging.basicConfig(format='%(levelname)s %(message)s', level=logging.INFO) # Functions @@ -94,7 +91,7 @@ def _getLIDVIDandFileInventory(xmlFile): identifier, return None and None. ''' _logger.debug('📜 Analyzing XML in %s', xmlFile) - tree = etree.parse(xmlFile) + tree = parseXML(xmlFile) root = tree.getroot() matches = root.findall(f'./{{{PDS_NS_URI}}}Identification_Area/{{{PDS_NS_URI}}}logical_identifier') if not matches: @@ -291,10 +288,10 @@ def _writeLabel( tree.write(labelOutputFile, encoding='utf-8', xml_declaration=True, pretty_print=True) -def _process(bundle): +def process(bundle): '''Generate a "checksum manifest", a "transfer manifest", and a PDS label from the given ``bundle``, which is an open file stream (with a ``name`` atribute) on the local - filesystem. + filesystem. Return the name of the generated checksum manifest file. ''' _logger.info('🏃‍♀️ Starting AIP generation for %s', bundle.name) d = os.path.dirname(os.path.abspath(bundle.name)) @@ -304,7 +301,7 @@ def _process(bundle): strippedLogicalID = bundleLID.split(':')[-1] # Easy one: the checksum† manifest - # †It's actually an MD5 hash, not a checksum 😅 + # †It's actually an MD5 *hash*, not a checksum 😅 chksumFN = strippedLogicalID + '_checksum_manifest_v' + bundleVID + '.tab' chksumMD5, chksumSize, chksumNum = _writeChecksumManifest(chksumFN, d) @@ -328,27 +325,25 @@ def _process(bundle): xferSize, xferNum ) - _logger.info('🎉 Success! All done, files generated:') + _logger.info('🎉 Success! 
AIP done, files generated:') _logger.info('• Checksum manifest: %s', chksumFN) _logger.info('• Transfer manifest: %s', xferFN) - _logger.info('• XML label: %s', labelFN) + _logger.info('• XML label for them both: %s', labelFN) + return chksumFN def main(): - '''Check the command-line for options and create a SIP from the given bundle XML''' + '''Check the command-line for options and create an AIP from the given bundle XML''' parser = argparse.ArgumentParser(description=_description) + parser.add_argument('--version', action='version', version=f'%(prog)s {_version}') + addLoggingArguments(parser) parser.add_argument( 'bundle', type=argparse.FileType('rb'), metavar='IN-BUNDLE.XML', help='Root bundle XML file to read' ) - parser.add_argument( - '-v', '--verbose', default=False, action='store_true', - help='Verbose logging; defaults %(default)s' - ) args = parser.parse_args() - if args.verbose: - _logger.setLevel(logging.DEBUG) + logging.basicConfig(level=args.loglevel, format='%(levelname)s %(message)s') _logger.debug('⚙️ command line args = %r', args) - _process(args.bundle) + process(args.bundle) _logger.info('👋 Thanks for using this program! Bye!') sys.exit(0) diff --git a/src/pds/aipgen/constants.py b/src/pds/aipgen/constants.py index 056adb6..df33a14 100644 --- a/src/pds/aipgen/constants.py +++ b/src/pds/aipgen/constants.py @@ -52,3 +52,13 @@ # Filename extension to use with PDS labels PDS_LABEL_FILENAME_EXTENSION = '.xml' + +# Command-line names for hash algorithms mapped to Python *implementation* +# name which are standardized (as lower case, no less) in the ``hashlib``. +# There are a lot more possible message digest algorithms, but we choose +# to support just three. 
+HASH_ALGORITHMS = { + 'MD5': 'md5', + 'SHA-1': 'sha1', + 'SHA-256': 'sha256', +} diff --git a/src/pds/aipgen/main.py b/src/pds/aipgen/main.py new file mode 100644 index 0000000..9c28e39 --- /dev/null +++ b/src/pds/aipgen/main.py @@ -0,0 +1,97 @@ +# encoding: utf-8 +# +# Copyright © 2020 California Institute of Technology ("Caltech"). +# ALL RIGHTS RESERVED. U.S. Government sponsorship acknowledged. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# • Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# • Redistributions must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other +# materials provided with the distribution. +# • Neither the name of Caltech nor its operating division, the Jet Propulsion +# Laboratory, nor the names of its contributors may be used to endorse or +# promote products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ + +u'''AIP and SIP generation''' + +from .aip import process as aipProcess +from .constants import HASH_ALGORITHMS +from .sip import addSIParguments +from .sip import produce as sipProcess +from .utils import addLoggingArguments +import argparse, sys, logging + + +# Constants +# --------- + +# For ``--help``: +_version = '0.0.0' +_description = ''' +Generate an Archive Information Package (AIP) and a Submission Information +Package (SIP). This creates three files for the AIP in the current directory +(overwriting them if they already exist): ➀ a "checksum manifest" which +contains MD5 hashes of *all* files in a product; ➁ a "transfer manifest" which +lists the "lidvids" for files within each XML label mentioned in a product; +and ➂ an XML label for these two files. It also creates two files for the SIP +(also overwriting them if they exist): ➀ A "SIP manifest" file; and an XML +label of that file too. The names of the generated files are based on the +logical identifier found in the bundle file, and any existing files are +overwritten. The names of the generated files are printed upon successful +completion. 
+''' + +# Logging: +_logger = logging.getLogger(__name__) + + +# Functions +# --------- + +def main(): + '''Make an AIP and a SIP''' + parser = argparse.ArgumentParser(description=_description) + parser.add_argument('--version', action='version', version=f'%(prog)s {_version}') + addSIParguments(parser) + addLoggingArguments(parser) + parser.add_argument( + 'bundle', type=argparse.FileType('rb'), metavar='IN-BUNDLE.XML', help='Bundle XML file to read' + ) + args = parser.parse_args() + logging.basicConfig(level=args.loglevel, format='%(levelname)s %(message)s') + _logger.debug('⚙️ command line args = %r', args) + chksumFN = aipProcess(args.bundle) + with open(chksumFN, 'rb') as chksumStream: + sipProcess( + args.bundle, + HASH_ALGORITHMS[args.algorithm], + args.url, + args.insecure, + args.site, + args.offline, + args.bundle_base_url, + chksumStream + ) + _logger.info("👋 That's it! Thanks for making an AIP and SIP with us today. Bye!") + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/src/pds/aipgen/sip.py b/src/pds/aipgen/sip.py index 904417b..c696e7e 100644 --- a/src/pds/aipgen/sip.py +++ b/src/pds/aipgen/sip.py @@ -33,13 +33,15 @@ from .constants import ( INFORMATION_MODEL_VERSION, PDS_NS_URI, XML_SCHEMA_INSTANCE_NS_URI, XML_MODEL_PI, - PDS_SCHEMA_URL, AIP_PRODUCT_URI_PREFIX, PDS_LABEL_FILENAME_EXTENSION + PDS_SCHEMA_URL, AIP_PRODUCT_URI_PREFIX, PDS_LABEL_FILENAME_EXTENSION, HASH_ALGORITHMS +) +from .utils import ( + getPrimariesAndOtherInfo, getMD5, getLogicalIdentifierAndFileInventory, parseXML, getDigest, addLoggingArguments ) -from .utils import getPrimariesAndOtherInfo, getMD5, getLogicalIdentifierAndFileInventory from datetime import datetime from lxml import etree from urllib.parse import urlparse -import argparse, logging, hashlib, pysolr, urllib.request, os.path, re, sys +import argparse, logging, hashlib, pysolr, urllib.request, os.path, re, sys, sqlite3, tempfile # Defaults & Constants @@ -84,20 +86,10 @@ # Internal reference 
boilerplate _intRefBoilerplate = 'Links this SIP to the specific version of the bundle product in the PDS registry system' -# Command-line names for various hash algorithms, mapped to Python implementation name—OK, -# not really! The Python implementation names are lowercase, but thankfully ``hashlib`` doesn't -# care. (Still not sure why these were up-cased.) -_algorithms = { - 'MD5': 'MD5', - 'SHA-1': 'SHA1', - 'SHA-256': 'SHA256', -} - # Logging # ------- _logger = logging.getLogger(__name__) -logging.basicConfig(format='%(levelname)s %(message)s', level=logging.WARNING) # Functions @@ -142,42 +134,31 @@ def _getPLines(tabFilePath): # All values in collection manifest table must be a lidvid, if not throw an error if '::' not in match.group(1): msg = ('Invalid collection manifest. All records must contain ' - 'lidvids but found "' + match.group(1)) + 'lidvids but found "' + match.group(1) + '"') raise Exception(msg) - lidvids.add(match.group(1)) - return lidvids -def _findLidVidsInXMLFiles(lidvid, xmlFiles): - '''Look in each of the ``xmlFiles`` for an XPath from root to ``Identification_Area`` to - ``logical_identifier`` and ``version_id`` and see if they form a matching ``lidvid``. - If it does, add that XML file to a set as a ``file:`` style URL and return the set. +def _findLidVidsInXMLFiles(lidvid, con): + '''Query the database ``con``nection for the ``lidvid`` to see if there's a match + based on an earlier population in the DB's ``lidvids`` table. If so, add those + XML files as a ``file:`` style URL to a set of matching files. 
''' matchingFiles = set() - for xmlFile in xmlFiles: - _logger.debug('Parsing XML in %s', xmlFile) - tree = etree.parse(xmlFile) - root = tree.getroot() - matches = root.findall(f'./{{{PDS_NS_URI}}}Identification_Area/{{{PDS_NS_URI}}}logical_identifier') - if not matches: continue - lid = matches[0].text.strip() - - matches = root.findall(f'./{{{PDS_NS_URI}}}Identification_Area/{{{PDS_NS_URI}}}version_id') - if not matches: continue - vid = matches[0].text.strip() - - if lidvid.strip() == lid + '::' + vid: - matchingFiles.add('file:' + xmlFile) - matchingFiles |= _getAssociatedProducts(tree, os.path.dirname(xmlFile)) - break - + cursor = con.cursor() + cursor.execute('''SELECT xmlFile FROM lidvids WHERE lidvid = ?''', (lidvid,)) + for xmlFile in cursor.fetchall(): + xmlFile = xmlFile[0] + matchingFiles.add('file:' + xmlFile) + tree = parseXML(xmlFile) + matchingFiles |= _getAssociatedProducts(tree, os.path.dirname(xmlFile)) return matchingFiles def _getAssociatedProducts(root, filepath): - '''Parse the XML for all the files associated with it that make up the product''' + '''Parse the XML at ``root`` for all the files associated with it that make up the product, + preprending ``filepath`` to each match.''' products = set() matches = root.findall(f'//{{{PDS_NS_URI}}}File/{{{PDS_NS_URI}}}file_name') matches.extend(root.findall(f'//{{{PDS_NS_URI}}}Document_File/{{{PDS_NS_URI}}}file_name')) @@ -188,11 +169,12 @@ def _getAssociatedProducts(root, filepath): return products -def _getLocalFileInfo(bundle, primaries, bundleLidvid): +def _getLocalFileInfo(bundle, primaries, bundleLidvid, con): '''Search all XML files (except for the ``bundle`` file) in the same directory as ``bundle`` and look for all XPath ``Product_Collection/Identification_Area/logical_identifier`` values that match any of the primary names in ``primaries``. If we get a match, note all XPath - ``File_Area_Inventory/File/file_name`` entries for later inclusion. 
+ ``File_Area_Inventory/File/file_name`` entries for later inclusion. We use the database + at ``con`` to make a working table for noting this info as we go along. What does later inclusion mean? We open each of those files (usually ending in ``.tab`` or ``.TAB``) and look for any lines with ``P,\\s*`` at the front: the next token is a magical @@ -208,33 +190,49 @@ def _getLocalFileInfo(bundle, primaries, bundleLidvid): # Match up lidsvids with xmlFiles lidvidsToFiles = {} + # Set up a database table to map many-to-many lidvids to xml files + with con: + cursor = con.cursor() + cursor.execute('''CREATE TABLE IF NOT EXISTS lidvids ( + lidvid text NOT NULL, + xmlFile text NOT NULL + )''') + cursor.execute('''CREATE INDEX IF NOT EXISTS lidvidIndex ON lidvids (lidvid)''') + # Add bundle to manifest lidvidsToFiles[bundleLidvid] = {'file:' + bundle} + # OK, here we go root = os.path.dirname(bundle) xmlFiles = set() + # Locate all the XML files for dirpath, dirnames, filenames in os.walk(root): - xmlFiles |= set([os.path.join(dirpath, i) for i in filenames if i.endswith(PDS_LABEL_FILENAME_EXTENSION) or i.endswith(PDS_LABEL_FILENAME_EXTENSION.upper())]) - - for xmlFile in xmlFiles: - # Need to check for lid or lidvid depending on what is specified in the bundle - lid, lidvid, tabs = getLogicalIdentifierAndFileInventory(xmlFile) - if not lid or not tabs: continue - if lid in primaries or lidvid in primaries: - # This will probably always be the case working with an offline directory tree - lidvidsToFiles[lidvid] = {'file:' + xmlFile} - for tab in tabs: - lidvids |= _getPLines(tab) - lidvidsToFiles[lidvid].add('file:' + tab) - + xmlFiles |= set([os.path.join(dirpath, i) for i in filenames if i.lower().endswith(PDS_LABEL_FILENAME_EXTENSION.lower())]) + + # Get the lidvids and inventory of files mentioned in each xml file + with con: + for xmlFile in xmlFiles: + # Need to check for lid or lidvid depending on what is specified in the bundle + lid, lidvid, tabs = 
getLogicalIdentifierAndFileInventory(xmlFile) + if not lid or not tabs: continue + if lid in primaries or lidvid in primaries: + # This will probably always be the case working with an offline directory tree + lidvidsToFiles[lidvid] = {'file:' + xmlFile} + for tab in tabs: + lidvids |= _getPLines(tab) + lidvidsToFiles[lidvid].add('file:' + tab) + for lidvid in lidvids: + con.execute('INSERT INTO lidvids (lidvid, xmlFile) VALUES (?,?)', (lidvid, xmlFile)) + + # Now go through each lidvid mentioned by the PLines in each inventory tab and find their xml files for lidvid in lidvids: - # Look in all XML files for the lidvids - products = _findLidVidsInXMLFiles(lidvid, xmlFiles) + products = _findLidVidsInXMLFiles(lidvid, con) matching = lidvidsToFiles.get(lidvid, set()) matching |= products lidvidsToFiles[lidvid] = matching + # That should do it return lidvidsToFiles @@ -245,19 +243,13 @@ def _getDigests(lidvidsToFiles, hashName): ''' withDigests = [] # TODO: potential optimization is that if the same file appears for multiple lidvids we retrieve it - # multiple times; we could cache previous hash computations + # multiple times; we could cache previous hash computations. + # BANDAID: ``getDigests`` has an LRU cache. 
for lidvid, files in lidvidsToFiles.items(): for url in files: try: - hashish = hashlib.new(hashName) - _logger.debug('Getting «%s» for hashing with %s', url, hashName) - with urllib.request.urlopen(url) as i: - while True: - buf = i.read(_bufsiz) - if len(buf) == 0: break - hashish.update(buf) - # FIXME: some hash algorithms take variable length digests; we should filter those out - withDigests.append((url, hashish.hexdigest(), lidvid)) + d = getDigest(url, hashName) + withDigests.append((url, d, lidvid)) except urllib.error.URLError as error: _logger.info('Problem retrieving «%s» for digest: %r; ignoring', url, error) return withDigests @@ -278,7 +270,6 @@ def _writeTable(hashedFiles, hashName, manifest, offline, baseURL, basePathToRep if offline: if baseURL.endswith('/'): baseURL = baseURL[:-1] - url = baseURL + urlparse(url).path.replace(basePathToReplace, '') entry = f'{digest}\t{hashName}\t{url}\t{lidvid.strip()}\r\n'.encode('utf-8') @@ -401,39 +392,43 @@ def _writeLabel(logicalID, versionID, title, digest, size, numEntries, hashName, tree.write(labelOutputFile, encoding='utf-8', xml_declaration=True, pretty_print=True) -def _produce(bundle, hashName, registryServiceURL, insecureConnectionFlag, site, offline, baseURL, aipFile): +def produce(bundle, hashName, registryServiceURL, insecureConnectionFlag, site, offline, baseURL, aipFile): '''Produce a SIP from the given bundle''' - # Get the bundle path - bundle = os.path.abspath(bundle.name) - - # Get the bundle's primary collections and other useful info - primaries, bundleLID, title, bundleVID = getPrimariesAndOtherInfo(bundle) - strippedLogicalID = bundleLID.split(':')[-1] - - filename = strippedLogicalID + '_sip_v' + bundleVID - manifestFileName, labelFileName = filename + '.tab', filename + PDS_LABEL_FILENAME_EXTENSION - if offline: - lidvidsToFiles = _getLocalFileInfo(bundle, primaries, bundleLID + '::' + bundleVID) - else: - _logger.warning('The remote functionality with registry in the loop is still in 
development.') - lidvidsToFiles = _getFileInfo(primaries, registryServiceURL, insecureConnectionFlag) - - hashedFiles = _getDigests(lidvidsToFiles, hashName) - with open(manifestFileName, 'wb') as manifest: - md5, size = _writeTable(hashedFiles, hashName, manifest, offline, baseURL, os.path.dirname(os.path.dirname(bundle))) - with open(labelFileName, 'wb') as label: - _writeLabel(bundleLID, bundleVID, title, md5, size, len(hashedFiles), hashName, manifestFileName, site, label, aipFile) - return manifestFileName, labelFileName - - -def main(): - '''Check the command-line for options and create a SIP from the given bundle XML''' - parser = argparse.ArgumentParser(description=_description) - parser.add_argument( - 'bundle', type=argparse.FileType('rb'), metavar='IN-BUNDLE.XML', help='Bundle XML file to read' - ) + # Make a temp file to use as a database; TODO: could pass ``delete=False`` in + # the future for sharing this DB amongst many processes for some fancy multiprocessing + _logger.info('👟 Submission Information Package (SIP) Generator, version %s', _version) + with tempfile.NamedTemporaryFile() as dbfile: + con = sqlite3.connect(dbfile.name) + _logger.debug('→ Database file (deleted) is %sf', dbfile.name) + + # Get the bundle path + bundle = os.path.abspath(bundle.name) + + # Get the bundle's primary collections and other useful info + primaries, bundleLID, title, bundleVID = getPrimariesAndOtherInfo(bundle) + strippedLogicalID = bundleLID.split(':')[-1] + filename = strippedLogicalID + '_sip_v' + bundleVID + manifestFileName, labelFileName = filename + '.tab', filename + PDS_LABEL_FILENAME_EXTENSION + if offline: + lidvidsToFiles = _getLocalFileInfo(bundle, primaries, bundleLID + '::' + bundleVID, con) + else: + _logger.warning('⚠️ The remote functionality with registry in the loop is still in development.') + lidvidsToFiles = _getFileInfo(primaries, registryServiceURL, insecureConnectionFlag) + + hashedFiles = _getDigests(lidvidsToFiles, hashName) + with 
open(manifestFileName, 'wb') as manifest: + md5, size = _writeTable(hashedFiles, hashName, manifest, offline, baseURL, os.path.dirname(os.path.dirname(bundle))) + with open(labelFileName, 'wb') as label: + _writeLabel(bundleLID, bundleVID, title, md5, size, len(hashedFiles), hashName, manifestFileName, site, label, aipFile) + _logger.info('🎉 Success! From %s, generated these output files:', bundle) + _logger.info('• SIP Manifest: %s', manifestFileName) + _logger.info('• XML label for the SIP: %s', labelFileName) + return manifestFileName, labelFileName + + +def addSIParguments(parser): parser.add_argument( - '-a', '--algorithm', default='MD5', choices=sorted(_algorithms.keys()), + '-a', '--algorithm', default='MD5', choices=sorted(HASH_ALGORITHMS.keys()), help='File hash (checksum) algorithm; default %(default)s' ) parser.add_argument( @@ -453,30 +448,36 @@ def main(): '-k', '--insecure', default=False, action='store_true', help='Ignore SSL/TLS security issues; default %(default)s' ) - parser.add_argument( - '-c', '--aip', type=argparse.FileType('rb'), metavar='AIP-CHECKSUM-MANIFEST.TAB', - help='Archive Information Product checksum manifest file' - ) parser.add_argument( '-b', '--bundle-base-url', required=False, default='file:/', help='Base URL prepended to URLs in the generated manifest for local files in "offline" mode' ) - parser.add_argument( - '-v', '--verbose', default=False, action='store_true', - help='Verbose logging; defaults %(default)s' - ) # TODO: ``pds4_information_model_version`` is parsed into the arg namespace but is otherwise ignored parser.add_argument( '-i', '--pds4-information-model-version', default=INFORMATION_MODEL_VERSION, help='Specify PDS4 Information Model version to generate SIP. 
Must be 1.13.0.0+; default %(default)s' ) + + +def main(): + '''Check the command-line for options and create a SIP from the given bundle XML''' + parser = argparse.ArgumentParser(description=_description) + parser.add_argument('--version', action='version', version=f'%(prog)s {_version}') + addSIParguments(parser) + addLoggingArguments(parser) + parser.add_argument( + 'bundle', type=argparse.FileType('rb'), metavar='IN-BUNDLE.XML', help='Bundle XML file to read' + ) + parser.add_argument( + '-c', '--aip', type=argparse.FileType('rb'), metavar='AIP-CHECKSUM-MANIFEST.TAB', + help='Archive Information Product checksum manifest file' + ) args = parser.parse_args() - if args.verbose: - _logger.setLevel(logging.DEBUG) - _logger.debug('command line args = %r', args) - manifest, label = _produce( + logging.basicConfig(level=args.loglevel, format='%(levelname)s %(message)s') + _logger.debug('⚙️ command line args = %r', args) + manifest, label = produce( args.bundle, - _algorithms[args.algorithm], + HASH_ALGORITHMS[args.algorithm], args.url, args.insecure, args.site, @@ -484,10 +485,7 @@ def main(): args.bundle_base_url, args.aip ) - print(f'⚙︎ ``sipgen`` — Submission Information Package (SIP) Generator, version {_version}', file=sys.stderr) - print(f'🎉 Success! From {args.bundle.name}, generated these output files:', file=sys.stderr) - print(f'• Manifest: {manifest}', file=sys.stderr) - print(f'• Label: {label}', file=sys.stderr) + _logger.info('INFO 👋 All done. Thanks for making a SIP. Bye!') sys.exit(0) diff --git a/src/pds/aipgen/utils.py b/src/pds/aipgen/utils.py index f1fe073..726d8fb 100644 --- a/src/pds/aipgen/utils.py +++ b/src/pds/aipgen/utils.py @@ -32,7 +32,7 @@ from .constants import PDS_NS_URI from lxml import etree -import logging, hashlib, os.path +import logging, hashlib, os.path, functools, urllib # Logging @@ -44,17 +44,39 @@ # ----------------- _bufsiz = 512 # Is there a better place to set this—or a better place to find it? 
+_xmlCacheSize = 2**16 +_digestCacheSize = 2**16 # Functions # --------- + +@functools.lru_cache(maxsize=_xmlCacheSize) +def parseXML(f): + '''Parse the XML in object ``f``''' + return etree.parse(f) + + +@functools.lru_cache(maxsize=_digestCacheSize) +def getDigest(url, hashName): + '''Compute a digest of the object at url and return it as a hex string''' + hashish = hashlib.new(hashName) + _logger.debug('Getting «%s» for hashing with %s', url, hashName) + with urllib.request.urlopen(url) as i: + while True: + buf = i.read(_bufsiz) + if len(buf) == 0: break + hashish.update(buf) + return hashish.hexdigest() # XXX We do not support hashes with varialbe-length digests + + def getPrimariesAndOtherInfo(bundle): '''Get the "primaries" from the given bundle XML plus the logical identifier, plus the title plus the version ID (this function does too much)''' - _logger.debug('Parsing XML in %r', bundle) + _logger.debug('Fetching primaries and other info by parsing XML in %r', bundle) primaries = set() - tree = etree.parse(bundle) + tree = parseXML(bundle) root = tree.getroot() members = root.findall(f'.//{{{PDS_NS_URI}}}Bundle_Member_Entry') for member in members: @@ -93,8 +115,8 @@ def getLogicalIdentifierAndFileInventory(xmlFile): in ``File`` in ``File_Area_Inventory`` entries. 
If there's no logical identifier, just return None, None, None ''' - _logger.debug('Parsing XML in %s', xmlFile) - tree = etree.parse(xmlFile) + _logger.debug('Getting logical IDs and file inventories by parsing XML in %s', xmlFile) + tree = parseXML(xmlFile) root = tree.getroot() matches = root.findall(f'./{{{PDS_NS_URI}}}Identification_Area/{{{PDS_NS_URI}}}logical_identifier') if not matches: return None, None, None @@ -109,3 +131,15 @@ def getLogicalIdentifierAndFileInventory(xmlFile): matches = root.findall(f'./{{{PDS_NS_URI}}}File_Area_Inventory/{{{PDS_NS_URI}}}File/{{{PDS_NS_URI}}}file_name') return lid, lidvid, [os.path.join(dirname, i.text.strip()) for i in matches] + + +def addLoggingArguments(parser): + '''Add command-line arguments to the given argument ``parser`` to support logging.''' + parser.add_argument( + '-d', '--debug', action='store_const', dest='loglevel', const=logging.DEBUG, default=logging.INFO, + help='Log debugging messages for developers' + ) + parser.add_argument( + '-q', '--quiet', action='store_const', dest='loglevel', const=logging.WARNING, + help="Don't log informational messages" + ) From 1df1564c2e3f6ecbf92a78a35a55e1fcd1b9d39c Mon Sep 17 00:00:00 2001 From: Jordan Padams Date: Sat, 11 Apr 2020 16:01:50 -0700 Subject: [PATCH 2/2] Improvements for usability and bug fixes for validate errors * After running validate, there were a few minor fixes that needed to be implemented. * Commented out / removed several CLI options for the time being until functionality is fully developed. * Updated file naming to take into account bundle versioning separate from the AIP/SIP version * Updated docs per new pds-deep-archive script which combines aipgen and sipgen. 
Refs #21 --- README.rst | 97 +------------------- docs/source/development/index.rst | 4 +- docs/source/usage/index.rst | 145 ++++++++++++++---------------- setup.py | 2 +- src/pds/aipgen/aip.py | 27 +++--- src/pds/aipgen/main.py | 42 ++++++--- src/pds/aipgen/sip.py | 83 ++++++++++------- 7 files changed, 168 insertions(+), 232 deletions(-) diff --git a/README.rst b/README.rst index 595098e..4cfdc8a 100644 --- a/README.rst +++ b/README.rst @@ -10,6 +10,8 @@ Archival Information System (OAIS_) standards. Features ======== +• Provides an executable Python script ``pds-deep-archive``. Run ``pds-deep-archive --help`` for + more details. • Provides an exectuble Python script ``aipgen``. Run ``aipgen --help`` for more details. • Provides an exectuble Python script ``sipgen``. Run ``sipgen --help`` for @@ -42,6 +44,7 @@ well as ``libxsl2`` 1.1.28 or later. 4. You should now be able to run the deep archive utilities:: + (pds-deep-archive) bash> pds-deep-archive --help (pds-deep-archive) bash> aipgen --help (pds-deep-archive) bash> sipgen --help @@ -63,102 +66,10 @@ To build the software for distribution: 3. A tar.gz should now be available in the ``dist/`` directory for distribution. -Usage -===== - -1. If not already activated, activate your virtualenv:: - - bash> $HOME/.virtualenvs/pds-deep-archive/bin/activate - (pds-deep-archive) bash> - -2. Then you can run aipgen. 
Here's a basic example using data in the test directory:: - - (pds-deep-archive) bash> aipgen test/data/ladee_test/ladee_mission_bundle/LADEE_Bundle_1101.xml - INFO 🏃‍♀️ Starting AIP generation for test/data/ladee_test/ladee_mission_bundle/LADEE_Bundle_1101.xml - INFO 🧾 Writing checksum manifest for /Users/kelly/Documents/Clients/JPL/PDS/Development/pds-deep-archive/test/data/ladee_test/ladee_mission_bundle to ladee_mission_bundle_checksum_manifest_v1.0.tab - INFO 🚢 Writing transfer manifest for /Users/kelly/Documents/Clients/JPL/PDS/Development/pds-deep-archive/test/data/ladee_test/ladee_mission_bundle to ladee_mission_bundle_transfer_manifest_v1.0.tab - INFO 🏷 Writing AIP label to ladee_mission_bundle_aip_v1.0.xml - INFO 🎉 Success! All done, files generated: - INFO • Checksum manifest: ladee_mission_bundle_checksum_manifest_v1.0.tab - INFO • Transfer manifest: ladee_mission_bundle_transfer_manifest_v1.0.tab - INFO • XML label: ladee_mission_bundle_aip_v1.0.xml - INFO 👋 Thanks for using this program! Bye! - -3. You can also run sipgen. Here is a basic usage example using data in the test directory:: - - (pds-deep-archive) bash> sipgen -c ladee_mission_bundle_checksum_manifest_v1.0.tab -s PDS_ATM -n -b https://atmos.nmsu.edu/PDS/data/PDS4/LADEE/ test/data/ladee_test/ladee_mission_bundle/LADEE_Bundle_1101.xml - ⚙︎ ``sipgen`` — Submission Information Package (SIP) Generator, version 0.0.0 - 🎉 Success! From test/data/ladee_test/ladee_mission_bundle/LADEE_Bundle_1101.xml, generated these output files: - • Manifest: ladee_mission_bundle_sip_v1.0.tab - • Label: ladee_mission_bundle_sip_v1.0.xml - - Note how the checksum manifest from ``aipgen`` was the input to ``-c`` in - ``sipgen``. - -Full usage from the ``--help`` flag to ``aipgen``:: - - usage: aipgen [-h] [-v] IN-BUNDLE.XML - - Generate an Archive Information Package or AIP. 
An AIP consists of three - files: ➀ a "checksum manifest" which contains MD5 hashes of *all* files in a - product; ➁ a "transfer manifest" which lists the "lidvids" for files within - each XML label mentioned in a product; and ➂ an XML label for these two files. - You can use the checksum manifest file ➀ as input to ``sipgen`` in order to - create a Submission Information Package. - - positional arguments: - IN-BUNDLE.XML Root bundle XML file to read - - optional arguments: - -h, --help show this help message and exit - -v, --verbose Verbose logging; defaults False - -And usage from the ``--help`` flag for ``sipgen``:: - - usage: sipgen [-h] [-a {MD5,SHA-1,SHA-256}] -s - {PDS_ATM,PDS_ENG,PDS_GEO,PDS_IMG,PDS_JPL,PDS_NAI,PDS_PPI,PDS_PSI,PDS_RNG,PDS_SBN} - [-u URL | -n] [-k] [-c AIP-CHECKSUM-MANIFEST.TAB] - [-b BUNDLE_BASE_URL] [-v] [-i PDS4_INFORMATION_MODEL_VERSION] - IN-BUNDLE.XML - - Generate Submission Information Packages (SIPs) from bundles. This program - takes a bundle XML file as input and produces two output files: ① A Submission - Information Package (SIP) manifest file; and ② A PDS XML label of that file. - The files are created in the current working directory when this program is - run. The names of the files are based on the logical identifier found in the - bundle file, and any existing files are overwritten. The names of the - generated files are printed upon successful completion. 
- - positional arguments: - IN-BUNDLE.XML Bundle XML file to read - - optional arguments: - -h, --help show this help message and exit - -a {MD5,SHA-1,SHA-256}, --algorithm {MD5,SHA-1,SHA-256} - File hash (checksum) algorithm; default MD5 - -s {PDS_ATM,PDS_ENG,PDS_GEO,PDS_IMG,PDS_JPL,PDS_NAI,PDS_PPI,PDS_PSI,PDS_RNG,PDS_SBN}, --site {PDS_ATM,PDS_ENG,PDS_GEO,PDS_IMG,PDS_JPL,PDS_NAI,PDS_PPI,PDS_PSI,PDS_RNG,PDS_SBN} - Provider site ID for the manifest's label; default - None - -u URL, --url URL URL to the registry service; default https://pds-dev- - el7.jpl.nasa.gov/services/registry/pds - -n, --offline Run offline, scanning bundle directory for matching - files instead of querying registry service - -k, --insecure Ignore SSL/TLS security issues; default False - -c AIP-CHECKSUM-MANIFEST.TAB, --aip AIP-CHECKSUM-MANIFEST.TAB - Archive Information Product checksum manifest file - -b BUNDLE_BASE_URL, --bundle-base-url BUNDLE_BASE_URL - Base URL prepended to URLs in the generated manifest - for local files in "offline" mode - -v, --verbose Verbose logging; defaults False - -i PDS4_INFORMATION_MODEL_VERSION, --pds4-information-model-version PDS4_INFORMATION_MODEL_VERSION - Specify PDS4 Information Model version to generate - SIP. Must be 1.13.0.0+; default 1.13.0.0 - - Documentation ============= -Additional documentation is available in the ``docs`` directory and also TBD. +Installation and Usage information can be found in the documentation online at https://nasa-pds-incubator.github.io/pds-deep-archive/ or the latest version is maintained under the ``docs`` directory. 
diff --git a/docs/source/development/index.rst b/docs/source/development/index.rst index c1a04fd..1c45688 100644 --- a/docs/source/development/index.rst +++ b/docs/source/development/index.rst @@ -9,8 +9,8 @@ build it out:: python3 bootstrap.py bin/buildout -At this point, you'll have the ``aipgen`` and ``sipgen`` programs ready to run -as ``bin/aipgen`` and ``bin/sipgen`` that's set up to use source Python code +At this point, you'll have the ``pds-deep-archive``, ``aipgen``, and ``sipgen`` programs ready to run +as ``bin/pds-deep-archive``, ``bin/aipgen``, and ``bin/sipgen``, all set up to use source Python code under ``src``. Changes you make to the code are reflected in ``bin/sipgen`` immediately. diff --git a/docs/source/usage/index.rst b/docs/source/usage/index.rst index 8cae93f..c10f92a 100644 --- a/docs/source/usage/index.rst +++ b/docs/source/usage/index.rst @@ -1,81 +1,77 @@ 🏃‍♀️ Usage =========== -This package provides two executables, ``aipgen`` that generats Archive -Information Packages; and ``sipgen``, that generates Submission Information -Package (SIP)—both from PDS bundles. - -Running ``aipgen --help`` or ``sipgen --help`` will give a summary of the +This package provides one primary executable, ``pds-deep-archive``, that generates both +an Archive Information Package (AIP) and a Submission Information Package (SIP). The +SIP is what is delivered by the PDS to the NASA Space Science Data Coordinated Archive (NSSDCA).
+For more information about the products produced, see the following references: +* OAIS Information - http://www.oais.info/ +* AIP Information - https://www.iasa-web.org/tc04/archival-information-package-aip +* SIP Information - https://www.iasa-web.org/tc04/submission-information-package-sip + +This package also comes with the two sub-components of ``pds-deep-archive`` that can be run +individually: +* ``aipgen`` that generates Archive Information Packages from a PDS4 bundle +* ``sipgen`` that generates Submission Information Packages from a PDS4 bundle + +Running ``pds-deep-archive --help`` will give a summary of the command-line invocation, its required arguments, and any options that refine the behavior. For example, to create an AIP from the LADEE 1101 bundle in -``test/data/ladee_test/ladee_mission_bundle/LADEE_Bundle_1101.xml`` run:: +``test/data/ladee_test/mission_bundle/LADEE_Bundle_1101.xml`` run:: - aipgen test/data/ladee_test/ladee_mission_bundle/LADEE_Bundle_1101.xml + aipgen test/data/ladee_test/mission_bundle/LADEE_Bundle_1101.xml The program will print:: - INFO 🏃‍♀️ Starting AIP generation for test/data/ladee_test/ladee_mission_bundle/LADEE_Bundle_1101.xml - INFO 🧾 Writing checksum manifest for /Users/kelly/Documents/Clients/JPL/PDS/Development/pds-deep-archive/test/data/ladee_test/ladee_mission_bundle to ladee_mission_bundle_checksum_manifest_v1.0.tab - INFO 🚢 Writing transfer manifest for /Users/kelly/Documents/Clients/JPL/PDS/Development/pds-deep-archive/test/data/ladee_test/ladee_mission_bundle to ladee_mission_bundle_transfer_manifest_v1.0.tab - INFO 🏷 Writing AIP label to ladee_mission_bundle_aip_v1.0.xml - INFO 🎉 Success! All done, files generated: - INFO • Checksum manifest: ladee_mission_bundle_checksum_manifest_v1.0.tab - INFO • Transfer manifest: ladee_mission_bundle_transfer_manifest_v1.0.tab - INFO • XML label: ladee_mission_bundle_aip_v1.0.xml - INFO 👋 Thanks for using this program! Bye!
- -This creates three output files in the current directory as part of the AIP: + INFO 👟 PDS Deep Archive, version 0.0.0 + INFO 🏃‍♀️ Starting AIP generation for test/data/ladee_test/mission_bundle/LADEE_Bundle_1101.xml -• ``ladee_mission_bundle_checksum_manifest_v1.0.tab``, the checksum manifest -• ``ladee_mission_bundle_transfer_manifest_v1.0.tab``, the transfer manifest -• ``ladee_mission_bundle_aip_v1.0.xml``, the label for these two files + INFO 🎉 Success! AIP done, files generated: + INFO • Checksum manifest: ladee_mission_bundle_v1.0_checksum_manifest_v1.0.tab + INFO • Transfer manifest: ladee_mission_bundle_v1.0_transfer_manifest_v1.0.tab + INFO • XML label for them both: ladee_mission_bundle_v1.0_aip_v1.0.xml -The checkum manifest may then be fed into ``sipgen`` to create the SIP:: + INFO 🏃‍♀️ Starting SIP generation for test/data/ladee_test/mission_bundle/LADEE_Bundle_1101.xml - sipgen --aip ladee_mission_bundle_checksum_manifest_v1.0.tab ladee_mission_bundle_checksum_manifest_v1.0.tab --s PDS_ATM --offline --bundle-base-url https://atmos.nmsu.edu/PDS/data/PDS4/LADEE/ test/data/ladee_test/ladee_mission_bundle/LADEE_Bundle_1101.xml + INFO 🎉 Success! From /Users/jpadams/Documents/proj/pds/pdsen/workspace/pds-deep-archive/test/data/ladee_test/mission_bundle/LADEE_Bundle_1101.xml, generated these output files: + INFO • SIP Manifest: ladee_mission_bundle_v1.0_sip_v1.0.tab + INFO • XML label for the SIP: ladee_mission_bundle_v1.0_sip_v1.0.xml -This program will print:: + INFO 👋 That's it! Thanks for making an AIP and SIP with us today. Bye! - ⚙︎ ``sipgen`` — Submission Information Package (SIP) Generator, version 0.0.0 - 🎉 Success! 
From test/data/ladee_test/ladee_mission_bundle/LADEE_Bundle_1101.xml, generated these output files: - • Manifest: ladee_mission_bundle_sip_v1.0.tab - • Label: ladee_mission_bundle_sip_v1.0.xml +This creates five output files in the current directory as part of the AIP and SIP generation: -And two new files will appear in the current directory: +• ``ladee_mission_bundle_v1.0_checksum_manifest_v1.0.tab``, the checksum manifest +• ``ladee_mission_bundle_v1.0_transfer_manifest_v1.0.tab``, the transfer manifest +• ``ladee_mission_bundle_v1.0_aip_v1.0.xml``, the label for these two files -• ``ladee_mission_bundle_sip_v1.0.tab``, the created SIP manifest as a +• ``ladee_mission_bundle_v1.0_sip_v1.0.tab``, the created SIP manifest as a tab-separated values file. -• ``ladee_mission_bundle_sip_v1.0.xml``, an PDS label for the SIP file. - -For reference, the full "usage" message from ``aipgen`` is:: - - usage: aipgen [-h] [-v] IN-BUNDLE.XML - - Generate an Archive Information Package or AIP. An AIP consists of three - files: ➀ a "checksum manifest" which contains MD5 hashes of *all* files in a - product; ➁ a "transfer manifest" which lists the "lidvids" for files within - each XML label mentioned in a product; and ➂ an XML label for these two files. - You can use the checksum manifest file ➀ as input to ``sipgen`` in order to - create a Submission Information Package. - - positional arguments: - IN-BUNDLE.XML Root bundle XML file to read - - optional arguments: - -h, --help show this help message and exit - -v, --verbose Verbose logging; defaults False - -For reference, the full "usage" message from ``sipgen`` follows:: - - usage: sipgen [-h] [-a {MD5,SHA-1,SHA-256}] -s - {PDS_ATM,PDS_ENG,PDS_GEO,PDS_IMG,PDS_JPL,PDS_NAI,PDS_PPI,PDS_PSI,PDS_RNG,PDS_SBN} - [-u URL | -n] [-k] [-c AIP-CHECKSUM-MANIFEST.TAB] - [-b BUNDLE_BASE_URL] [-v] [-i PDS4_INFORMATION_MODEL_VERSION] - IN-BUNDLE.XML +• ``ladee_mission_bundle_v1.0_sip_v1.0.xml``, a PDS label for the SIP file.
+ +For reference, the full "usage" message from ``pds-deep-archive`` is:: + + $ pds-deep-archive --help + usage: pds-deep-archive [-h] [--version] -s + {PDS_ATM,PDS_ENG,PDS_GEO,PDS_IMG,PDS_JPL,PDS_NAI,PDS_PPI,PDS_PSI,PDS_RNG,PDS_SBN} + [-n] -b BUNDLE_BASE_URL [-d] [-q] + IN-BUNDLE.XML + + Generate an Archive Information Package (AIP) and a Submission Information + Package (SIP). This creates three files for the AIP in the current directory + (overwriting them if they already exist): + ➀ a "checksum manifest" which contains MD5 hashes of *all* files in a product + ➁ a "transfer manifest" which lists the "lidvids" for files within each XML + label mentioned in a product + ➂ an XML label for these two files. + + It also creates two files for the SIP (also overwriting them if they exist): + ① A "SIP manifest" file; and an XML label of that file too. The names of + the generated files are based on the logical identifier found in the + bundle file, and any existing files are overwritten. The names of the + generated files are printed upon successful completion. + ② A PDS XML label of that file. - Generate Submission Information Packages (SIPs) from bundles. This program - takes a bundle XML file as input and produces two output files: ① A Submission - Information Package (SIP) manifest file; and ② A PDS XML label of that file. The files are created in the current working directory when this program is run. The names of the files are based on the logical identifier found in the bundle file, and any existing files are overwritten. 
The names of the @@ -86,22 +82,19 @@ For reference, the full "usage" message from ``sipgen`` follows:: optional arguments: -h, --help show this help message and exit - -a {MD5,SHA-1,SHA-256}, --algorithm {MD5,SHA-1,SHA-256} - File hash (checksum) algorithm; default MD5 + --version show program's version number and exit -s {PDS_ATM,PDS_ENG,PDS_GEO,PDS_IMG,PDS_JPL,PDS_NAI,PDS_PPI,PDS_PSI,PDS_RNG,PDS_SBN}, --site {PDS_ATM,PDS_ENG,PDS_GEO,PDS_IMG,PDS_JPL,PDS_NAI,PDS_PPI,PDS_PSI,PDS_RNG,PDS_SBN} - Provider site ID for the manifest's label; default - None - -u URL, --url URL URL to the registry service; default https://pds-dev- - el7.jpl.nasa.gov/services/registry/pds + Provider site ID for the manifest's label -n, --offline Run offline, scanning bundle directory for matching - files instead of querying registry service - -k, --insecure Ignore SSL/TLS security issues; default False - -c AIP-CHECKSUM-MANIFEST.TAB, --aip AIP-CHECKSUM-MANIFEST.TAB - Archive Information Product checksum manifest file + files instead of querying registry service. NOTE: By + default, set to True until online mode is available. -b BUNDLE_BASE_URL, --bundle-base-url BUNDLE_BASE_URL - Base URL prepended to URLs in the generated manifest - for local files in "offline" mode - -v, --verbose Verbose logging; defaults False - -i PDS4_INFORMATION_MODEL_VERSION, --pds4-information-model-version PDS4_INFORMATION_MODEL_VERSION - Specify PDS4 Information Model version to generate - SIP. Must be 1.13.0.0+; default 1.13.0.0 + Base URL for Node data archive. This URL will be + prepended to the bundle directory to form URLs to the + products. For example, if we are generating a SIP for + mission_bundle/LADEE_Bundle_1101.xml, and bundle-base- + url is https://atmos.nmsu.edu/PDS/data/PDS4/LADEE/, + the URL in the SIP will be https://atmos.nmsu.edu/PDS/ + data/PDS4/LADEE/mission_bundle/LADEE_Bundle_1101.xml. 
+ -d, --debug Log debugging messages for developers + -q, --quiet Don't log informational messages \ No newline at end of file diff --git a/setup.py b/setup.py index acdf95f..45c8ea5 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,7 @@ 'console_scripts': [ 'sipgen=pds.aipgen.sip:main', 'aipgen=pds.aipgen.aip:main', - 'aipsip=pds.aipgen.main:main' + 'pds-deep-archive=pds.aipgen.main:main' ] }, namespace_packages=['pds'], diff --git a/src/pds/aipgen/aip.py b/src/pds/aipgen/aip.py index e18c0d1..d49f6b6 100644 --- a/src/pds/aipgen/aip.py +++ b/src/pds/aipgen/aip.py @@ -67,7 +67,7 @@ def _writeChecksumManifest(checksumManifestFN, dn): Return the hex MD5 digest, byte size of the file we created, and the number of records in the file. ''' - _logger.info('🧾 Writing checksum manifest for %s to %s', dn, checksumManifestFN) + _logger.debug('🧾 Writing checksum manifest for %s to %s', dn, checksumManifestFN) md5, size, count = hashlib.new('md5'), 0, 0 prefixLen = len(dn) with open(checksumManifestFN, 'wb') as o: @@ -77,7 +77,7 @@ def _writeChecksumManifest(checksumManifestFN, dn): with open(fileToHash, 'rb') as i: digest = getMD5(i) strippedFN = fileToHash[prefixLen + 1:] - entry = f'{digest}\t{strippedFN}\n'.encode('utf-8') + entry = f'{digest}\t{strippedFN}\r\n'.encode('utf-8') o.write(entry) md5.update(entry) size += len(entry) @@ -121,7 +121,7 @@ def _writeTransferManifest(transferManifestFN, dn): transfer manifest at the top level of the bundle file given and turn all ``/`` directory separators into backslashes. 
Return a triple of the MD5 digest, byte size, and number of entries in the transfer manifest we created.''' - _logger.info('🚢 Writing transfer manifest for %s to %s', dn, transferManifestFN) + _logger.debug('🚢 Writing transfer manifest for %s to %s', dn, transferManifestFN) md5, size, count = hashlib.new('md5'), 0, 0 lidvidsToFiles = {} for dirpath, dirnames, filenames in os.walk(dn): @@ -138,8 +138,9 @@ def _writeTransferManifest(transferManifestFN, dn): with open(transferManifestFN, 'wb') as o: for lidvid, filenames in lidvidsToFiles.items(): for fn in filenames: - transformedFN = '\\' + fn[prefixLen + 1:].replace('/', '\\') - entry = f'{lidvid:255} {transformedFN:255}\n'.encode('utf-8') + # transformedFN = '\\' + fn[prefixLen + 1:].replace('/', '\\') + transformedFN = '/' + fn[prefixLen + 1:] + entry = f'{lidvid:255}{transformedFN:255}\r\n'.encode('utf-8') o.write(entry) md5.update(entry) size += len(entry) @@ -177,7 +178,7 @@ def _writeLabel( • ``xferNum`` — count of records in the transfer manifest file ''' - _logger.info('🏷 Writing AIP label to %s', labelOutputFile) + _logger.debug('🏷 Writing AIP label to %s\n', labelOutputFile) ts = datetime.utcnow() ts = datetime(ts.year, ts.month, ts.day, ts.hour, ts.minute, ts.second, microsecond=0, tzinfo=None) @@ -263,6 +264,7 @@ def _writeLabel( tm.append(rc) etree.SubElement(rc, prefix + 'fields').text = '2' etree.SubElement(rc, prefix + 'groups').text = '0' + etree.SubElement(rc, prefix + 'record_length', unit='byte').text = '512' fc = etree.Element(prefix + 'Field_Character') rc.append(fc) etree.SubElement(fc, prefix + 'name').text = 'LIDVID' @@ -272,7 +274,7 @@ def _writeLabel( fc = etree.Element(prefix + 'Field_Character') rc.append(fc) etree.SubElement(fc, prefix + 'name').text = 'File Specification Name' - etree.SubElement(fc, prefix + 'field_location', unit='byte').text = '2' + etree.SubElement(fc, prefix + 'field_location', unit='byte').text = '256' etree.SubElement(fc, prefix + 'data_type').text = 
'ASCII_File_Specification_Name' etree.SubElement(fc, prefix + 'field_length', unit='byte').text = '255' @@ -293,12 +295,12 @@ def process(bundle): ``bundle``, which is an open file stream (with a ``name`` atribute) on the local filesystem. Return the name of the generated checksum manifest file. ''' - _logger.info('🏃‍♀️ Starting AIP generation for %s', bundle.name) + _logger.info('🏃‍♀️ Starting AIP generation for %s\n', bundle.name) d = os.path.dirname(os.path.abspath(bundle.name)) # Get the bundle's primary collections and other useful info primaries, bundleLID, title, bundleVID = getPrimariesAndOtherInfo(bundle) - strippedLogicalID = bundleLID.split(':')[-1] + strippedLogicalID = bundleLID.split(':')[-1] + '_v' + bundleVID # Easy one: the checksum† manifest # †It's actually an MD5 *hash*, not a checksum 😅 @@ -328,13 +330,14 @@ def process(bundle): _logger.info('🎉 Success! AIP done, files generated:') _logger.info('• Checksum manifest: %s', chksumFN) _logger.info('• Transfer manifest: %s', xferFN) - _logger.info('• XML label for them both: %s', labelFN) + _logger.info('• XML label for them both: %s\n', labelFN) return chksumFN def main(): '''Check the command-line for options and create an AIP from the given bundle XML''' - parser = argparse.ArgumentParser(description=_description) + parser = argparse.ArgumentParser(description=_description, + formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('--version', action='version', version=f'%(prog)s {_version}') addLoggingArguments(parser) parser.add_argument( @@ -344,7 +347,7 @@ def main(): logging.basicConfig(level=args.loglevel, format='%(levelname)s %(message)s') _logger.debug('⚙️ command line args = %r', args) process(args.bundle) - _logger.info('👋 Thanks for using this program! Bye!') + _logger.info('👋 Thanks for using this program! 
Bye!\n\n') sys.exit(0) diff --git a/src/pds/aipgen/main.py b/src/pds/aipgen/main.py index 9c28e39..34a9022 100644 --- a/src/pds/aipgen/main.py +++ b/src/pds/aipgen/main.py @@ -47,15 +47,23 @@ _description = ''' Generate an Archive Information Package (AIP) and a Submission Information Package (SIP). This creates three files for the AIP in the current directory -(overwriting them if they already exist): ➀ a "checksum manifest" which -contains MD5 hashes of *all* files in a product; ➁ a "transfer manifest" which -lists the "lidvids" for files within each XML label mentioned in a product; -and ➂ an XML label for these two files. It also creates two files for the SIP -(also overwriting them if they exist): ➀ A "SIP manifest" file; and an XML -label of that file too. The names of the generated files are based on the -logical identifier found in the bundle file, and any existing files are -overwritten. The names of the generated files are printed upon successful -completion. +(overwriting them if they already exist): +➀ a "checksum manifest" which contains MD5 hashes of *all* files in a product +➁ a "transfer manifest" which lists the "lidvids" for files within each XML + label mentioned in a product +➂ an XML label for these two files. + +It also creates two files for the SIP (also overwriting them if they exist): +① A "SIP manifest" file; and an XML label of that file too. The names of + the generated files are based on the logical identifier found in the + bundle file, and any existing files are overwritten. The names of the + generated files are printed upon successful completion. +② A PDS XML label of that file. + +The files are created in the current working directory when this program is +run. The names of the files are based on the logical identifier found in the +bundle file, and any existing files are overwritten. The names of the +generated files are printed upon successful completion. 
''' # Logging: @@ -67,7 +75,8 @@ def main(): '''Make an AIP and a SIP''' - parser = argparse.ArgumentParser(description=_description) + parser = argparse.ArgumentParser(description=_description, + formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('--version', action='version', version=f'%(prog)s {_version}') addSIParguments(parser) addLoggingArguments(parser) @@ -76,20 +85,25 @@ def main(): ) args = parser.parse_args() logging.basicConfig(level=args.loglevel, format='%(levelname)s %(message)s') + _logger.info('👟 PDS Deep Archive, version %s', _version) _logger.debug('⚙️ command line args = %r', args) chksumFN = aipProcess(args.bundle) with open(chksumFN, 'rb') as chksumStream: sipProcess( args.bundle, - HASH_ALGORITHMS[args.algorithm], - args.url, - args.insecure, + # TODO: Temporarily hardcoding these values until other modes are available + # HASH_ALGORITHMS[args.algorithm], + # args.url, + # args.insecure, + HASH_ALGORITHMS['MD5'], + '', + '', args.site, args.offline, args.bundle_base_url, chksumStream ) - _logger.info("👋 That's it! Thanks for making an AIP and SIP with us today. Bye!") + _logger.info("👋 That's it! Thanks for making an AIP and SIP with us today. Bye!\n\n") sys.exit(0) diff --git a/src/pds/aipgen/sip.py b/src/pds/aipgen/sip.py index 97a58c8..2411d9f 100644 --- a/src/pds/aipgen/sip.py +++ b/src/pds/aipgen/sip.py @@ -288,7 +288,6 @@ def _writeLabel(logicalID, versionID, title, digest, size, numEntries, hashName, to write an MD5 hash into this label. If it's not given, we default to writing an MD5 of zeros. 
''' - # Get the current time, but drop the microsecond resolution ts = datetime.utcnow() ts = datetime(ts.year, ts.month, ts.day, ts.hour, ts.minute, ts.second, microsecond=0, tzinfo=None) @@ -309,7 +308,7 @@ def _writeLabel(logicalID, versionID, title, digest, size, numEntries, hashName, identificationArea = etree.Element(prefix + 'Identification_Area') root.append(identificationArea) - logicalIdentifier = _sipDeepURIPrefix + logicalID.split(':')[-1] + ':' + versionID + logicalIdentifier = _sipDeepURIPrefix + logicalID.split(':')[-1] + '_v' + versionID etree.SubElement(identificationArea, prefix + 'logical_identifier').text = logicalIdentifier etree.SubElement(identificationArea, prefix + 'version_id').text = '1.0' etree.SubElement(identificationArea, prefix + 'title').text = 'Submission Information Package for the ' + title @@ -330,7 +329,7 @@ def _writeLabel(logicalID, versionID, title, digest, size, numEntries, hashName, etree.SubElement(deep, prefix + 'manifest_checksum').text = digest etree.SubElement(deep, prefix + 'checksum_type').text = 'MD5' etree.SubElement(deep, prefix + 'manifest_url').text = 'file:' + os.path.abspath(manifestFile) - etree.SubElement(deep, prefix + 'aip_lidvid').text = AIP_PRODUCT_URI_PREFIX + logicalID.split(':')[-1] + etree.SubElement(deep, prefix + 'aip_lidvid').text = AIP_PRODUCT_URI_PREFIX + logicalID.split(':')[-1]+ '_v' + versionID + '::1.0' aipMD5 = getMD5(aipFile) if aipFile else '00000000000000000000000000000000' etree.SubElement(deep, prefix + 'aip_label_checksum').text = aipMD5 @@ -396,18 +395,20 @@ def produce(bundle, hashName, registryServiceURL, insecureConnectionFlag, site, '''Produce a SIP from the given bundle''' # Make a temp file to use as a database; TODO: could pass ``delete=False`` in # the future for sharing this DB amongst many processes for some fancy multiprocessing - _logger.info('👟 Submission Information Package (SIP) Generator, version %s', _version) with tempfile.NamedTemporaryFile() as dbfile: con = 
sqlite3.connect(dbfile.name) _logger.debug('→ Database file (deleted) is %sf', dbfile.name) + _logger.info('🏃‍♀️ Starting SIP generation for %s\n', bundle.name) + # Get the bundle path bundle = os.path.abspath(bundle.name) # Get the bundle's primary collections and other useful info primaries, bundleLID, title, bundleVID = getPrimariesAndOtherInfo(bundle) - strippedLogicalID = bundleLID.split(':')[-1] - filename = strippedLogicalID + '_sip_v' + bundleVID + # strippedLogicalID = bundleLID.split(':')[-1] + strippedLogicalID = bundleLID.split(':')[-1] + '_v' + bundleVID + filename = strippedLogicalID + '_sip_v1.0' manifestFileName, labelFileName = filename + '.tab', filename + PDS_LABEL_FILENAME_EXTENSION if offline: lidvidsToFiles = _getLocalFileInfo(bundle, primaries, bundleLID + '::' + bundleVID, con) @@ -422,46 +423,56 @@ def produce(bundle, hashName, registryServiceURL, insecureConnectionFlag, site, _writeLabel(bundleLID, bundleVID, title, md5, size, len(hashedFiles), hashName, manifestFileName, site, label, aipFile) _logger.info('🎉 Success! 
From %s, generated these output files:', bundle) _logger.info('• SIP Manifest: %s', manifestFileName) - _logger.info('• XML label for the SIP: %s', labelFileName) + _logger.info('• XML label for the SIP: %s\n', labelFileName) return manifestFileName, labelFileName def addSIParguments(parser): - parser.add_argument( - '-a', '--algorithm', default='MD5', choices=sorted(HASH_ALGORITHMS.keys()), - help='File hash (checksum) algorithm; default %(default)s' - ) + # TODO: Temporarily commenting out this argument until an input manifest is available + # parser.add_argument( + # '-a', '--algorithm', default='MD5', choices=sorted(HASH_ALGORITHMS.keys()), + # help='File hash (checksum) algorithm; default %(default)s' + # ) parser.add_argument( '-s', '--site', required=True, choices=_providerSiteIDs, - help="Provider site ID for the manifest's label; default %(default)s" + help="Provider site ID for the manifest's label" ) group = parser.add_mutually_exclusive_group(required=False) + + # TODO: Temporarily setting offline to True by default until online mode is available + # group.add_argument( + # '-u', '--url', default=_registryServiceURL, + # help='URL to the registry service; default %(default)s' + # ) + + # TODO: Temporarily setting offline to True by default until online mode is available group.add_argument( - '-u', '--url', default=_registryServiceURL, - help='URL to the registry service; default %(default)s' - ) - group.add_argument( - '-n', '--offline', default=False, action='store_true', - help='Run offline, scanning bundle directory for matching files instead of querying registry service' - ) - parser.add_argument( - '-k', '--insecure', default=False, action='store_true', - help='Ignore SSL/TLS security issues; default %(default)s' + '-n', '--offline', default=True, action='store_true', + help='Run offline, scanning bundle directory for matching files instead of querying registry service.'+ + ' NOTE: By default, set to True until online mode is available.' 
) + + # TODO: Temporarily commenting out until online mode is available + # parser.add_argument( + # '-k', '--insecure', default=False, action='store_true', + # help='Ignore SSL/TLS security issues; default %(default)s' + # ) + + # TODO: Temporarily setting to be required by default until online mode is available parser.add_argument( - '-b', '--bundle-base-url', required=False, default='file:/', - help='Base URL prepended to URLs in the generated manifest for local files in "offline" mode' - ) - # TODO: ``pds4_information_model_version`` is parsed into the arg namespace but is otherwise ignored - parser.add_argument( - '-i', '--pds4-information-model-version', default=INFORMATION_MODEL_VERSION, - help='Specify PDS4 Information Model version to generate SIP. Must be 1.13.0.0+; default %(default)s' + '-b', '--bundle-base-url', required=True, + help='Base URL for Node data archive. This URL will be prepended to' + + ' the bundle directory to form URLs to the products. For example,' + ' if we are generating a SIP for mission_bundle/LADEE_Bundle_1101.xml,' + ' and bundle-base-url is https://atmos.nmsu.edu/PDS/data/PDS4/LADEE/,' + ' the URL in the SIP will be https://atmos.nmsu.edu/PDS/data/PDS4/LADEE/mission_bundle/LADEE_Bundle_1101.xml.' 
) def main(): '''Check the command-line for options and create a SIP from the given bundle XML''' - parser = argparse.ArgumentParser(description=_description) + parser = argparse.ArgumentParser(description=_description, + formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('--version', action='version', version=f'%(prog)s {_version}') addSIParguments(parser) addLoggingArguments(parser) @@ -479,15 +490,19 @@ def main(): parser.error('--bundle-base-url is required when in offline mode (--offline).') manifest, label = _produce( args.bundle, - HASH_ALGORITHMS[args.algorithm], - args.url, - args.insecure, + # TODO: Temporarily hardcoding these values until other modes are available + # HASH_ALGORITHMS[args.algorithm], + # args.url, + # args.insecure, + HASH_ALGORITHMS['MD5'], + '', + '', args.site, args.offline, args.bundle_base_url, args.aip ) - _logger.info('INFO 👋 All done. Thanks for making a SIP. Bye!') + _logger.info('INFO 👋 All done. Thanks for making a SIP. Bye!\n\n') sys.exit(0)