Create new pds-deep-archive program and improve performance #26

Merged (3 commits) · Apr 11, 2020 · Changes from 1 commit
Resolutions for #13 and #21
- Resolves #21 with a new driver program `aipsip` that generates the AIP and then uses it to make the SIP, leaving everything in the current working directory (along with two, count 'em, *two*, PDS labels for the price of one!).
    - Updates the Python `setuptools` metadata to register the new `aipsip` console script (helps with #21).
    - Refactors logging and command-line argument setup (also for #21).
- Unifies logging across `aipgen`, `sipgen`, and the new `aipsip` so that all three offer `--debug` and `--quiet` options; without either, you get a nominal amount of "hand-holding" output.
- Resolves #13 so that, instead of billions of redundant XML parses and XPath lookups, we use a local `sqlite3` database and LRU caching (see the sketch after this list).
    - Factors out XML parsing from `aipgen` and `sipgen` so we can apply caching.
    - Clarifies logging messages so we can tell what's calling what.
    - Creates a temporary DB in `sipgen` and populates it with mappings from lidvids to XML files for rapid lookups.
        - But see also #25 for other uses of that DB.
- Adds standardized `--version` arguments to all three programs.
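
In case it helps to see the two #13 tricks together, here is a minimal, hypothetical sketch of the approach. The real helpers are `parseXML` in `utils.py` and the `lidvids` table built in `sip.py` (both in the diff below); the names `parse_label` and `build_lidvid_table` here are made up for illustration.

```python
import functools, sqlite3
from lxml import etree

@functools.lru_cache(maxsize=2**16)
def parse_label(path):
    '''Parse a PDS label once; later calls with the same path hit the cache instead of re-parsing.'''
    return etree.parse(path)

def build_lidvid_table(con, lidvid_to_file):
    '''Record (lidvid, XML file) pairs so later lookups are a single indexed SELECT.'''
    con.execute('CREATE TABLE IF NOT EXISTS lidvids (lidvid text NOT NULL, xmlFile text NOT NULL)')
    con.execute('CREATE INDEX IF NOT EXISTS lidvidIndex ON lidvids (lidvid)')
    con.executemany('INSERT INTO lidvids (lidvid, xmlFile) VALUES (?, ?)', lidvid_to_file)
    con.commit()

# Usage sketch (the lidvid and path are made up):
con = sqlite3.connect(':memory:')
build_lidvid_table(con, [('urn:nasa:pds:demo:coll::1.0', '/data/demo/collection.xml')])
rows = con.execute('SELECT xmlFile FROM lidvids WHERE lidvid = ?',
                   ('urn:nasa:pds:demo:coll::1.0',)).fetchall()
```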

With these changes, `sipgen` on my Mac¹ processes a 272 GiB `insight_cameras` export in 1:03. On `pdsimg-int1`², it handles the 1.5 TiB `insight_cameras` dataset in under 4 hours.

Footnotes:

- ¹2.4 GHz 8-core Intel Core i9, SSD
- ²2.3 GHz 8-core Intel Xeon Gold 6140, unknown drive
nutjob4life committed Apr 7, 2020
commit e073a1041ee34071c50f7cae4928c24385d48f4c
3 changes: 2 additions & 1 deletion setup.py
@@ -64,7 +64,8 @@
entry_points={
'console_scripts': [
'sipgen=pds.aipgen.sip:main',
'aipgen=pds.aipgen.aip:main'
'aipgen=pds.aipgen.aip:main',
'aipsip=pds.aipgen.main:main'
]
},
namespace_packages=['pds'],
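
If it's useful to sanity-check that the new console script got registered after an editable install (`pip install -e .`), a hypothetical probe with `pkg_resources` might look like this; it is not part of this change, just a sketch.

```python
import pkg_resources

# Expect sipgen, aipgen, and the new aipsip to appear among the registered console scripts.
for ep in pkg_resources.iter_entry_points('console_scripts'):
    if ep.module_name.startswith('pds.aipgen'):
        print(f'{ep.name} -> {ep.module_name}:{ep.attrs[0]}')
```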
35 changes: 15 additions & 20 deletions src/pds/aipgen/aip.py
@@ -31,7 +31,7 @@


from .constants import PDS_NS_URI, XML_SCHEMA_INSTANCE_NS_URI, PDS_SCHEMA_URL, XML_MODEL_PI, INFORMATION_MODEL_VERSION
from .utils import getPrimariesAndOtherInfo, getMD5
from .utils import getPrimariesAndOtherInfo, getMD5, parseXML, addLoggingArguments
from datetime import datetime
from lxml import etree
import argparse, logging, sys, os, os.path, hashlib
@@ -41,6 +41,7 @@
# ---------

# For ``--help``:
_version = '0.0.0'
_description = '''Generate an Archive Information Package or AIP. An AIP consists of three files:
➀ a "checksum manifest" which contains MD5 hashes of *all* files in a product;
➁ a "transfer manifest" which lists the "lidvids" for files within each XML label mentioned in a product; and
@@ -53,12 +54,8 @@
# Comment to insert near the top of an AIP XML label
_iaComment = 'Parse name from bundle logical_identifier, e.g. urn:nasa:pds:ladee_mission_bundle would be ladee_mission_bundle'


# Logging
# -------

# Logging:
_logger = logging.getLogger(__name__)
logging.basicConfig(format='%(levelname)s %(message)s', level=logging.INFO)


# Functions
@@ -94,7 +91,7 @@ def _getLIDVIDandFileInventory(xmlFile):
identifier, return None and None.
'''
_logger.debug('📜 Analyzing XML in %s', xmlFile)
tree = etree.parse(xmlFile)
tree = parseXML(xmlFile)
root = tree.getroot()
matches = root.findall(f'./{{{PDS_NS_URI}}}Identification_Area/{{{PDS_NS_URI}}}logical_identifier')
if not matches:
@@ -291,10 +288,10 @@ def _writeLabel(
tree.write(labelOutputFile, encoding='utf-8', xml_declaration=True, pretty_print=True)


def _process(bundle):
def process(bundle):
'''Generate a "checksum manifest", a "transfer manifest", and a PDS label from the given
``bundle``, which is an open file stream (with a ``name`` attribute) on the local
filesystem.
filesystem. Return the name of the generated checksum manifest file.
'''
_logger.info('🏃‍♀️ Starting AIP generation for %s', bundle.name)
d = os.path.dirname(os.path.abspath(bundle.name))
@@ -304,7 +301,7 @@ def _process(bundle):
strippedLogicalID = bundleLID.split(':')[-1]

# Easy one: the checksum† manifest
# †It's actually an MD5 hash, not a checksum 😅
# †It's actually an MD5 *hash*, not a checksum 😅
chksumFN = strippedLogicalID + '_checksum_manifest_v' + bundleVID + '.tab'
chksumMD5, chksumSize, chksumNum = _writeChecksumManifest(chksumFN, d)

@@ -328,27 +325,25 @@ def _process(bundle):
xferSize,
xferNum
)
_logger.info('🎉 Success! All done, files generated:')
_logger.info('🎉 Success! AIP done, files generated:')
_logger.info('• Checksum manifest: %s', chksumFN)
_logger.info('• Transfer manifest: %s', xferFN)
_logger.info('• XML label: %s', labelFN)
_logger.info('• XML label for them both: %s', labelFN)
return chksumFN


def main():
'''Check the command-line for options and create a SIP from the given bundle XML'''
'''Check the command-line for options and create an AIP from the given bundle XML'''
parser = argparse.ArgumentParser(description=_description)
parser.add_argument('--version', action='version', version=f'%(prog)s {_version}')
addLoggingArguments(parser)
parser.add_argument(
'bundle', type=argparse.FileType('rb'), metavar='IN-BUNDLE.XML', help='Root bundle XML file to read'
)
parser.add_argument(
'-v', '--verbose', default=False, action='store_true',
help='Verbose logging; defaults %(default)s'
)
args = parser.parse_args()
if args.verbose:
_logger.setLevel(logging.DEBUG)
logging.basicConfig(level=args.loglevel, format='%(levelname)s %(message)s')
_logger.debug('⚙️ command line args = %r', args)
_process(args.bundle)
process(args.bundle)
_logger.info('👋 Thanks for using this program! Bye!')
sys.exit(0)

10 changes: 10 additions & 0 deletions src/pds/aipgen/constants.py
@@ -52,3 +52,13 @@

# Filename extension to use with PDS labels
PDS_LABEL_FILENAME_EXTENSION = '.xml'

# Command-line names for hash algorithms mapped to the Python *implementation*
# names, which are standardized (as lower case, no less) in ``hashlib``.
# There are many more possible message-digest algorithms, but we choose
# to support just these three.
HASH_ALGORITHMS = {
'MD5': 'md5',
'SHA-1': 'sha1',
'SHA-256': 'sha256',
}
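
As a quick sketch of how this mapping is meant to be consumed (``hashlib.new`` accepts the lowercase names; the helper ``hexdigest_of`` below is made up for illustration):

```python
import hashlib

HASH_ALGORITHMS = {'MD5': 'md5', 'SHA-1': 'sha1', 'SHA-256': 'sha256'}

def hexdigest_of(data: bytes, cli_name: str = 'MD5') -> str:
    '''Map a command-line algorithm name onto hashlib's implementation name and hash ``data``.'''
    h = hashlib.new(HASH_ALGORITHMS[cli_name])
    h.update(data)
    return h.hexdigest()

print(hexdigest_of(b'hello, PDS', 'SHA-256'))
```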
97 changes: 97 additions & 0 deletions src/pds/aipgen/main.py
@@ -0,0 +1,97 @@
# encoding: utf-8
#
# Copyright © 2020 California Institute of Technology ("Caltech").
# ALL RIGHTS RESERVED. U.S. Government sponsorship acknowledged.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# • Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# • Redistributions must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other
# materials provided with the distribution.
# • Neither the name of Caltech nor its operating division, the Jet Propulsion
# Laboratory, nor the names of its contributors may be used to endorse or
# promote products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.


u'''AIP and SIP generation'''

from .aip import process as aipProcess
from .constants import HASH_ALGORITHMS
from .sip import addSIParguments
from .sip import produce as sipProcess
from .utils import addLoggingArguments
import argparse, sys, logging


# Constants
# ---------

# For ``--help``:
_version = '0.0.0'
_description = '''
Generate an Archive Information Package (AIP) and a Submission Information
Package (SIP). This creates three files for the AIP in the current directory
(overwriting them if they already exist): ➀ a "checksum manifest" which
contains MD5 hashes of *all* files in a product; ➁ a "transfer manifest" which
lists the "lidvids" for files within each XML label mentioned in a product;
and ➂ an XML label for these two files. It also creates two files for the SIP
(also overwriting them if they exist): ➀ a "SIP manifest" file; and ➁ an XML
label of that file. The names of the generated files are based on the logical
identifier found in the bundle file, and they are printed upon successful
completion.
'''

# Logging:
_logger = logging.getLogger(__name__)


# Functions
# ---------

def main():
'''Make an AIP and a SIP'''
parser = argparse.ArgumentParser(description=_description)
parser.add_argument('--version', action='version', version=f'%(prog)s {_version}')
addSIParguments(parser)
addLoggingArguments(parser)
parser.add_argument(
'bundle', type=argparse.FileType('rb'), metavar='IN-BUNDLE.XML', help='Bundle XML file to read'
)
args = parser.parse_args()
logging.basicConfig(level=args.loglevel, format='%(levelname)s %(message)s')
_logger.debug('⚙️ command line args = %r', args)
chksumFN = aipProcess(args.bundle)
with open(chksumFN, 'rb') as chksumStream:
sipProcess(
args.bundle,
HASH_ALGORITHMS[args.algorithm],
args.url,
args.insecure,
args.site,
args.offline,
args.bundle_base_url,
chksumStream
)
_logger.info("👋 That's it! Thanks for making an AIP and SIP with us today. Bye!")
sys.exit(0)


if __name__ == '__main__':
main()
226 changes: 112 additions & 114 deletions src/pds/aipgen/sip.py
@@ -33,13 +33,15 @@

from .constants import (
INFORMATION_MODEL_VERSION, PDS_NS_URI, XML_SCHEMA_INSTANCE_NS_URI, XML_MODEL_PI,
PDS_SCHEMA_URL, AIP_PRODUCT_URI_PREFIX, PDS_LABEL_FILENAME_EXTENSION
PDS_SCHEMA_URL, AIP_PRODUCT_URI_PREFIX, PDS_LABEL_FILENAME_EXTENSION, HASH_ALGORITHMS
)
from .utils import (
getPrimariesAndOtherInfo, getMD5, getLogicalIdentifierAndFileInventory, parseXML, getDigest, addLoggingArguments
)
from .utils import getPrimariesAndOtherInfo, getMD5, getLogicalIdentifierAndFileInventory
from datetime import datetime
from lxml import etree
from urllib.parse import urlparse
import argparse, logging, hashlib, pysolr, urllib.request, os.path, re, sys
import argparse, logging, hashlib, pysolr, urllib.request, os.path, re, sys, sqlite3, tempfile


# Defaults & Constants
@@ -84,20 +86,10 @@
# Internal reference boilerplate
_intRefBoilerplate = 'Links this SIP to the specific version of the bundle product in the PDS registry system'

# Command-line names for various hash algorithms, mapped to Python implementation name—OK,
# not really! The Python implementation names are lowercase, but thankfully ``hashlib`` doesn't
# care. (Still not sure why these were up-cased.)
_algorithms = {
'MD5': 'MD5',
'SHA-1': 'SHA1',
'SHA-256': 'SHA256',
}


# Logging
# -------
_logger = logging.getLogger(__name__)
logging.basicConfig(format='%(levelname)s %(message)s', level=logging.WARNING)


# Functions
@@ -142,42 +134,31 @@ def _getPLines(tabFilePath):
# All values in collection manifest table must be a lidvid, if not throw an error
if '::' not in match.group(1):
msg = ('Invalid collection manifest. All records must contain '
'lidvids but found "' + match.group(1))
'lidvids but found "' + match.group(1) + '"')
raise Exception(msg)

lidvids.add(match.group(1))

return lidvids


def _findLidVidsInXMLFiles(lidvid, xmlFiles):
'''Look in each of the ``xmlFiles`` for an XPath from root to ``Identification_Area`` to
``logical_identifier`` and ``version_id`` and see if they form a matching ``lidvid``.
If it does, add that XML file to a set as a ``file:`` style URL and return the set.
def _findLidVidsInXMLFiles(lidvid, con):
'''Query the database ``con``nection for the ``lidvid`` to see if there's a match
based on an earlier population in the DB's ``lidvids`` table. If so, add those
XML files as a ``file:`` style URL to a set of matching files.
'''
matchingFiles = set()
for xmlFile in xmlFiles:
_logger.debug('Parsing XML in %s', xmlFile)
tree = etree.parse(xmlFile)
root = tree.getroot()
matches = root.findall(f'./{{{PDS_NS_URI}}}Identification_Area/{{{PDS_NS_URI}}}logical_identifier')
if not matches: continue
lid = matches[0].text.strip()

matches = root.findall(f'./{{{PDS_NS_URI}}}Identification_Area/{{{PDS_NS_URI}}}version_id')
if not matches: continue
vid = matches[0].text.strip()

if lidvid.strip() == lid + '::' + vid:
matchingFiles.add('file:' + xmlFile)
matchingFiles |= _getAssociatedProducts(tree, os.path.dirname(xmlFile))
break

cursor = con.cursor()
cursor.execute('''SELECT xmlFile FROM lidvids WHERE lidvid = ?''', (lidvid,))
for xmlFile in cursor.fetchall():
xmlFile = xmlFile[0]
matchingFiles.add('file:' + xmlFile)
tree = parseXML(xmlFile)
matchingFiles |= _getAssociatedProducts(tree, os.path.dirname(xmlFile))
return matchingFiles


def _getAssociatedProducts(root, filepath):
'''Parse the XML for all the files associated with it that make up the product'''
'''Parse the XML at ``root`` for all the files associated with it that make up the product,
prepending ``filepath`` to each match.'''
products = set()
matches = root.findall(f'//{{{PDS_NS_URI}}}File/{{{PDS_NS_URI}}}file_name')
matches.extend(root.findall(f'//{{{PDS_NS_URI}}}Document_File/{{{PDS_NS_URI}}}file_name'))
@@ -188,11 +169,12 @@ def _getAssociatedProducts(root, filepath):
return products


def _getLocalFileInfo(bundle, primaries, bundleLidvid):
def _getLocalFileInfo(bundle, primaries, bundleLidvid, con):
'''Search all XML files (except for the ``bundle`` file) in the same directory as ``bundle``
and look for all XPath ``Product_Collection/Identification_Area/logical_identifier`` values
that match any of the primary names in ``primaries``. If we get a match, note all XPath
``File_Area_Inventory/File/file_name`` entries for later inclusion.
``File_Area_Inventory/File/file_name`` entries for later inclusion. We use the database
at ``con`` to make a working table for noting this info as we go along.
What does later inclusion mean? We open each of those files (usually ending in ``.tab`` or
``.TAB``) and look for any lines with ``P,\\s*`` at the front: the next token is a magical
@@ -208,33 +190,49 @@ def _getLocalFileInfo(bundle, primaries, bundleLidvid):
# Match up lidsvids with xmlFiles
lidvidsToFiles = {}

# Set up a database table to map many-to-many lidvids to xml files
with con:
cursor = con.cursor()
cursor.execute('''CREATE TABLE IF NOT EXISTS lidvids (
lidvid text NOT NULL,
xmlFile text NOT NULL
)''')
cursor.execute('''CREATE INDEX IF NOT EXISTS lidvidIndex ON lidvids (lidvid)''')

# Add bundle to manifest
lidvidsToFiles[bundleLidvid] = {'file:' + bundle}

# OK, here we go
root = os.path.dirname(bundle)
xmlFiles = set()

# Locate all the XML files
for dirpath, dirnames, filenames in os.walk(root):
xmlFiles |= set([os.path.join(dirpath, i) for i in filenames if i.endswith(PDS_LABEL_FILENAME_EXTENSION) or i.endswith(PDS_LABEL_FILENAME_EXTENSION.upper())])

for xmlFile in xmlFiles:
# Need to check for lid or lidvid depending on what is specified in the bundle
lid, lidvid, tabs = getLogicalIdentifierAndFileInventory(xmlFile)
if not lid or not tabs: continue
if lid in primaries or lidvid in primaries:
# This will probably always be the case working with an offline directory tree
lidvidsToFiles[lidvid] = {'file:' + xmlFile}
for tab in tabs:
lidvids |= _getPLines(tab)
lidvidsToFiles[lidvid].add('file:' + tab)

xmlFiles |= set([os.path.join(dirpath, i) for i in filenames if i.lower().endswith(PDS_LABEL_FILENAME_EXTENSION.lower())])

# Get the lidvids and inventory of files mentioned in each xml file
with con:
for xmlFile in xmlFiles:
# Need to check for lid or lidvid depending on what is specified in the bundle
lid, lidvid, tabs = getLogicalIdentifierAndFileInventory(xmlFile)
if not lid or not tabs: continue
if lid in primaries or lidvid in primaries:
# This will probably always be the case working with an offline directory tree
lidvidsToFiles[lidvid] = {'file:' + xmlFile}
for tab in tabs:
lidvids |= _getPLines(tab)
lidvidsToFiles[lidvid].add('file:' + tab)
for lidvid in lidvids:
con.execute('INSERT INTO lidvids (lidvid, xmlFile) VALUES (?,?)', (lidvid, xmlFile))

# Now go through each lidvid mentioned by the PLines in each inventory tab and find their xml files
for lidvid in lidvids:
# Look in all XML files for the lidvids
products = _findLidVidsInXMLFiles(lidvid, xmlFiles)
products = _findLidVidsInXMLFiles(lidvid, con)
matching = lidvidsToFiles.get(lidvid, set())
matching |= products
lidvidsToFiles[lidvid] = matching

# That should do it
return lidvidsToFiles


@@ -245,19 +243,13 @@ def _getDigests(lidvidsToFiles, hashName):
'''
withDigests = []
# TODO: potential optimization is that if the same file appears for multiple lidvids we retrieve it
# multiple times; we could cache previous hash computations
# multiple times; we could cache previous hash computations.
# BANDAID: ``getDigest`` has an LRU cache.
for lidvid, files in lidvidsToFiles.items():
for url in files:
try:
hashish = hashlib.new(hashName)
_logger.debug('Getting «%s» for hashing with %s', url, hashName)
with urllib.request.urlopen(url) as i:
while True:
buf = i.read(_bufsiz)
if len(buf) == 0: break
hashish.update(buf)
# FIXME: some hash algorithms take variable length digests; we should filter those out
withDigests.append((url, hashish.hexdigest(), lidvid))
d = getDigest(url, hashName)
withDigests.append((url, d, lidvid))
except urllib.error.URLError as error:
_logger.info('Problem retrieving «%s» for digest: %r; ignoring', url, error)
return withDigests
@@ -278,7 +270,6 @@ def _writeTable(hashedFiles, hashName, manifest, offline, baseURL, basePathToRep
if offline:
if baseURL.endswith('/'):
baseURL = baseURL[:-1]

url = baseURL + urlparse(url).path.replace(basePathToReplace, '')

entry = f'{digest}\t{hashName}\t{url}\t{lidvid.strip()}\r\n'.encode('utf-8')
@@ -401,39 +392,43 @@ def _writeLabel(logicalID, versionID, title, digest, size, numEntries, hashName,
tree.write(labelOutputFile, encoding='utf-8', xml_declaration=True, pretty_print=True)


def _produce(bundle, hashName, registryServiceURL, insecureConnectionFlag, site, offline, baseURL, aipFile):
def produce(bundle, hashName, registryServiceURL, insecureConnectionFlag, site, offline, baseURL, aipFile):
'''Produce a SIP from the given bundle'''
# Get the bundle path
bundle = os.path.abspath(bundle.name)

# Get the bundle's primary collections and other useful info
primaries, bundleLID, title, bundleVID = getPrimariesAndOtherInfo(bundle)
strippedLogicalID = bundleLID.split(':')[-1]

filename = strippedLogicalID + '_sip_v' + bundleVID
manifestFileName, labelFileName = filename + '.tab', filename + PDS_LABEL_FILENAME_EXTENSION
if offline:
lidvidsToFiles = _getLocalFileInfo(bundle, primaries, bundleLID + '::' + bundleVID)
else:
_logger.warning('The remote functionality with registry in the loop is still in development.')
lidvidsToFiles = _getFileInfo(primaries, registryServiceURL, insecureConnectionFlag)

hashedFiles = _getDigests(lidvidsToFiles, hashName)
with open(manifestFileName, 'wb') as manifest:
md5, size = _writeTable(hashedFiles, hashName, manifest, offline, baseURL, os.path.dirname(os.path.dirname(bundle)))
with open(labelFileName, 'wb') as label:
_writeLabel(bundleLID, bundleVID, title, md5, size, len(hashedFiles), hashName, manifestFileName, site, label, aipFile)
return manifestFileName, labelFileName


def main():
'''Check the command-line for options and create a SIP from the given bundle XML'''
parser = argparse.ArgumentParser(description=_description)
parser.add_argument(
'bundle', type=argparse.FileType('rb'), metavar='IN-BUNDLE.XML', help='Bundle XML file to read'
)
# Make a temp file to use as a database; TODO: could pass ``delete=False`` in
# the future for sharing this DB amongst many processes for some fancy multiprocessing
_logger.info('👟 Submission Information Package (SIP) Generator, version %s', _version)
with tempfile.NamedTemporaryFile() as dbfile:
con = sqlite3.connect(dbfile.name)
_logger.debug('→ Database file (deleted) is %s', dbfile.name)

# Get the bundle path
bundle = os.path.abspath(bundle.name)

# Get the bundle's primary collections and other useful info
primaries, bundleLID, title, bundleVID = getPrimariesAndOtherInfo(bundle)
strippedLogicalID = bundleLID.split(':')[-1]
filename = strippedLogicalID + '_sip_v' + bundleVID
manifestFileName, labelFileName = filename + '.tab', filename + PDS_LABEL_FILENAME_EXTENSION
if offline:
lidvidsToFiles = _getLocalFileInfo(bundle, primaries, bundleLID + '::' + bundleVID, con)
else:
_logger.warning('⚠️ The remote functionality with registry in the loop is still in development.')
lidvidsToFiles = _getFileInfo(primaries, registryServiceURL, insecureConnectionFlag)

hashedFiles = _getDigests(lidvidsToFiles, hashName)
with open(manifestFileName, 'wb') as manifest:
md5, size = _writeTable(hashedFiles, hashName, manifest, offline, baseURL, os.path.dirname(os.path.dirname(bundle)))
with open(labelFileName, 'wb') as label:
_writeLabel(bundleLID, bundleVID, title, md5, size, len(hashedFiles), hashName, manifestFileName, site, label, aipFile)
_logger.info('🎉 Success! From %s, generated these output files:', bundle)
_logger.info('• SIP Manifest: %s', manifestFileName)
_logger.info('• XML label for the SIP: %s', labelFileName)
return manifestFileName, labelFileName


def addSIParguments(parser):
parser.add_argument(
'-a', '--algorithm', default='MD5', choices=sorted(_algorithms.keys()),
'-a', '--algorithm', default='MD5', choices=sorted(HASH_ALGORITHMS.keys()),
help='File hash (checksum) algorithm; default %(default)s'
)
parser.add_argument(
@@ -453,41 +448,44 @@ def main():
'-k', '--insecure', default=False, action='store_true',
help='Ignore SSL/TLS security issues; default %(default)s'
)
parser.add_argument(
'-c', '--aip', type=argparse.FileType('rb'), metavar='AIP-CHECKSUM-MANIFEST.TAB',
help='Archive Information Product checksum manifest file'
)
parser.add_argument(
'-b', '--bundle-base-url', required=False, default='file:/',
help='Base URL prepended to URLs in the generated manifest for local files in "offline" mode'
)
parser.add_argument(
'-v', '--verbose', default=False, action='store_true',
help='Verbose logging; defaults %(default)s'
)
# TODO: ``pds4_information_model_version`` is parsed into the arg namespace but is otherwise ignored
parser.add_argument(
'-i', '--pds4-information-model-version', default=INFORMATION_MODEL_VERSION,
help='Specify PDS4 Information Model version to generate SIP. Must be 1.13.0.0+; default %(default)s'
)


def main():
'''Check the command-line for options and create a SIP from the given bundle XML'''
parser = argparse.ArgumentParser(description=_description)
parser.add_argument('--version', action='version', version=f'%(prog)s {_version}')
addSIParguments(parser)
addLoggingArguments(parser)
parser.add_argument(
'bundle', type=argparse.FileType('rb'), metavar='IN-BUNDLE.XML', help='Bundle XML file to read'
)
parser.add_argument(
'-c', '--aip', type=argparse.FileType('rb'), metavar='AIP-CHECKSUM-MANIFEST.TAB',
help='Archive Information Product checksum manifest file'
)
args = parser.parse_args()
if args.verbose:
_logger.setLevel(logging.DEBUG)
_logger.debug('command line args = %r', args)
manifest, label = _produce(
logging.basicConfig(level=args.loglevel, format='%(levelname)s %(message)s')
_logger.debug('⚙️ command line args = %r', args)
manifest, label = produce(
args.bundle,
_algorithms[args.algorithm],
HASH_ALGORITHMS[args.algorithm],
args.url,
args.insecure,
args.site,
args.offline,
args.bundle_base_url,
args.aip
)
print(f'⚙︎ ``sipgen`` — Submission Information Package (SIP) Generator, version {_version}', file=sys.stderr)
print(f'🎉 Success! From {args.bundle.name}, generated these output files:', file=sys.stderr)
print(f'• Manifest: {manifest}', file=sys.stderr)
print(f'• Label: {label}', file=sys.stderr)
_logger.info('👋 All done. Thanks for making a SIP. Bye!')
sys.exit(0)


44 changes: 39 additions & 5 deletions src/pds/aipgen/utils.py
@@ -32,7 +32,7 @@

from .constants import PDS_NS_URI
from lxml import etree
import logging, hashlib, os.path
import logging, hashlib, os.path, functools, urllib


# Logging
@@ -44,17 +44,39 @@
# -----------------

_bufsiz = 512 # Is there a better place to set this—or a better place to find it?
_xmlCacheSize = 2**16
_digestCacheSize = 2**16


# Functions
# ---------


@functools.lru_cache(maxsize=_xmlCacheSize)
def parseXML(f):
'''Parse the XML in object ``f``'''
return etree.parse(f)


@functools.lru_cache(maxsize=_digestCacheSize)
def getDigest(url, hashName):
'''Compute a digest of the object at url and return it as a hex string'''
hashish = hashlib.new(hashName)
_logger.debug('Getting «%s» for hashing with %s', url, hashName)
with urllib.request.urlopen(url) as i:
while True:
buf = i.read(_bufsiz)
if len(buf) == 0: break
hashish.update(buf)
return hashish.hexdigest() # XXX We do not support hashes with variable-length digests


def getPrimariesAndOtherInfo(bundle):
'''Get the "primaries" from the given bundle XML plus the logical identifier,
plus the title plus the version ID (this function does too much)'''
_logger.debug('Parsing XML in %r', bundle)
_logger.debug('Fetching primaries and other info by parsing XML in %r', bundle)
primaries = set()
tree = etree.parse(bundle)
tree = parseXML(bundle)
root = tree.getroot()
members = root.findall(f'.//{{{PDS_NS_URI}}}Bundle_Member_Entry')
for member in members:
@@ -93,8 +115,8 @@ def getLogicalIdentifierAndFileInventory(xmlFile):
in ``File`` in ``File_Area_Inventory`` entries. If there's no logical identifier, just return
None, None, None
'''
_logger.debug('Parsing XML in %s', xmlFile)
tree = etree.parse(xmlFile)
_logger.debug('Getting logical IDs and file inventories by parsing XML in %s', xmlFile)
tree = parseXML(xmlFile)
root = tree.getroot()
matches = root.findall(f'./{{{PDS_NS_URI}}}Identification_Area/{{{PDS_NS_URI}}}logical_identifier')
if not matches: return None, None, None
@@ -109,3 +131,15 @@ def getLogicalIdentifierAndFileInventory(xmlFile):
matches = root.findall(f'./{{{PDS_NS_URI}}}File_Area_Inventory/{{{PDS_NS_URI}}}File/{{{PDS_NS_URI}}}file_name')

return lid, lidvid, [os.path.join(dirname, i.text.strip()) for i in matches]


def addLoggingArguments(parser):
'''Add command-line arguments to the given argument ``parser`` to support logging.'''
parser.add_argument(
'-d', '--debug', action='store_const', dest='loglevel', const=logging.DEBUG, default=logging.INFO,
help='Log debugging messages for developers'
)
parser.add_argument(
'-q', '--quiet', action='store_const', dest='loglevel', const=logging.WARNING,
help="Don't log informational messages"
)
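
For reference, a minimal sketch of a caller wiring these options up the way `aip.py`, `sip.py`, and `main.py` do above (it assumes the package is installed so the import resolves; the `demo` name is made up):

```python
import argparse, logging
from pds.aipgen.utils import addLoggingArguments

def demo():
    parser = argparse.ArgumentParser(description='Demo of the shared logging options')
    addLoggingArguments(parser)  # adds -d/--debug and -q/--quiet; loglevel defaults to INFO
    args = parser.parse_args()
    logging.basicConfig(level=args.loglevel, format='%(levelname)s %(message)s')
    logging.getLogger(__name__).debug('⚙️ only visible with --debug')

if __name__ == '__main__':
    demo()
```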