From e521da3ed585f393bc7baaa994f206a6497f990e Mon Sep 17 00:00:00 2001 From: Tom Morrell Date: Tue, 30 Oct 2018 16:14:34 -0700 Subject: [PATCH] Tech report and download automation --- README.md | 8 +++++ caltech_authors_tech_report.py | 57 +++++++++++++++++++++++++++------- caltech_thesis.py | 20 ++++++++++-- setup.py | 4 +-- 4 files changed, 73 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index bb74898..dcb6f48 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,11 @@ infile = open('10271.xml',encoding="utf8") eprint = xmltodict.parse(infile.read())['eprints']['eprint'] ``` +## Downloading Eprints XML files + +You can use Eprints IDs (e.g. 82938) to download Eprints XML files by adding a +`-ids` option to any command. This requires eputil to be installed on your local system. + ## Using caltech_thesis.py Download .xml files from thesis.library.caltech.edu/rest/eprint/1234.xml and put @@ -41,3 +46,6 @@ DataCite XML files will appear. If you want to mint DOIs add the `-mint` option and if you want to make test DOIs add the `-test` option to the command line. +## Using caltech_authors_tech_report.py + +This script only works with items that have the `monograph` item type (Report or Paper). 
diff --git a/caltech_authors_tech_report.py b/caltech_authors_tech_report.py index 32134e4..b799f65 100644 --- a/caltech_authors_tech_report.py +++ b/caltech_authors_tech_report.py @@ -1,6 +1,6 @@ import xmltodict from datacite import DataCiteMDSClient,schema40 -import glob,json,datetime,re +import glob,json,datetime,re,getpass import os,argparse,subprocess def cleanhtml(raw_html): @@ -9,9 +9,24 @@ def cleanhtml(raw_html): return cleantext def epxml_to_datacite(eprint): - + + print(eprint['type']) + if eprint['type'] != 'monograph': + raise Exception("This code has only been tested on tech reports") + metadata = {} + item_types = { + "discussion_paper":"Discussion Paper", + "documentation":"Documentation", + "manual":"Manual", + "other":"Other", + "project_report":"Project Report", + "report":"Report", + "technical_report":"Technical Report", + "white_paper":"White Paper", + "working_paper":"Working Paper"} + #Transforming Metadata #Creators newa = [] @@ -65,7 +80,7 @@ def epxml_to_datacite(eprint): metadata['publicationYear'] = eprint['date'].split('-')[0] else: metadata['publicationYear'] = eprint['date'] - metadata['resourceType']={'resourceTypeGeneral':"Text",'resourceType':"Technical Report"} + metadata['resourceType']={'resourceTypeGeneral':"Text",'resourceType':item_types[eprint['monograph_type']]} if 'doi' in eprint: metadata['identifier'] = {'identifier':eprint['doi'],'identifierType':"DOI"} @@ -86,28 +101,28 @@ def epxml_to_datacite(eprint): for item in eprint['other_numbering_system']['item']: ids.append({'alternateIdentifier':item['id'],'alternateIdentifierType':item['name']}) - if 'series_name' in eprint and if 'number' in eprint: + if 'series_name' in eprint and 'number' in eprint: name_and_series = [eprint['series_name'],eprint['number']] elif 'other_numbering_system' in eprint: ids = [] #Assume first is correct item = eprint['other_numbering_system']['item'][0] - name_and_series = [item['name'],item['id']] + name_and_series = 
[item['name']['#text'],item['id']] elif 'local_group' in eprint: resolver = eprint['official_url'].split(':') - number = resolver[-1].split('.')[1] + number = resolver[-1] name_and_series = [eprint['local_group']['item'],number] else: resolver = eprint['official_url'].split(':') name = resolver[1].split('/')[-1] - number = resolver[-1].split('.')[1] + number = resolver[-1] name_and_series = [name,number] #Save Series Info description = [{'descriptionType':"Abstract",\ 'description':cleanhtml(eprint['abstract'])}] description +=\ - [{'descriptionType':'SeriesInfo','description',name_and_series[0]+' '+name_and_series[1]}] + [{'descriptionType':'SeriesInformation','description':name_and_series[0]+' '+name_and_series[1]}] metadata['descriptions'] = description ids.append({'alternateIdentifier':name_and_series[1],'alternateIdentifierType':name_and_series[0]}) @@ -117,14 +132,19 @@ def epxml_to_datacite(eprint): metadata['language'] = 'English' #Subjects + sub_arr = [] if "keywords" in eprint: subjects = eprint['keywords'].split(';') if len(subjects) == 1: subjects = eprint['keywords'].split(',') - array = [] for s in subjects: - array.append({'subject':s.strip()}) - metadata['subjects']=array + sub_arr.append({'subject':s.strip()}) + + if 'classification_code' in eprint: + sub_arr.append({'subject':eprint['classification_code']}) + + if len(sub_arr) != 0: + metadata['subjects']=sub_arr if 'funders' in eprint: array = [] @@ -177,14 +197,29 @@ def epxml_to_datacite(eprint): return metadata +def download_records(ids): + username = input('Enter your CaltechAUTHORS username: ') + password = getpass.getpass() + + for idv in ids: + url = 'https://'+username+':'+password+'@authors.library.caltech.edu/rest/eprint/' + record_url = url + str(idv) +'.xml' + record = subprocess.check_output(["eputil",record_url],universal_newlines=True) + outfile = open(idv+'.xml','w') + outfile.write(record) + if __name__ == '__main__': parser = argparse.ArgumentParser(description=\ "Make DataCite 
standard metadata for records from CaltechAUTHORS and register DOIs") parser.add_argument('-mint', action='store_true', help='Mint DOIs') parser.add_argument('-test', action='store_true', help='Only register test DOI') + parser.add_argument('-ids',nargs='*',help="CaltechAUTHORS IDs to download XML files") args = parser.parse_args() + if len(args.ids) > 0: + download_records(args.ids) + files = glob.glob('*.xml') for f in files: if 'datacite' not in f: diff --git a/caltech_thesis.py b/caltech_thesis.py index e1fc9b2..9bc5e87 100644 --- a/caltech_thesis.py +++ b/caltech_thesis.py @@ -1,8 +1,19 @@ import xmltodict from datacite import DataCiteMDSClient,schema40 -import glob,json,datetime,re +import glob,json,datetime,re,getpass import os,argparse,subprocess +def download_records(ids): + username = input('Enter your CaltechTHESIS username: ') + password = getpass.getpass() + + for idv in ids: + url = 'https://'+username+':'+password+'@thesis.library.caltech.edu/rest/eprint/' + record_url = url + str(idv) +'.xml' + record = subprocess.check_output(["eputil",record_url],universal_newlines=True) + outfile = open(idv+'.xml','w') + outfile.write(record) + def cleanhtml(raw_html): cleanr = re.compile('<.*?>') cleantext = re.sub(cleanr, '', raw_html) @@ -96,8 +107,7 @@ def epxml_to_datacite(eprint): #Deal with single item listings eprint['other_numbering_system']['item'] = [eprint['other_numbering_system']['item']] for item in eprint['other_numbering_system']['item']: - print - ids.append({'alternateIdentifier':item['id'],'alternateIdentifierType':item['name']}) + ids.append({'alternateIdentifier':item['id'],'alternateIdentifierType':item['name']['#text']}) metadata['alternateIdentifiers'] = ids metadata['descriptions'] =[{'descriptionType':"Abstract",\ @@ -195,8 +205,12 @@ def epxml_to_datacite(eprint): "Make DataCite standard metadata for records from CaltechTHESIS and register DOIs") parser.add_argument('-mint', action='store_true', help='Mint DOIs') 
parser.add_argument('-test', action='store_true', help='Only register test DOI') + parser.add_argument('-ids',nargs='*',help="CaltechTHESIS IDs to download XML files") args = parser.parse_args() + if len(args.ids) > 0: + download_records(args.ids) + files = glob.glob('*.xml') for f in files: if 'datacite' not in f: diff --git a/setup.py b/setup.py index 328e594..55a5bf3 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,8 @@ from setuptools import setup setup( name = 'epxml_to_datacite', - version ='0.9', - py_modules = ["caltech_thesis"], + version ='0.10', + py_modules = ["caltech_thesis","caltech_authors_tech_report"], data_files=[('.',['thesis-subjects.txt'])], install_requires=[ 'xmltodict',