Tech report and download automation

caltechlibrary · Oct 30, 2018 · e521da3 · e521da3
1 parent 7085c68
commit e521da3
Show file tree

Hide file tree

Showing 4 changed files with 73 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -33,6 +33,11 @@ infile = open('10271.xml',encoding="utf8")
 eprint = xmltodict.parse(infile.read())['eprints']['eprint']
 ```
 
+## Downloading Eprints XML files
+
+You can use Eprints ids (e.g. 82938) to download Eprints xml files by adding a
+`-ids` option to any command.  This requires eputil to be installed on your local system.
+
 ## Using caltech_thesis.py
 
 Download .xml files from thesis.library.caltech.edu/rest/eprint/1234.xml and put 
@@ -41,3 +46,6 @@ DataCite XML files will appear.  If you want to mint DOIs add the `-mint`
 option and if you want to make test DOIs add the `-test` option to the command
 line.  
 
+## Using caltech_authors_tech_report.py
+
+Will only work with items with the `monograph` item type (Report or Paper).
diff --git a/caltech_authors_tech_report.py b/caltech_authors_tech_report.py
@@ -1,6 +1,6 @@
 import xmltodict
 from datacite import DataCiteMDSClient,schema40
-import glob,json,datetime,re
+import glob,json,datetime,re,getpass
 import os,argparse,subprocess
 
 def cleanhtml(raw_html):
@@ -9,9 +9,24 @@ def cleanhtml(raw_html):
     return cleantext
 
 def epxml_to_datacite(eprint):
-
+
+    print(eprint['type'])
+    if eprint['type'] != 'monograph':
+        raise Exception("This code has only been tested on tech reports")
+
     metadata = {}
 
+    item_types = {
+            "discussion_paper":"Discussion Paper",
+            "documentation":"Documentation",
+            "manual":"Manual",
+            "other":"Other",
+            "project_report":"Project Report",
+            "report":"Report",
+            "technical_report":"Technical Report",
+            "white_paper":"White Paper",
+            "working_paper":"Working Paper"}
+
     #Transforming Metadata
     #Creators
     newa = []
@@ -65,7 +80,7 @@ def epxml_to_datacite(eprint):
         metadata['publicationYear'] = eprint['date'].split('-')[0]
     else:
         metadata['publicationYear'] = eprint['date']
-    metadata['resourceType']={'resourceTypeGeneral':"Text",'resourceType':"Technical Report"}
+    metadata['resourceType']={'resourceTypeGeneral':"Text",'resourceType':item_types[eprint['monograph_type']]}
 
     if 'doi' in eprint:
             metadata['identifier'] = {'identifier':eprint['doi'],'identifierType':"DOI"}
@@ -86,28 +101,28 @@ def epxml_to_datacite(eprint):
         for item in eprint['other_numbering_system']['item']:
             ids.append({'alternateIdentifier':item['id'],'alternateIdentifierType':item['name']})
 
-    if 'series_name' in eprint and if 'number' in eprint:
+    if 'series_name' in eprint and 'number' in eprint:
         name_and_series = [eprint['series_name'],eprint['number']]
     elif 'other_numbering_system' in eprint:
         ids = []
         #Assume first is correct
         item = eprint['other_numbering_system']['item'][0]
-        name_and_series = [item['name'],item['id']]
+        name_and_series = [item['name']['#text'],item['id']]
     elif 'local_group' in eprint:
         resolver = eprint['official_url'].split(':')
-        number = resolver[-1].split('.')[1]
+        number = resolver[-1]
         name_and_series = [eprint['local_group']['item'],number]
     else:
         resolver = eprint['official_url'].split(':')
         name = resolver[1].split('/')[-1]
-        number = resolver[-1].split('.')[1]
+        number = resolver[-1]
         name_and_series = [name,number]
 
     #Save Series Info
     description = [{'descriptionType':"Abstract",\
             'description':cleanhtml(eprint['abstract'])}]
     description +=\
-    [{'descriptionType':'SeriesInfo','description',name_and_series[0]+' '+name_and_series[1]}] 
+            [{'descriptionType':'SeriesInformation','description':name_and_series[0]+' '+name_and_series[1]}] 
     metadata['descriptions'] = description
 
     ids.append({'alternateIdentifier':name_and_series[1],'alternateIdentifierType':name_and_series[0]})
@@ -117,14 +132,19 @@ def epxml_to_datacite(eprint):
     metadata['language'] = 'English'
 
     #Subjects
+    sub_arr = []
     if "keywords" in eprint:
         subjects = eprint['keywords'].split(';')
         if len(subjects) == 1:
             subjects = eprint['keywords'].split(',')
-        array = []
         for s in subjects:
-            array.append({'subject':s.strip()})
-        metadata['subjects']=array
+            sub_arr.append({'subject':s.strip()})
+
+    if 'classification_code' in eprint:
+        sub_arr.append({'subject':eprint['classification_code']})
+
+    if len(sub_arr) != 0:
+        metadata['subjects']=sub_arr
 
     if 'funders' in eprint:
         array = []
@@ -177,14 +197,29 @@ def epxml_to_datacite(eprint):
 
     return metadata
 
+def download_records(ids):
+    """Prompt for CaltechAUTHORS credentials, fetch each id's Eprints XML with eputil and save it as <id>.xml."""
+    username = input('Enter your CaltechAUTHORS username: ')
+    password = getpass.getpass()
+    url = 'https://'+username+':'+password+'@authors.library.caltech.edu/rest/eprint/'
+    for idv in ids:
+        record_url = url + str(idv) +'.xml'
+        record = subprocess.check_output(["eputil",record_url],universal_newlines=True)
+        with open(str(idv)+'.xml','w') as outfile:  # with-block closes (and flushes) the file
+            outfile.write(record)
+
 if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(description=\
         "Make DataCite standard metadata for records from CaltechAUTHORS and register DOIs")
     parser.add_argument('-mint', action='store_true', help='Mint DOIs')
     parser.add_argument('-test', action='store_true', help='Only register test DOI')
+    parser.add_argument('-ids',nargs='*',help="CaltechAUTHORS IDs to download XML files")
     args = parser.parse_args()
 
+    if len(args.ids) > 0:
+        download_records(args.ids)
+
     files = glob.glob('*.xml')
     for f in files:
         if 'datacite' not in f:

diff --git a/caltech_thesis.py b/caltech_thesis.py
@@ -1,8 +1,19 @@
 import xmltodict
 from datacite import DataCiteMDSClient,schema40
-import glob,json,datetime,re
+import glob,json,datetime,re,getpass
 import os,argparse,subprocess
 
+def download_records(ids):
+    """Prompt for CaltechTHESIS credentials, fetch each id's Eprints XML with eputil and save it as <id>.xml."""
+    username = input('Enter your CaltechTHESIS username: ')
+    password = getpass.getpass()
+    url = 'https://'+username+':'+password+'@thesis.library.caltech.edu/rest/eprint/'
+    for idv in ids:
+        record_url = url + str(idv) +'.xml'
+        record = subprocess.check_output(["eputil",record_url],universal_newlines=True)
+        with open(str(idv)+'.xml','w') as outfile:  # with-block closes (and flushes) the file
+            outfile.write(record)
+
 def cleanhtml(raw_html):
     cleanr = re.compile('<.*?>')
     cleantext = re.sub(cleanr, '', raw_html)
@@ -96,8 +107,7 @@ def epxml_to_datacite(eprint):
             #Deal with single item listings
             eprint['other_numbering_system']['item'] = [eprint['other_numbering_system']['item']]
         for item in eprint['other_numbering_system']['item']:
-            print
-            ids.append({'alternateIdentifier':item['id'],'alternateIdentifierType':item['name']})
+            ids.append({'alternateIdentifier':item['id'],'alternateIdentifierType':item['name']['#text']})
         metadata['alternateIdentifiers'] = ids
 
     metadata['descriptions'] =[{'descriptionType':"Abstract",\
@@ -195,8 +205,12 @@ def epxml_to_datacite(eprint):
         "Make DataCite standard metadata for records from CaltechTHESIS and register DOIs")
     parser.add_argument('-mint', action='store_true', help='Mint DOIs')
     parser.add_argument('-test', action='store_true', help='Only register test DOI')
+    parser.add_argument('-ids',nargs='*',help="CaltechTHESIS IDs to download XML files")
     args = parser.parse_args()
 
+    if len(args.ids) > 0:
+        download_records(args.ids)
+
     files = glob.glob('*.xml')
     for f in files:
         if 'datacite' not in f:

diff --git a/setup.py b/setup.py
@@ -1,8 +1,8 @@
 from setuptools import setup
 setup(
         name = 'epxml_to_datacite',
-        version ='0.9',
-        py_modules = ["caltech_thesis"],
+        version ='0.10',
+        py_modules = ["caltech_thesis","caltech_authors_tech_report"],
         data_files=[('.',['thesis-subjects.txt'])],
         install_requires=[
             'xmltodict',