Complete round trip

caltechlibrary · Dec 17, 2018 · a13d0e0 · a13d0e0
1 parent be301a3
commit a13d0e0
Show file tree

Hide file tree

Showing 4 changed files with 79 additions and 27 deletions.
diff --git a/caltech_authors_tech_report.py b/caltech_authors_tech_report.py
@@ -1,7 +1,21 @@
 import xmltodict
 from datacite import DataCiteMDSClient,schema40
 import glob,json,datetime,re,getpass
-import os,argparse,subprocess
+import os,argparse,subprocess,requests
+
+def download_records(ids,username,password):
+    """Download the EPrints XML for each record and save it as '<id>.xml'.
+
+    ids -- iterable of CaltechAUTHORS eprint IDs (strings or ints)
+    username, password -- repository credentials, embedded in the REST URL
+
+    Each record is fetched by running the external `eputil` command and the
+    output is written to the current working directory. A non-zero exit
+    from `eputil` raises subprocess.CalledProcessError.
+    """
+    # The base URL does not depend on the record, so build it once.
+    # NOTE(review): credentials are not percent-encoded; characters such as
+    # '@' or '/' in the password may break the URL - confirm.
+    url = 'https://'+username+':'+password+'@authors.library.caltech.edu/rest/eprint/'
+    for idv in ids:
+        record_url = url + str(idv) +'.xml'
+        record = subprocess.check_output(["eputil",record_url],universal_newlines=True)
+        # Use str(idv) for the filename as well (matches the URL) and let the
+        # context manager close the file so the data is flushed promptly.
+        with open(str(idv)+'.xml','w') as outfile:
+            outfile.write(record)
+
+def update_repo_doi(record_number,repo_url,identifier,username,password):
+    """Write `identifier` into the repository record's DOI field.
+
+    Sends an HTTP PUT with a plain-text body to
+    <repo_url>/rest/eprint/<record_number>/doi.txt, authenticating with
+    (username, password), and prints the response object that comes back.
+    """
+    endpoint = ''.join([repo_url, '/rest/eprint/', str(record_number), '/doi.txt'])
+    reply = requests.put(
+        endpoint,
+        data=identifier,
+        headers={'content-type':'text/plain'},
+        auth=(username,password),
+    )
+    print(reply)
 
 def cleanhtml(raw_html):
     cleanr = re.compile('<.*?>')
@@ -88,8 +102,11 @@ def epxml_to_datacite(eprint):
             metadata['identifier'] = {'identifier':'10.5072/1','identifierType':"DOI"}
 
     #Waterfall for determining series name and number
-    description = [{'descriptionType':"Abstract",\
+    if 'abstract' in eprint:
+        description = [{'descriptionType':"Abstract",\
             'description':cleanhtml(eprint['abstract'])}]
+    else:
+        description = []
     name_and_series = []
     ids = []
 
@@ -119,8 +136,6 @@ def epxml_to_datacite(eprint):
         name_and_series = [name,number]
 
     #Save Series Info
-    description = [{'descriptionType':"Abstract",\
-            'description':cleanhtml(eprint['abstract'])}]
     description +=\
             [{'descriptionType':'SeriesInformation','description':name_and_series[0]+' '+name_and_series[1]}] 
     metadata['descriptions'] = description
@@ -197,28 +212,30 @@ def epxml_to_datacite(eprint):
 
     return metadata
 
-def download_records(ids):
-    username = input('Enter your CaltechAUTHORS username: ')
-    password = getpass.getpass()
-
-    for idv in ids:
-        url = 'https://'+username+':'+password+'@authors.library.caltech.edu/rest/eprint/'
-        record_url = url + str(idv) +'.xml'
-        record = subprocess.check_output(["eputil",record_url],universal_newlines=True)
-        outfile = open(idv+'.xml','w')
-        outfile.write(record)
-
 if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(description=\
         "Make DataCite standard metadata for records from CaltechAUTHORS and register DOIs")
     parser.add_argument('-mint', action='store_true', help='Mint DOIs')
     parser.add_argument('-test', action='store_true', help='Only register test DOI')
     parser.add_argument('-ids',nargs='*',help="CaltechAUTHORS IDs to download XML files")
+    parser.add_argument('-id_file',nargs='*',help="TSV file with CaltechAUTHORS records to mint DOIs")
     args = parser.parse_args()
 
-    if len(args.ids) > 0:
-        download_records(args.ids)
+    r_user = input('Enter your CaltechAUTHORS username: ')
+    r_pass = getpass.getpass()
+
+    # Download any records named directly on the command line.
+    # Truthiness covers both "option absent" (None) and "option given with no values" ([]).
+    if args.ids:
+        download_records(args.ids,r_user,r_pass)
+
+    # Download the records listed in the first column of a tab-separated file.
+    if args.id_file:
+        # Local import: csv is not among the import lines visible at the top of this file.
+        import csv
+        # newline='' is what the csv module expects for files it parses.
+        with open(args.id_file[0], newline='') as infile:
+            reader = csv.reader(infile, delimiter='\t')
+            # Skip blank rows (which have no column 0) and the 'Eprint ID' header row.
+            ids = [row[0] for row in reader if row and row[0] != 'Eprint ID']
+        download_records(ids,r_user,r_pass)
 
     files = glob.glob('*.xml')
     for f in files:
@@ -233,8 +250,11 @@ def download_records(ids):
             metadata = epxml_to_datacite(eprint)
 
             #Validation fails on Windows
-            valid =  schema40.validate(metadata)
-            #Debugging if this fails
+            if os.name == 'nt':
+                # Assign (not compare): validation is skipped on Windows, so
+                # `valid` must still be defined for the check that follows.
+                valid = True
+            else:
+                valid =  schema40.validate(metadata)
+            #Debugging if verification fails
             if valid == False:
                 v = schema40.validator.validate(metadata)
                 errors = sorted(v.iter_errors(instance), key=lambda e: e.path)
@@ -250,10 +270,20 @@ def download_records(ids):
                 outfile.write(xml)
 
             else:
+
+                #What record in eprints are we dealing with?
+                record_number = eprint['eprintid']
+
                 if args.test== True:
-                    prefix = '10.5072'
+                    #Existing test record
+                    record_number=5756
+                    prefix = '10.33569'
+                    url = 'https://mds.test.datacite.org'
+                    repo_url = 'http://authorstest.library.caltech.edu'
                 else:
                     prefix = '10.7907'
+                    url='https://mds.datacite.org'
+                    repo_url = 'https://authors.library.caltech.edu'
 
                 #Get our DataCite password
                 infile = open('pw','r')
@@ -264,8 +294,19 @@ def download_records(ids):
                 username='CALTECH.LIBRARY',
                 password=password,
                 prefix=prefix,
+                url=url
                 )
 
+                #Double check if there is an existing identifier
+                if 'doi' in eprint:
+                    print("Record ",eprint['eprintid']," already has a DOI: ",eprint['doi'])
+                    print("Minting a new DOI will replace the one in Eprints")
+                    print("But the origional DOI will still exist")
+                    response = input("Are you SURE you want to mint a new DOI? (Type Yes to continue): ")
+                    if response != 'Yes':
+                        print("Exiting - please remove records where you don't want to mint DOIs")
+                        exit()
+
                 #Provide prefix to let DataCite generate DOI
                 metadata['identifier'] = {'identifier':str(prefix),'identifierType':'DOI'}
 
@@ -275,5 +316,12 @@ def download_records(ids):
                 identifier = result.split('(')[1].split(')')[0]
                 d.doi_post(identifier,eprint['official_url'])
                 print('Minted DOI: '+identifier)
+                update_repo_doi(record_number,repo_url,identifier,r_user,r_pass)
+
+    response = input("Do you want to clean up the xml files in your local directory? (Y or N)")
+    if response == 'Y':
+        files = glob.glob('*.xml')
+        for f in files:
+            os.remove(f)
 
 
diff --git a/caltech_thesis.py b/caltech_thesis.py
@@ -242,8 +242,11 @@ def epxml_to_datacite(eprint):
             metadata = epxml_to_datacite(eprint)
 
             #Validation fails on Windows
-            valid =  schema40.validate(metadata)
-            #Debugging if this fails
+            if os.name == 'nt':
+                # Assign (not compare): validation is skipped on Windows, so
+                # `valid` must still be defined for the check that follows.
+                valid = True
+            else:
+                valid =  schema40.validate(metadata)
+            #Debugging if verification fails
             if valid == False:
                 v = schema40.validator.validate(metadata)
                 errors = sorted(v.iter_errors(instance), key=lambda e: e.path)
@@ -272,7 +275,7 @@ def epxml_to_datacite(eprint):
                 else:
                     prefix = '10.7907'
                     url='https://mds.datacite.org'
-                    repo_url = 'https://authors.library.caltech.edu'
+                    repo_url = 'https://thesis.library.caltech.edu'
 
                 #Get our DataCite password
                 infile = open('pw','r')

diff --git a/codemeta.json b/codemeta.json
@@ -6,7 +6,7 @@
   "codeRepository": "https://github.com/caltechlibrary/epxml_to_datacite",
   "issueTracker": "https://github.com/caltechlibrary/epxml_to_datacite/issues",
   "license": "https://data.caltech.edu/license",
-  "version": "0.8.1",
+  "version": "0.11.0",
   "author": [
     {
       "@type": "Person",
@@ -17,7 +17,7 @@
       "@id": "https://orcid.org/0000-0001-9266-5146"
     }],
   "developmentStatus": "active",
-  "downloadUrl": "https://github.com/caltechlibrary/ames/archive/0.8.1.zip",
+  "downloadUrl": "https://github.com/caltechlibrary/ames/archive/0.11.0.zip",
   "keywords": [
     "GitHub",
     "Eprints",

diff --git a/setup.py b/setup.py
@@ -1,11 +1,12 @@
 from setuptools import setup
 setup(
         name = 'epxml_to_datacite',
-        version ='0.10',
+        version ='0.11.0',
         py_modules = ["caltech_thesis","caltech_authors_tech_report"],
         data_files=[('.',['thesis-subjects.txt'])],
         install_requires=[
             'xmltodict',
-            'datacite'
+            'datacite',
+            'requests'
         ]
     )