Skip to content

Commit

Permalink
Complete round trip
Browse files Browse the repository at this point in the history
  • Loading branch information
tmorrell committed Dec 17, 2018
1 parent be301a3 commit a13d0e0
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 27 deletions.
88 changes: 68 additions & 20 deletions caltech_authors_tech_report.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,21 @@
import xmltodict
from datacite import DataCiteMDSClient,schema40
import glob,json,datetime,re,getpass
import os,argparse,subprocess
import os,argparse,subprocess,requests

def download_records(ids,username,password):
    """Download the EPrints XML for each record and save it as '<id>.xml'.

    ids      -- iterable of CaltechAUTHORS eprint IDs (strings or integers)
    username -- CaltechAUTHORS username, embedded in the REST URL
    password -- CaltechAUTHORS password, embedded in the REST URL

    Each record is fetched by running the external `eputil` command on the
    record's REST URL and written to the current working directory.
    Raises whatever subprocess.check_output raises if `eputil` is missing
    or exits with a non-zero status. Returns None.
    """
    #NOTE(review): the credentials are part of the URL handed to eputil on
    #its command line - confirm that exposure is acceptable
    #The base URL is the same for every record, so build it once
    url = 'https://'+username+':'+password+'@authors.library.caltech.edu/rest/eprint/'
    for idv in ids:
        record_url = url + str(idv) +'.xml'
        record = subprocess.check_output(["eputil",record_url],universal_newlines=True)
        #The context manager closes (and flushes) the file even if the write
        #fails; str() lets integer IDs name the file just as they build the URL
        with open(str(idv)+'.xml','w') as outfile:
            outfile.write(record)

def update_repo_doi(record_number,repo_url,identifier,username,password):
    """Send an identifier to a repository record's doi.txt REST endpoint.

    record_number -- eprint record number, placed in the endpoint path
    repo_url      -- base URL of the repository (no trailing slash)
    identifier    -- text sent as the body of the PUT request
    username      -- user name for HTTP authentication
    password      -- password for HTTP authentication

    Issues a PUT to '<repo_url>/rest/eprint/<record_number>/doi.txt' with a
    text/plain body and prints the response object. Returns None.
    """
    endpoint = ''.join([repo_url,'/rest/eprint/',str(record_number),'/doi.txt'])
    result = requests.put(
        endpoint,
        data=identifier,
        headers={'content-type':'text/plain'},
        auth=(username,password),
    )
    print(result)

def cleanhtml(raw_html):
cleanr = re.compile('<.*?>')
Expand Down Expand Up @@ -88,8 +102,11 @@ def epxml_to_datacite(eprint):
metadata['identifier'] = {'identifier':'10.5072/1','identifierType':"DOI"}

#Waterfall for determining series name and number
description = [{'descriptionType':"Abstract",\
if 'abstract' in eprint:
description = [{'descriptionType':"Abstract",\
'description':cleanhtml(eprint['abstract'])}]
else:
description = []
name_and_series = []
ids = []

Expand Down Expand Up @@ -119,8 +136,6 @@ def epxml_to_datacite(eprint):
name_and_series = [name,number]

#Save Series Info
description = [{'descriptionType':"Abstract",\
'description':cleanhtml(eprint['abstract'])}]
description +=\
[{'descriptionType':'SeriesInformation','description':name_and_series[0]+' '+name_and_series[1]}]
metadata['descriptions'] = description
Expand Down Expand Up @@ -197,28 +212,30 @@ def epxml_to_datacite(eprint):

return metadata

def download_records(ids):
username = input('Enter your CaltechAUTHORS username: ')
password = getpass.getpass()

for idv in ids:
url = 'https://'+username+':'+password+'@authors.library.caltech.edu/rest/eprint/'
record_url = url + str(idv) +'.xml'
record = subprocess.check_output(["eputil",record_url],universal_newlines=True)
outfile = open(idv+'.xml','w')
outfile.write(record)

if __name__ == '__main__':

parser = argparse.ArgumentParser(description=\
"Make DataCite standard metadata for records from CaltechAUTHORS and register DOIs")
parser.add_argument('-mint', action='store_true', help='Mint DOIs')
parser.add_argument('-test', action='store_true', help='Only register test DOI')
parser.add_argument('-ids',nargs='*',help="CaltechAUTHORS IDs to download XML files")
parser.add_argument('-id_file',nargs='*',help="TSV file with CaltechAUTHORS records to mint DOIs")
args = parser.parse_args()

if len(args.ids) > 0:
download_records(args.ids)
r_user = input('Enter your CaltechAUTHORS username: ')
r_pass = getpass.getpass()

if args.ids != None:
download_records(args.ids,r_user,r_pass)

if args.id_file != None:
with open(args.id_file[0]) as infile:
ids = []
reader = csv.reader(infile, delimiter='\t')
for row in reader:
if row[0] != 'Eprint ID':
ids.append(row[0])
download_records(ids,r_user,r_pass)

files = glob.glob('*.xml')
for f in files:
Expand All @@ -233,8 +250,11 @@ def download_records(ids):
metadata = epxml_to_datacite(eprint)

#Validation fails on Windows
valid = schema40.validate(metadata)
#Debugging if this fails
if os.name == 'nt':
    #Schema validation fails on Windows, so it is skipped there and the
    #metadata is treated as valid. This has to be an assignment: a bare
    #comparison (valid == True) has no effect and leaves `valid` unset
    valid = True
else:
    valid = schema40.validate(metadata)
#Debugging if verification fails
if valid == False:
v = schema40.validator.validate(metadata)
errors = sorted(v.iter_errors(instance), key=lambda e: e.path)
Expand All @@ -250,10 +270,20 @@ def download_records(ids):
outfile.write(xml)

else:

#What record in eprints are we dealing with?
record_number = eprint['eprintid']

if args.test== True:
prefix = '10.5072'
#Existing test record
record_number=5756
prefix = '10.33569'
url = 'https://mds.test.datacite.org'
repo_url = 'http://authorstest.library.caltech.edu'
else:
prefix = '10.7907'
url='https://mds.datacite.org'
repo_url = 'https://authors.library.caltech.edu'

#Get our DataCite password
infile = open('pw','r')
Expand All @@ -264,8 +294,19 @@ def download_records(ids):
username='CALTECH.LIBRARY',
password=password,
prefix=prefix,
url=url
)

#Double check if there is an existing identifier
if 'doi' in eprint:
print("Record ",eprint['eprintid']," already has a DOI: ",eprint['doi'])
print("Minting a new DOI will replace the one in Eprints")
print("But the origional DOI will still exist")
response = input("Are you SURE you want to mint a new DOI? (Type Yes to continue): ")
if response != 'Yes':
print("Exiting - please remove records where you don't want to mint DOIs")
exit()

#Provide prefix to let DataCite generate DOI
metadata['identifier'] = {'identifier':str(prefix),'identifierType':'DOI'}

Expand All @@ -275,5 +316,12 @@ def download_records(ids):
identifier = result.split('(')[1].split(')')[0]
d.doi_post(identifier,eprint['official_url'])
print('Minted DOI: '+identifier)
update_repo_doi(record_number,repo_url,identifier,r_user,r_pass)

response = input("Do you want to clean up the xml files in your local directory? (Y or N)")
if response == 'Y':
files = glob.glob('*.xml')
for f in files:
os.remove(f)


9 changes: 6 additions & 3 deletions caltech_thesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,8 +242,11 @@ def epxml_to_datacite(eprint):
metadata = epxml_to_datacite(eprint)

#Validation fails on Windows
valid = schema40.validate(metadata)
#Debugging if this fails
if os.name == 'nt':
    #Schema validation fails on Windows, so it is skipped there and the
    #metadata is treated as valid. This has to be an assignment: a bare
    #comparison (valid == True) has no effect and leaves `valid` unset
    valid = True
else:
    valid = schema40.validate(metadata)
#Debugging if verification fails
if valid == False:
v = schema40.validator.validate(metadata)
errors = sorted(v.iter_errors(instance), key=lambda e: e.path)
Expand Down Expand Up @@ -272,7 +275,7 @@ def epxml_to_datacite(eprint):
else:
prefix = '10.7907'
url='https://mds.datacite.org'
repo_url = 'https://authors.library.caltech.edu'
repo_url = 'https://thesis.library.caltech.edu'

#Get our DataCite password
infile = open('pw','r')
Expand Down
4 changes: 2 additions & 2 deletions codemeta.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"codeRepository": "https://github.com/caltechlibrary/epxml_to_datacite",
"issueTracker": "https://github.com/caltechlibrary/epxml_to_datacite/issues",
"license": "https://data.caltech.edu/license",
"version": "0.8.1",
"version": "0.11.0",
"author": [
{
"@type": "Person",
Expand All @@ -17,7 +17,7 @@
"@id": "https://orcid.org/0000-0001-9266-5146"
}],
"developmentStatus": "active",
"downloadUrl": "https://github.com/caltechlibrary/ames/archive/0.8.1.zip",
"downloadUrl": "https://github.com/caltechlibrary/ames/archive/0.11.0.zip",
"keywords": [
"GitHub",
"Eprints",
Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from setuptools import setup
setup(
name = 'epxml_to_datacite',
version ='0.10',
version ='0.11.0',
py_modules = ["caltech_thesis","caltech_authors_tech_report"],
data_files=[('.',['thesis-subjects.txt'])],
install_requires=[
'xmltodict',
'datacite'
'datacite',
'requests'
]
)

0 comments on commit a13d0e0

Please sign in to comment.