Skip to content

Commit

Permalink
Tech report and download automation
Browse files Browse the repository at this point in the history
  • Loading branch information
tmorrell committed Oct 30, 2018
1 parent 7085c68 commit e521da3
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 16 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ infile = open('10271.xml',encoding="utf8")
eprint = xmltodict.parse(infile.read())['eprints']['eprint']
```

## Downloading Eprints XML files

You can use Eprints IDs (e.g. 82938) to download Eprints XML files by adding an
`-ids` option to any command. This requires eputil to be installed on your local system.

## Using caltech_thesis.py

Download .xml files from thesis.library.caltech.edu/rest/eprint/1234.xml and put
Expand All @@ -41,3 +46,6 @@ DataCite XML files will appear. If you want to mint DOIs add the `-mint`
option and if you want to make test DOIs add the `-test` option to the command
line.

## Using caltech_authors_tech_report.py

This script only works with items of the `monograph` item type (Report or Paper).
57 changes: 46 additions & 11 deletions caltech_authors_tech_report.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import xmltodict
from datacite import DataCiteMDSClient,schema40
import glob,json,datetime,re
import glob,json,datetime,re,getpass
import os,argparse,subprocess

def cleanhtml(raw_html):
Expand All @@ -9,9 +9,24 @@ def cleanhtml(raw_html):
return cleantext

def epxml_to_datacite(eprint):


print(eprint['type'])
if eprint['type'] != 'monograph':
raise Exception("This code has only been tested on tech reports")

metadata = {}

item_types = {
"discussion_paper":"Discussion Paper",
"documentation":"Documentation",
"manual":"Manual",
"other":"Other",
"project_report":"Project Report",
"report":"Report",
"technical_report":"Technical Report",
"white_paper":"White Paper",
"working_paper":"Working Paper"}

#Transforming Metadata
#Creators
newa = []
Expand Down Expand Up @@ -65,7 +80,7 @@ def epxml_to_datacite(eprint):
metadata['publicationYear'] = eprint['date'].split('-')[0]
else:
metadata['publicationYear'] = eprint['date']
metadata['resourceType']={'resourceTypeGeneral':"Text",'resourceType':"Technical Report"}
metadata['resourceType']={'resourceTypeGeneral':"Text",'resourceType':item_types[eprint['monograph_type']]}

if 'doi' in eprint:
metadata['identifier'] = {'identifier':eprint['doi'],'identifierType':"DOI"}
Expand All @@ -86,28 +101,28 @@ def epxml_to_datacite(eprint):
for item in eprint['other_numbering_system']['item']:
ids.append({'alternateIdentifier':item['id'],'alternateIdentifierType':item['name']})

if 'series_name' in eprint and if 'number' in eprint:
if 'series_name' in eprint and 'number' in eprint:
name_and_series = [eprint['series_name'],eprint['number']]
elif 'other_numbering_system' in eprint:
ids = []
#Assume first is correct
item = eprint['other_numbering_system']['item'][0]
name_and_series = [item['name'],item['id']]
name_and_series = [item['name']['#text'],item['id']]
elif 'local_group' in eprint:
resolver = eprint['official_url'].split(':')
number = resolver[-1].split('.')[1]
number = resolver[-1]
name_and_series = [eprint['local_group']['item'],number]
else:
resolver = eprint['official_url'].split(':')
name = resolver[1].split('/')[-1]
number = resolver[-1].split('.')[1]
number = resolver[-1]
name_and_series = [name,number]

#Save Series Info
description = [{'descriptionType':"Abstract",\
'description':cleanhtml(eprint['abstract'])}]
description +=\
[{'descriptionType':'SeriesInfo','description',name_and_series[0]+' '+name_and_series[1]}]
[{'descriptionType':'SeriesInformation','description':name_and_series[0]+' '+name_and_series[1]}]
metadata['descriptions'] = description

ids.append({'alternateIdentifier':name_and_series[1],'alternateIdentifierType':name_and_series[0]})
Expand All @@ -117,14 +132,19 @@ def epxml_to_datacite(eprint):
metadata['language'] = 'English'

#Subjects
sub_arr = []
if "keywords" in eprint:
subjects = eprint['keywords'].split(';')
if len(subjects) == 1:
subjects = eprint['keywords'].split(',')
array = []
for s in subjects:
array.append({'subject':s.strip()})
metadata['subjects']=array
sub_arr.append({'subject':s.strip()})

if 'classification_code' in eprint:
sub_arr.append({'subject':eprint['classification_code']})

if len(sub_arr) != 0:
metadata['subjects']=sub_arr

if 'funders' in eprint:
array = []
Expand Down Expand Up @@ -177,14 +197,29 @@ def epxml_to_datacite(eprint):

return metadata

def download_records(ids):
    """Download the Eprints XML record for each CaltechAUTHORS id.

    Prompts interactively for a CaltechAUTHORS username and password, then
    fetches every record through the external ``eputil`` command and saves
    it as ``<id>.xml`` (relative path, so in the current working directory).

    Args:
        ids: Iterable of Eprints ids (strings or integers).

    Raises:
        subprocess.CalledProcessError: If ``eputil`` exits with a non-zero
            status for a record.
    """
    username = input('Enter your CaltechAUTHORS username: ')
    password = getpass.getpass()

    # The credentialed base URL is identical for every record; build it once.
    # NOTE(review): the password is embedded in the URL handed to eputil as a
    # command-line argument -- confirm that is acceptable on shared machines.
    url = 'https://'+username+':'+password+'@authors.library.caltech.edu/rest/eprint/'

    for idv in ids:
        record_url = url + str(idv) + '.xml'
        record = subprocess.check_output(["eputil", record_url], universal_newlines=True)
        # str(idv) keeps the file name consistent with the URL when ids are
        # integers; the with-block guarantees the file is flushed and closed.
        with open(str(idv) + '.xml', 'w') as outfile:
            outfile.write(record)

if __name__ == '__main__':

parser = argparse.ArgumentParser(description=\
"Make DataCite standard metadata for records from CaltechAUTHORS and register DOIs")
parser.add_argument('-mint', action='store_true', help='Mint DOIs')
parser.add_argument('-test', action='store_true', help='Only register test DOI')
parser.add_argument('-ids',nargs='*',help="CaltechAUTHORS IDs to download XML files")
args = parser.parse_args()

# argparse leaves args.ids as None when -ids is not given, so test
# truthiness instead of len() to avoid a TypeError on the no-download path.
if args.ids:
    download_records(args.ids)

files = glob.glob('*.xml')
for f in files:
if 'datacite' not in f:
Expand Down
20 changes: 17 additions & 3 deletions caltech_thesis.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,19 @@
import xmltodict
from datacite import DataCiteMDSClient,schema40
import glob,json,datetime,re
import glob,json,datetime,re,getpass
import os,argparse,subprocess

def download_records(ids):
    """Download the Eprints XML record for each CaltechTHESIS id.

    Prompts interactively for a CaltechTHESIS username and password, then
    fetches every record through the external ``eputil`` command and saves
    it as ``<id>.xml`` (relative path, so in the current working directory).

    Args:
        ids: Iterable of Eprints ids (strings or integers).

    Raises:
        subprocess.CalledProcessError: If ``eputil`` exits with a non-zero
            status for a record.
    """
    username = input('Enter your CaltechTHESIS username: ')
    password = getpass.getpass()

    # The credentialed base URL is identical for every record; build it once.
    # NOTE(review): the password is embedded in the URL handed to eputil as a
    # command-line argument -- confirm that is acceptable on shared machines.
    url = 'https://'+username+':'+password+'@thesis.library.caltech.edu/rest/eprint/'

    for idv in ids:
        record_url = url + str(idv) + '.xml'
        record = subprocess.check_output(["eputil", record_url], universal_newlines=True)
        # str(idv) keeps the file name consistent with the URL when ids are
        # integers; the with-block guarantees the file is flushed and closed.
        with open(str(idv) + '.xml', 'w') as outfile:
            outfile.write(record)

def cleanhtml(raw_html):
cleanr = re.compile('<.*?>')
cleantext = re.sub(cleanr, '', raw_html)
Expand Down Expand Up @@ -96,8 +107,7 @@ def epxml_to_datacite(eprint):
#Deal with single item listings
eprint['other_numbering_system']['item'] = [eprint['other_numbering_system']['item']]
for item in eprint['other_numbering_system']['item']:
print
ids.append({'alternateIdentifier':item['id'],'alternateIdentifierType':item['name']})
ids.append({'alternateIdentifier':item['id'],'alternateIdentifierType':item['name']['#text']})
metadata['alternateIdentifiers'] = ids

metadata['descriptions'] =[{'descriptionType':"Abstract",\
Expand Down Expand Up @@ -195,8 +205,12 @@ def epxml_to_datacite(eprint):
"Make DataCite standard metadata for records from CaltechTHESIS and register DOIs")
parser.add_argument('-mint', action='store_true', help='Mint DOIs')
parser.add_argument('-test', action='store_true', help='Only register test DOI')
parser.add_argument('-ids',nargs='*',help="CaltechTHESIS IDs to download XML files")
args = parser.parse_args()

# argparse leaves args.ids as None when -ids is not given, so test
# truthiness instead of len() to avoid a TypeError on the no-download path.
if args.ids:
    download_records(args.ids)

files = glob.glob('*.xml')
for f in files:
if 'datacite' not in f:
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from setuptools import setup
setup(
name = 'epxml_to_datacite',
version ='0.9',
py_modules = ["caltech_thesis"],
version ='0.10',
py_modules = ["caltech_thesis","caltech_authors_tech_report"],
data_files=[('.',['thesis-subjects.txt'])],
install_requires=[
'xmltodict',
Expand Down

0 comments on commit e521da3

Please sign in to comment.