Skip to content

Commit

Permalink
Merge pull request #8 from caltechlibrary/rsdoiel-master
Browse files Browse the repository at this point in the history
Add in reporting improvements and CaltechDATA citation updating features
  • Loading branch information
tmorrell authored Feb 19, 2019
2 parents d039491 + 794d64f commit b6ee967
Show file tree
Hide file tree
Showing 15 changed files with 623 additions and 115 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,6 @@ CaltechDATA_Logo_cropped.png
build/
dist/
ames.egg-info/
# Test files
*.csv
updates
39 changes: 32 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ account using `export DATACITE=`
#### Usage
Type `python run_media_update.py`.

### CaltechDATA metadata updates
### CaltechDATA metadata checks

This will run checks on the quality of metadata in CaltechDATA. Currently this
verifies whether redundant links are present in the related identifier section.
Expand All @@ -96,6 +96,19 @@ verifies whether redundent links are present in the related identifier section.
You need to set environmental variables with your token to access
CaltechDATA `export TINDTOK=`

#### Usage
Type `python run_caltechdata_checks.py`.

### CaltechDATA metadata updates

This will improve the quality of metadata in CaltechDATA. Currently this
adds a recommended citation to the descriptions and can update metadata with
DataCite.

#### Setup
You need to set environmental variables with your token to access
CaltechDATA `export TINDTOK=`

#### Usage
Type `python run_caltechdata_updates.py`.

Expand All @@ -110,17 +123,29 @@ Matomo `export MATTOK=`
#### Usage
Type `python run_downloads.py`.

### CODA Reports (Feeds)
### CODA Reports

Runs reports on Caltech Library repositories. Current reports:

Harvest metadata from Caltech Library repositories and run reports. Current
report lists records (optionally filtered by year) and their DOIs.
- doi_report: Records (optionally filtered by year) and their DOIs.
- creator_report: Finds records where ORCIDs are known but not included. Also
lists cases where an author has two ORCIDs.
- file_report: Records that have potential problems with the attached files
- status_report: Reports on any records with an incorrect status in feeds

#### Usage
Type something like `python run_coda_report.py doi_report thesis report.tsv -year 1977-1978`

- The first option is the report type (doi_report is currently the only option)
- The first option is the report type
- Next is the repository (thesis or authors)
- Next is the output file name (anything, will show up in current directory)
- You can include a -year option to return records from a specific year (1977) or a
- Next is the output file name (include .csv or .tsv extension, will show up in current directory)
- Some reports include a -year option to return just the records from a specific year (1977) or a
range (1977-1978)

There are some additional optional arguments if you want to change the default behavior.
- Adding `-source eprints` will pull report data from Eprints instead of feeds. This is
very slow. You may need to add -username and -password to provide login
credentials
- Adding `-sample XXX` allows you to select a number of randomly selected records. This makes it
more reasonable to pull data directly from Eprints.

3 changes: 3 additions & 0 deletions ames/converters/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

from .codemeta_to_datacite import codemeta_to_datacite
from .epfmt import eprint_as_xml, eprint_as_json
58 changes: 58 additions & 0 deletions ames/converters/epfmt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#
# epfmt is a Python 3.7 wrapper for the functionality of
# the epfmt command line tool which is part of the eprinttools Go
# package
#
# For the Go package see https://github.com/caltechlibrary/eprinttools.
#
import os
import io
import json
import sys
from subprocess import run, Popen, PIPE


#
# eprint_as_xml takes a Python dict of EPrint content, like
# that fetched with eputil, and returns the object as EPrint XML.
#
def eprint_as_xml(eprint_obj):
    """Render an EPrint dict as EPrint XML with the ``epfmt`` command.

    The object is serialized to JSON and piped to ``epfmt -xml`` on
    standard input.

    Args:
        eprint_obj: JSON-serializable EPrint content.

    Returns:
        The decoded standard output of ``epfmt``, or ``None`` when the
        command could not be started or exited with a non-zero status.
    """
    src = json.dumps(eprint_obj)
    cmd = ['epfmt', '-xml']
    try:
        p = run(cmd, input=src.encode('utf-8'), capture_output=True)
    except OSError as e:
        # The command could not be launched (missing or not executable), so
        # there is no process result to inspect: report and give up here
        # instead of falling through to an unbound ``p``.
        sys.stderr.write(f"{e}\n")
        return None
    if p.returncode != 0:
        print(f"ERROR: {' '.join(cmd)}, exit code {p.returncode}")
        return None
    # capture_output without text mode yields bytes on stdout.
    return p.stdout.decode()

#
# eprint_as_json takes a Python dict of EPrint content, like
# that fetched with eputil, and returns the object in JSON format.
#
def eprint_as_json(eprint_obj):
    """Render an EPrint dict as JSON text with the ``epfmt`` command.

    The object is serialized to JSON and piped to ``epfmt -json`` on
    standard input.

    Args:
        eprint_obj: JSON-serializable EPrint content.

    Returns:
        The decoded standard output of ``epfmt``, or ``None`` when the
        command could not be started or exited with a non-zero status.
    """
    # json.dumps always returns str, so encode unconditionally for stdin.
    src = json.dumps(eprint_obj).encode('utf-8')
    cmd = ['epfmt', '-json']
    try:
        p = run(cmd, input=src, capture_output=True)
    except OSError as e:
        # The command could not be launched (missing or not executable), so
        # there is no process result to inspect: report and give up here
        # instead of falling through to an unbound ``p``.
        sys.stderr.write(f"{e}\n")
        return None
    if p.returncode != 0:
        print(f"ERROR: {' '.join(cmd)}, exit code {p.returncode}")
        return None
    # capture_output without text mode yields bytes on stdout.
    return p.stdout.decode()

1 change: 1 addition & 0 deletions ames/harvesters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .caltechdata import get_caltechdata
from .caltechfeeds import get_caltechfeed
from .matomo import get_downloads
from .eputil import get_eprint_keys, get_eprints, get_eprint
20 changes: 14 additions & 6 deletions ames/harvesters/caltechdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,40 @@
from caltechdata_api import decustomize_schema
import dataset

def get_caltechdata(collection, production=True, datacite=False):
    """Harvest CaltechDATA records into a local dataset collection.

    Fetches up to 1000 records from the records API and creates or updates
    one entry per record id in ``collection``.

    Args:
        collection: Path of the dataset collection; it is initialized when
            the directory does not exist yet.
        production: When true, read from data.caltech.edu; otherwise read
            from the cd-sandbox.tind.io instance.
        datacite: When false (default), store the enriched record (the
            ``decustomize_schema(..., True, True)`` form) plus the record's
            ``updated`` value. When true, store only the output of
            ``decustomize_schema(metadata)``.

    Raises:
        SystemExit: If the collection cannot be initialized.
        requests.HTTPError: If the API answers with an error status.
    """
    if not os.path.isdir(collection):
        ok = dataset.init(collection)
        # Explicit comparison kept: only a literal False is treated as failure.
        if ok == False:
            print("Dataset failed to init collection")
            # Non-zero status so calling scripts can detect the failure.
            raise SystemExit(1)

    if production:
        url = 'https://data.caltech.edu/api/records'
    else:
        url = 'https://cd-sandbox.tind.io/api/records'

    response = requests.get(url + '/?size=1000')
    # Fail with a clear HTTP error rather than a JSON/KeyError further down.
    response.raise_for_status()
    hits = response.json()

    print("Saving Records")

    for h in hits['hits']['hits']:
        rid = str(h['id'])
        print(rid)
        if not datacite:
            # Get enriched metadata records (including files)
            metadata = decustomize_schema(h['metadata'], True, True)
            metadata['updated'] = h['updated']
        else:
            # Get just DataCite metadata
            metadata = decustomize_schema(h['metadata'])

        result = dataset.has_key(collection, rid)

        if result == False:
            dataset.create(collection, rid, metadata)
        else:
            # Could check update date, but probably not worth it
            dataset.update(collection, rid, metadata)

def get_multiple_links(input_collection,output_collection):
Expand Down
129 changes: 129 additions & 0 deletions ames/harvesters/eputil.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#
# eputil.py is a Python 3.7 wrapper for the Go eprinttools
# eputil command line program.
#
# For Go package see https://github.com/caltechlibrary/eprinttools.
#
import os
import json
import sys
from subprocess import run, Popen, PIPE
from datetime import datetime, timedelta


#
# get_eprint_keys returns a list of keys available from the
# EPrints rest API indicated in the provided eprint_url.
#
# The eprint_url often takes a form containing a username/password
# for accessing the API. E.g.
#
# 'https://jane.doe:secret@eprints.example.edu'
#
def get_eprint_keys(eprint_url):
    """List the EPrint keys reported by ``eputil`` for a repository.

    Runs ``eputil -json <eprint_url>/rest/eprint/`` and parses its output.

    Args:
        eprint_url: Base URL of the EPrints REST API (may embed credentials).

    Returns:
        A list of keys as strings, an empty list when the command produced
        no output, or ``None`` when the command could not be started or
        exited with a non-zero status.
    """
    cmd = ['eputil', '-json', eprint_url + '/rest/eprint/']
    try:
        p = run(cmd, capture_output=True)
    except OSError as e:
        # The command could not be launched (missing or not executable), so
        # there is no process result to inspect: report and give up here
        # instead of falling through to an unbound ``p``.
        sys.stderr.write(f"{e}\n")
        return None
    if p.returncode != 0:
        # NOTE(review): cmd contains eprint_url, which may carry a password;
        # consider redacting it before printing.
        print(f"ERROR: {' '.join(cmd)}, exit code {p.returncode}")
        return None
    # capture_output without text mode yields bytes on stdout.
    src = p.stdout.decode()
    if src == "":
        return []
    return [f"{k}" for k in json.loads(src)]

#
# get_eprint returns a single EPrint element for given EPrint ID.
# via the EPrints rest API indicated in the provided eprint_url.
#
# The eprint_url often takes a form containing a username/password
# for accessing the API. E.g.
#
# 'https://jane.doe:secret@eprints.example.edu'
#
def get_eprint(eprint_url, eprint_id):
    """Fetch a single EPrint record through ``eputil``.

    Runs ``eputil -json <eprint_url>/rest/eprint/<eprint_id>.xml`` and
    parses its output.

    Args:
        eprint_url: Base URL of the EPrints REST API (may embed credentials).
        eprint_id: EPrint ID as a string (it is concatenated into the URL).

    Returns:
        The first element of the ``eprint`` list in the parsed output, an
        empty dict when the command produced no output, or ``None`` when the
        command could not be started, exited with a non-zero status, or
        returned no ``eprint`` entries.
    """
    cmd = ['eputil', '-json', eprint_url + '/rest/eprint/' + eprint_id + '.xml']
    try:
        p = run(cmd, capture_output=True)
    except OSError as e:
        # The command could not be launched (missing or not executable), so
        # there is no process result to inspect: report and give up here
        # instead of falling through to an unbound ``p``.
        sys.stderr.write(f"{e}\n")
        return None
    if p.returncode != 0:
        # NOTE(review): cmd contains eprint_url, which may carry a password;
        # consider redacting it before printing.
        print(f"ERROR: {' '.join(cmd)}, exit code {p.returncode}")
        return None
    # capture_output without text mode yields bytes on stdout.
    src = p.stdout.decode()
    if src == "":
        return {}
    obj = json.loads(src)
    if 'eprint' in obj and len(obj['eprint']) > 0:
        return obj['eprint'][0]
    return None

#
# get_eprints returns an EPrint element in List form
# for given EPrint ID via the EPrints rest API indicated in the
# provided eprint_url (the outer XML is <eprints>... rather
# than the inner XML of <eprints><eprint>...)
#
# The eprint_url often takes a form containing a username/password
# for accessing the API. E.g.
#
# 'https://jane.doe:secret@eprints.example.edu'
#
def get_eprints(eprint_url, eprint_id):
    """Fetch an EPrint record through ``eputil``, keeping the outer wrapper.

    Runs ``eputil -json <eprint_url>/rest/eprint/<eprint_id>.xml`` and
    returns the whole parsed object (the one holding the ``eprint`` list)
    rather than its first element.

    Args:
        eprint_url: Base URL of the EPrints REST API (may embed credentials).
        eprint_id: EPrint ID as a string (it is concatenated into the URL).

    Returns:
        The parsed object when it has a non-empty ``eprint`` list, an empty
        list when the command produced no output, or ``None`` when the
        command could not be started, exited with a non-zero status, or
        returned no ``eprint`` entries.
    """
    cmd = ['eputil', '-json', eprint_url + '/rest/eprint/' + eprint_id + '.xml']
    try:
        p = run(cmd, capture_output=True)
    except OSError as e:
        # The command could not be launched (missing or not executable), so
        # there is no process result to inspect: report and give up here
        # instead of falling through to an unbound ``p``.
        sys.stderr.write(f"{e}\n")
        return None
    if p.returncode != 0:
        # NOTE(review): cmd contains eprint_url, which may carry a password;
        # consider redacting it before printing.
        print(f"ERROR: {' '.join(cmd)}, exit code {p.returncode}")
        return None
    # capture_output without text mode yields bytes on stdout.
    src = p.stdout.decode()
    if src == "":
        return []
    obj = json.loads(src)
    if 'eprint' in obj and len(obj['eprint']) > 0:
        return obj
    return None

4 changes: 3 additions & 1 deletion ames/matchers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from .caltechdata import match_cd_refs
from .caltechdata import match_codemeta
from .caltechdata import fix_multiple_links
from .update_datacite import update_datacite_media
from .caltechdata import add_citation
from .datacite import update_datacite_metadata
from .datacite import update_datacite_media
32 changes: 32 additions & 0 deletions ames/matchers/caltechdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from caltechdata_api import caltechdata_edit
from ames import codemeta_to_datacite
import dataset
import requests

def match_cd_refs():

Expand Down Expand Up @@ -133,3 +134,34 @@ def fix_multiple_links(input_collection,token):
response = caltechdata_edit(token,k,new_metadata,{},{},True)
print(response)

def add_citation(collection, token, production=True):
    """Append a recommended citation to each record's descriptions.

    For every key in ``collection`` whose descriptions do not already hold
    an entry starting with ``<br>Cite this record as:``, an APA citation is
    requested from DataCite for the record's DOI, the DOI URL in it is turned
    into an HTML link, and the resulting description is pushed with
    ``caltechdata_edit``.

    Args:
        collection: Path of the dataset collection to read records from.
        token: Token passed through to ``caltechdata_edit``.
        production: Passed through to ``caltechdata_edit`` as its last
            argument.

    Raises:
        SystemExit: If a record cannot be read from the collection.
    """
    cite_prefix = '<br>Cite this record as:'
    citation_link = 'https://data.datacite.org/text/x-bibliography;style=apa/'

    for k in dataset.keys(collection):
        record, err = dataset.read(collection, k)
        if err != '':
            print(err)
            # Non-zero status so calling scripts can detect the failure.
            raise SystemExit(1)

        description = record['descriptions']
        if any(d['description'].startswith(cite_prefix) for d in description):
            # A citation was added on an earlier run.
            continue

        record_doi = record['identifier']['identifier']
        try:
            citation = requests.get(citation_link + record_doi, timeout=60).text
        except requests.RequestException as e:
            # Network trouble for one DOI should not abort the whole run;
            # the record is simply picked up again next time.
            print('Could not fetch citation for ' + record_doi + ': ' + str(e))
            continue

        doi_url = 'https://doi.org/' + record_doi.lower()
        if doi_url not in citation:
            # Check that we have a citation and not a server error,
            # otherwise wait till next time
            continue

        # Replace link text with HTML link
        citation = citation.replace(
            doi_url, '<a href="' + doi_url + '">' + doi_url + '</a>')
        n_txt = ('<br>Cite this record as:<br>' + citation
                 + '<br> or choose a <a href="https://crosscite.org/?doi='
                 + record_doi + '"> different citation style</a>')
        description.append({'descriptionType': 'Other', 'description': n_txt})
        response = caltechdata_edit(
            token, k, {'descriptions': description}, {}, {}, production)
        print(response)
Loading

0 comments on commit b6ee967

Please sign in to comment.