Merge pull request #8 from caltechlibrary/rsdoiel-master

Add in reporting improvements and CaltechDATA citation updating features
caltechlibrary · Feb 19, 2019 · b6ee967 · b6ee967
2 parents d039491 + 794d64f
commit b6ee967
Show file tree

Hide file tree

Showing 15 changed files with 623 additions and 115 deletions.
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,6 @@ CaltechDATA_Logo_cropped.png
 build/
 dist/
 ames.egg-info/
+# Test files
+*.csv
+updates
diff --git a/README.md b/README.md
@@ -87,7 +87,7 @@ account using `export DATACITE=`
 #### Usage
 Type `python run_media_update.py`.  
 
-### CaltechDATA metadata updates
+### CaltechDATA metadata checks
 
 This will run checks on the quality of metadata in CaltechDATA.  Currently this
 verifies whether redundant links are present in the related identifier section.  
@@ -96,6 +96,19 @@ verifies whether redundent links are present in the related identifier section.
 You need to set environmental variables with your token to access
 CaltechDATA `export TINDTOK=`
 
+#### Usage
+Type `python run_caltechdata_checks.py`. 
+
+### CaltechDATA metadata updates
+
+This will improve the quality of metadata in CaltechDATA.  Currently this
+adds a recommended citation to the descriptions and can update metadata with
+DataCite.
+
+#### Setup
+You need to set environmental variables with your token to access
+CaltechDATA `export TINDTOK=`
+
 #### Usage
 Type `python run_caltechdata_updates.py`. 
 
@@ -110,17 +123,29 @@ Matomo `export MATTOK=`
 #### Usage
 Type `python run_downloads.py`. 
 
-### CODA Reports (Feeds)
+### CODA Reports
+
+Runs reports on Caltech Library repositories.  Current reports:
 
-Harvest metadata from Caltech Library repositories and run reports.  Current
-report lists records (optionally filtered by year) and their DOIs.
+- doi_report: Records (optionally filtered by year) and their DOIs.
+- creator_report: Finds records where ORCIDs are known but not included.  Also 
+  lists cases where an author has two ORCIDs
+- file_report: Records that have potential problems with the attached files
+- status_report: Reports on any records with an incorrect status in feeds
 
 #### Usage
 Type something like `python run_coda_report.py doi_report thesis report.tsv -year 1977-1978`
 
-- The first option is the report type (doi_report is currently the only option)
+- The first option is the report type 
 - Next is the repository (thesis or authors)
-- Next is the output file name (anything, will show up in current directory)
-- You can include a -year option to return records from a specific year (1977) or a
+- Next is the output file name (include .csv or .tsv extension, will show up in current directory)
+- Some reports include a -year option to return just the records from a specific year (1977) or a
 range (1977-1978)
 
+There are some additional optional arguments if you want to change the default behavior.
+- Adding `-source eprints` will pull report data from EPrints instead of feeds.  This is
+very slow.  You may need to add -username and -password to provide login
+credentials.
+- Adding `-sample XXX` restricts the report to XXX randomly selected records.  This makes it
+  more reasonable to pull data directly from EPrints.
+
diff --git a/ames/converters/__init__.py b/ames/converters/__init__.py
@@ -0,0 +1,3 @@
+
+from .codemeta_to_datacite import codemeta_to_datacite
+from .epfmt import eprint_as_xml, eprint_as_json
diff --git a/ames/converters/epfmt.py b/ames/converters/epfmt.py
@@ -0,0 +1,58 @@
+#
+# epfmt is a Python 3.7 wrapper for the functionality of
+# the epfmt command line tool which is part of the eprinttools Go
+# package
+#
+# For the Go package see https://github.com/caltechlibrary/eprinttools.
+#
+import os
+import io
+import json
+import sys
+from subprocess import run, Popen, PIPE
+
+
+#
+# eprint_as_xml takes a Python dict of EPrint content, like
+# that fetched with eputil, and returns the object as EPrint XML.
+#
+def eprint_as_xml(eprint_obj):
+    """Return eprint_obj rendered as EPrint XML by `epfmt -xml`.
+
+    Returns None, after reporting the problem, when epfmt cannot be
+    started or exits with a non-zero status.
+    """
+    cmd = ['epfmt', '-xml']
+    try:
+        p = run(cmd, input=json.dumps(eprint_obj).encode('utf-8'), capture_output=True)
+    except OSError as e:
+        # Without this return, `p` is unbound below (NameError).
+        sys.stderr.write(f"{e}\n")
+        return None
+    if p.returncode != 0:
+        print(f"ERROR: {' '.join(cmd)}, exit code {p.returncode}")
+        return None
+    return p.stdout.decode()
+
+#
+# eprint_as_json takes a Python dict of EPrint content, like
+# that fetched with eputil, and returns the object in JSON format.
+#
+def eprint_as_json(eprint_obj):
+    """Return eprint_obj rendered as EPrint JSON by `epfmt -json`.
+
+    Returns None, after reporting the problem, when epfmt cannot be
+    started or exits with a non-zero status.
+    """
+    cmd = ['epfmt', '-json']
+    try:
+        p = run(cmd, input=json.dumps(eprint_obj).encode('utf-8'), capture_output=True)
+    except OSError as e:
+        # Without this return, `p` is unbound below (NameError).
+        sys.stderr.write(f"{e}\n")
+        return None
+    if p.returncode != 0:
+        print(f"ERROR: {' '.join(cmd)}, exit code {p.returncode}")
+        return None
+    return p.stdout.decode()
+
diff --git a/ames/harvesters/__init__.py b/ames/harvesters/__init__.py
@@ -3,3 +3,4 @@
 from .caltechdata import get_caltechdata
 from .caltechfeeds import get_caltechfeed
 from .matomo import get_downloads
+from .eputil import get_eprint_keys, get_eprints, get_eprint
diff --git a/ames/harvesters/caltechdata.py b/ames/harvesters/caltechdata.py
@@ -3,32 +3,40 @@
 from caltechdata_api import decustomize_schema
 import dataset
 
-def get_caltechdata(collection):
+def get_caltechdata(collection,production=True,datacite=False):
 
     if os.path.isdir(collection) == False:
         ok = dataset.init(collection)
         if ok == False:
             print("Dataset failed to init collection")
             exit()
-
-    url = 'https://data.caltech.edu/api/records'
+    if production == True:
+        url = 'https://data.caltech.edu/api/records'
+    else:
+        url = 'https://cd-sandbox.tind.io/api/records'
 
     response = requests.get(url+'/?size=1000')
     hits = response.json()
 
     print("Saving Records")
+
     for h in hits['hits']['hits']:
         rid = str(h['id'])
         print(rid)
-        metadata = decustomize_schema(h['metadata'],True,True)
-        metadata['updated'] = h['updated']
+        #Get enriched metadata records (including files)
+        if datacite == False:
+            metadata = decustomize_schema(h['metadata'],True,True)
+            metadata['updated'] = h['updated']
+        else:
+            #Get just DataCite metadata
+            metadata = decustomize_schema(h['metadata'])           
 
         result = dataset.has_key(collection,rid)
 
         if result == False:
             dataset.create(collection,rid, metadata)
         else:
-            #Could check update data, but probably not worth it
+            #Could check update date, but probably not worth it
             dataset.update(collection,rid, metadata)
 
 def get_multiple_links(input_collection,output_collection):

diff --git a/ames/harvesters/eputil.py b/ames/harvesters/eputil.py
@@ -0,0 +1,129 @@
+#
+# eputil.py is a Python 3.7 wrapper for the Go eprinttools
+# eputil command line program.
+# 
+# For Go package see https://github.com/caltechlibrary/eprinttools.
+#
+import os
+import json
+import sys
+from subprocess import run, Popen, PIPE
+from datetime import datetime, timedelta
+
+
+#
+# get_eprint_keys  returns a list of keys available from the
+# EPrints rest API indicated in the provided eprint_url. 
+#
+# The eprint_url is often in a form containing a username/password
+# for accessing the API. E.g. 
+#
+#     'https://jane.doe:secret@eprints.example.edu'
+#
+def get_eprint_keys(eprint_url):
+    """Return the EPrint keys listed by the REST API at eprint_url.
+
+    Runs `eputil -json <eprint_url>/rest/eprint/` and returns each entry
+    of the JSON list it prints, converted to a string.
+
+    Returns [] when eputil prints nothing, and None, after reporting the
+    problem, when eputil cannot be started or exits with a non-zero status.
+    """
+    cmd = ['eputil', '-json', eprint_url + '/rest/eprint/']
+    try:
+        p = run(cmd, capture_output=True)
+    except OSError as e:
+        # Without this return, `p` is unbound below (NameError).
+        sys.stderr.write(f"{e}\n")
+        return None
+
+    if p.returncode != 0:
+        # NOTE(review): cmd includes eprint_url, which may embed login
+        # credentials; consider redacting it before printing.
+        print(f"ERROR: {' '.join(cmd)}, exit code {p.returncode}")
+        return None
+    # capture_output without text mode yields bytes, so decode directly;
+    # the old isinstance/type checks could never take their other branch.
+    src = p.stdout.decode()
+    if src == "":
+        return []
+    return [f"{k}" for k in json.loads(src)]
+
+#
+# get_eprint returns a single EPrint element for given EPrint ID.
+# via the EPrints rest API indicated in the provided eprint_url. 
+#
+# The eprint_url is often in a form containing a username/password
+# for accessing the API. E.g. 
+#
+#     'https://jane.doe:secret@eprints.example.edu'
+#
+def get_eprint(eprint_url, eprint_id):
+    """Return the EPrint record with ID eprint_id, fetched through eputil.
+
+    Runs `eputil -json <eprint_url>/rest/eprint/<eprint_id>.xml` and returns
+    the first element of the 'eprint' list in the JSON it prints.
+
+    Returns {} when eputil prints nothing, and None when the output has no
+    'eprint' entries, eputil cannot be started, or it exits non-zero.
+    """
+    cmd = ['eputil', '-json', eprint_url + '/rest/eprint/' + eprint_id + '.xml']
+    try:
+        p = run(cmd, capture_output=True)
+    except OSError as e:
+        # Without this return, `p` is unbound below (NameError).
+        sys.stderr.write(f"{e}\n")
+        return None
+
+    if p.returncode != 0:
+        # NOTE(review): cmd includes eprint_url, which may embed credentials.
+        print(f"ERROR: {' '.join(cmd)}, exit code {p.returncode}")
+        return None
+    src = p.stdout.decode()
+    if src == "":
+        return {}
+    obj = json.loads(src)
+    if 'eprint' in obj and len(obj['eprint']) > 0:
+        return obj['eprint'][0]
+    return None
+
+#
+# get_eprints returns an EPrint element in List form
+# for given EPrint ID via the EPrints rest API indicated in the 
+# provided eprint_url (the outer XML is <eprints>... rather
+# than the inner XML of <eprints><eprint>...)
+#
+# The eprint_url is often in a form containing a username/password
+# for accessing the API. E.g. 
+#
+#     'https://jane.doe:secret@eprints.example.edu'
+#
+def get_eprints(eprint_url, eprint_id):
+    """Return the parsed eputil output for eprint_id, outer object included.
+
+    Runs `eputil -json <eprint_url>/rest/eprint/<eprint_id>.xml` and returns
+    the whole decoded JSON object when its 'eprint' list is non-empty.
+
+    Returns [] when eputil prints nothing, and None when the output has no
+    'eprint' entries, eputil cannot be started, or it exits non-zero.
+    """
+    cmd = ['eputil', '-json', eprint_url + '/rest/eprint/' + eprint_id + '.xml']
+    try:
+        p = run(cmd, capture_output=True)
+    except OSError as e:
+        # Without this return, `p` is unbound below (NameError).
+        sys.stderr.write(f"{e}\n")
+        return None
+
+    if p.returncode != 0:
+        # NOTE(review): cmd includes eprint_url, which may embed credentials.
+        print(f"ERROR: {' '.join(cmd)}, exit code {p.returncode}")
+        return None
+    src = p.stdout.decode()
+    if src == "":
+        return []
+    obj = json.loads(src)
+    if 'eprint' in obj and len(obj['eprint']) > 0:
+        return obj
+    return None
+
diff --git a/ames/matchers/__init__.py b/ames/matchers/__init__.py
@@ -1,4 +1,6 @@
 from .caltechdata import match_cd_refs
 from .caltechdata import match_codemeta
 from .caltechdata import fix_multiple_links
-from .update_datacite import update_datacite_media
+from .caltechdata import add_citation
+from .datacite import update_datacite_metadata
+from .datacite import update_datacite_media
diff --git a/ames/matchers/caltechdata.py b/ames/matchers/caltechdata.py
@@ -2,6 +2,7 @@
 from caltechdata_api import caltechdata_edit
 from ames import codemeta_to_datacite
 import dataset
+import requests
 
 def match_cd_refs():
 
@@ -133,3 +134,34 @@ def fix_multiple_links(input_collection,token):
                     response = caltechdata_edit(token,k,new_metadata,{},{},True)
                     print(response)
 
+def add_citation(collection,token,production=True):
+    """Append a recommended-citation description to records lacking one.
+
+    For every key in collection, requests an APA citation for the record DOI
+    from DataCite and, when the response contains the DOI link, adds it as an
+    'Other' description via caltechdata_edit. Exits on a dataset read error.
+    """
+    marker = '<br>Cite this record as:'
+    bib_service =\
+    'https://data.datacite.org/text/x-bibliography;style=apa/'
+    for key in dataset.keys(collection):
+        record, err = dataset.read(collection, key)
+        if err != '':
+            print(err)
+            exit()
+        descriptions = record['descriptions']
+        # A list, not a generator, so every description is inspected.
+        if any([d['description'].startswith(marker) for d in descriptions]):
+            continue
+        doi = record['identifier']['identifier']
+        citation = requests.get(bib_service + doi).text
+        doi_url = 'https://doi.org/' + doi.lower()
+        if doi_url not in citation:
+            # No usable citation (e.g. a server error); retry on a later run.
+            continue
+        linked = citation.replace(doi_url, '<a href="' + doi_url + '">' + doi_url + '</a>')
+        note = (marker + '<br>' + linked
+                + '<br> or choose a <a href="https://crosscite.org/?doi='
+                + doi + '"> different citation style</a>')
+        descriptions.append({'descriptionType':'Other','description':note})
+        print(caltechdata_edit(token, key, {'descriptions': descriptions}, {}, {}, production))
-Original file line number
+Diff line change
@@ Expand Up / @@ -10,3 +10,6 @@ CaltechDATA_Logo_cropped.png @@
     build/
     dist/
     ames.egg-info/
+    # Test files
+    *.csv
+    updates
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@

		from .codemeta_to_datacite import codemeta_to_datacite
		from .epfmt import eprint_as_xml, eprint_as_json