Played further with cross-reference scoring and id.loc.gov API issues

cmharlow · Oct 7, 2015 · 57b42a6 · 57b42a6
1 parent 438e06c
commit 57b42a6
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 14 deletions.
diff --git a/reconcile.py b/reconcile.py
@@ -30,19 +30,19 @@
 
 #Map the LoC query indexes to service types
 default_query = {
-    "id": "/lc",
+    "id": "LoC",
     "name": "LCNAF & LCSH",
     "index": "/authorities"
 }
 
 refine_to_lc = [
     {
-        "id": "/lc/names",
+        "id": "Names",
         "name": "Library of Congress Name Authority File",
         "index": "/authorities/names"
     },
     {
-        "id": "/lc/subjects",
+        "id": "Subjects",
         "name": "Library of Congress Subject Headings",
         "index": "/authorities/subjects"
     }
@@ -58,7 +58,7 @@
     "defaultTypes": query_types,
     "view": {
         "url": "{{id}}"
-    }
+    },
 }
 
 def jsonpify(obj):
@@ -94,18 +94,18 @@ def search(raw_query, query_type='/lc'):
     except getopt.GetoptError as e:
         app.logger.warning(e)
         return out
-    def score_xref(xref): return fuzz.token_sort_ratio(query, text.normalize(xref, PY3))
     for n in range(0, len(results[1])):
         match = False
         name = results[1][n]
         lc_uri = results[3][n]
-        #Get cross-refs from URI SKOS Ntriples graph - if exist, compare against name for highest score
+        #Get cross-refs from the SKOS N-Triples graph at each query result's URI - if any exist, compare them against the name and keep the highest score
         crossRef = rdflib.Graph()
         crossRefnt = crossRef.parse(lc_uri + '.skos.nt', format='n3')
         uri = rdflib.URIRef(lc_uri)
-        #Get max score for label found and cross-refs
         xrefs = crossRefnt.objects(subject=uri, predicate=SKOS.altLabel)
-        score = reduce(max,map(score_xref,xrefs),score_xref(name))
+        #Get max score for label found and cross-refs
+        def labelsScore(foundLabel): return fuzz.token_sort_ratio(query, text.normalize(foundLabel, PY3))
+        score = reduce(max,map(labelsScore,xrefs),labelsScore(name))
         if score > 95:
             match = True
         app.logger.debug("Label is " + name + " Score is " + str(score) + " URI is " + lc_uri)

diff --git a/text.py b/text.py
@@ -1,16 +1,12 @@
 """
 Taken from the Helmut project.
 https://github.com/okfn/helmut/blob/master/helmut/text.py
+Currently being reviewed and modified for LoC optimization.
 """
 
 from unicodedata import normalize as ucnorm, category
 
 def normalize(text, PY3):
-    """ Simplify a piece of text to generate a more canonical
-    representation. This involves lowercasing, stripping trailing
-    spaces, removing symbols, diacritical marks (umlauts) and
-    converting all newlines etc. to single spaces.
-    """
     if PY3:
         if not isinstance(text, str):
             str(text, 'utf-8')
@@ -30,7 +26,7 @@ def normalize(text, PY3):
         elif cat.startswith('Z'):
             # newlines, non-breaking etc.
             filtered.append(' ')
-        elif cat.startswith('S'):
+        # elif cat.startswith('S'):
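             # symbols, such as currency (the 'S' branch header above is commented out, so the `continue` below now belongs to the 'Z' branch and symbols fall through to `else`)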
             continue
         else: