Skip to content

Commit

Permalink
played further with crossref, id.loc.gov api issues
Browse files Browse the repository at this point in the history
  • Loading branch information
cmharlow committed Oct 7, 2015
1 parent 438e06c commit 57b42a6
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 14 deletions.
16 changes: 8 additions & 8 deletions reconcile.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,19 @@

#Map the LoC query indexes to service types
default_query = {
"id": "/lc",
"id": "LoC",
"name": "LCNAF & LCSH",
"index": "/authorities"
}

refine_to_lc = [
{
"id": "/lc/names",
"id": "Names",
"name": "Library of Congress Name Authority File",
"index": "/authorities/names"
},
{
"id": "/lc/subjects",
"id": "Subjects",
"name": "Library of Congress Subject Headings",
"index": "/authorities/subjects"
}
Expand All @@ -58,7 +58,7 @@
"defaultTypes": query_types,
"view": {
"url": "{{id}}"
}
},
}

def jsonpify(obj):
Expand Down Expand Up @@ -94,18 +94,18 @@ def search(raw_query, query_type='/lc'):
except getopt.GetoptError as e:
app.logger.warning(e)
return out
def score_xref(xref): return fuzz.token_sort_ratio(query, text.normalize(xref, PY3))
for n in range(0, len(results[1])):
match = False
name = results[1][n]
lc_uri = results[3][n]
#Get cross-refs from URI SKOS Ntriples graph - if exist, compare against name for highest score
#Get cross-refs from the URI's SKOS N-Triples graph for each query result - if any exist, compare them against the name and keep the highest score
crossRef = rdflib.Graph()
crossRefnt = crossRef.parse(lc_uri + '.skos.nt', format='n3')
uri = rdflib.URIRef(lc_uri)
#Get max score for label found and cross-refs
xrefs = crossRefnt.objects(subject=uri, predicate=SKOS.altLabel)
score = reduce(max,map(score_xref,xrefs),score_xref(name))
#Get max score for label found and cross-refs
def labelsScore(foundLabel): return fuzz.token_sort_ratio(query, text.normalize(foundLabel, PY3))
score = reduce(max,map(labelsScore,xrefs),labelsScore(name))
if score > 95:
match = True
app.logger.debug("Label is " + name + " Score is " + str(score) + " URI is " + lc_uri)
Expand Down
8 changes: 2 additions & 6 deletions text.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
"""
Taken from the Helmut project.
https://github.com/okfn/helmut/blob/master/helmut/text.py
Currently being reviewed and modified for LoC optimization.
"""

from unicodedata import normalize as ucnorm, category

def normalize(text, PY3):
""" Simplify a piece of text to generate a more canonical
representation. This involves lowercasing, stripping trailing
spaces, removing symbols and diacritical marks (e.g. umlauts), and
converting all newlines etc. to single spaces.
"""
if PY3:
if not isinstance(text, str):
str(text, 'utf-8')
Expand All @@ -30,7 +26,7 @@ def normalize(text, PY3):
elif cat.startswith('Z'):
# newlines, non-breaking etc.
filtered.append(' ')
elif cat.startswith('S'):
# elif cat.startswith('S'):
# symbols, such as currency
continue
else:
Expand Down

0 comments on commit 57b42a6

Please sign in to comment.