Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

1017 add notations to spatial objects #1020

Merged
merged 3 commits into from
Sep 12, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions src/main/java/de/hbz/lobid/helper/CreateWikidataNwbibMaps.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
* @author Pascal Christoph (dr0i)
*
*/
public class CreateWikidataNwbibMaps {
public final class CreateWikidataNwbibMaps {
private static final Logger LOG =
LogManager.getLogger(CreateWikidataNwbibMaps.class);
private static final String WARN =
Expand Down Expand Up @@ -64,17 +64,19 @@ public static void main(String... args) {
Resource res = it.next();
if (res.hasProperty(FOCUS))
sb.append(res + "\t"
+ res.getProperty(PREFLABEL).getLiteral().getLexicalForm() + ","
+ res.getProperty(NOTATION).getLiteral().getLexicalForm() + ","
+ res.getProperty(FOCUS).getObject() + "\n");
+ res.getProperty(PREFLABEL).getLiteral().getLexicalForm() + "|"
+ (res.hasProperty(NOTATION)
? res.getProperty(NOTATION).getLiteral().getLexicalForm() : "")
+ "|" + res.getProperty(FOCUS).getObject() + "\n");
}
if (sb.length() < 3000) {
LOG.warn("nwbib-spatial.ttl not large enough." + WARN);
} else {
try {
FileUtils.writeStringToFile(TEST_FN, sb.toString(),
StandardCharsets.UTF_8);
LOG.info("Success: created 'nwbib-spatial.tsv'");
LOG.info(
"Success: created 'nwbib-spatial.tsv from skos file at lobid.org'");
} catch (IOException e) {
LOG.warn("Couldn't write file." + WARN, e);
}
Expand Down
49 changes: 44 additions & 5 deletions src/main/java/org/lobid/resources/run/WikidataGeodata2Es.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
import com.ning.http.client.AsyncHttpClient;
import com.ning.http.client.Response;

import de.hbz.lobid.helper.CreateWikidataNwbibMaps;

/**
* Indexing wikidata geo data into our geo-enrichment service. Gets a set of
* Wikidata Entities via a SPARQL query. These entities are looked up, mapped to
Expand All @@ -61,6 +63,7 @@ public class WikidataGeodata2Es {
private static final Logger LOG =
LogManager.getLogger(WikidataGeodata2Es.class);
private static HashMap<String, String> qidMap = new HashMap<>();
private static HashMap<String, String> notationMap = new HashMap<>();
private static BufferedReader lineReader;
/** This is the root node of the geo data. */
public static final String FOCUS = "focus";
Expand All @@ -69,8 +72,11 @@ public class WikidataGeodata2Es {
private static boolean indexExists = false;
private final static String NWBIB_SPATIAL_PREFIX =
"https://nwbib.de/spatial#";
private static String qidCsvFn = "src/main/resources/string2wikidata.tsv";

private final static String PATH_TO_RESOURCES = "src/main/resources/";
private final static String QID_CSV_FN =
PATH_TO_RESOURCES + "string2wikidata.tsv";
private final static String NOTATION_CSV_FN =
PATH_TO_RESOURCES + "nwbib-spatial.tsv";
/**
* This maps the nwbib location codes to wikidata entities.
*/
Expand Down Expand Up @@ -121,6 +127,7 @@ public class WikidataGeodata2Es {
*/
public static void main(String... args)
throws UnsupportedEncodingException, IOException {
CreateWikidataNwbibMaps.main();
String indexName = INDEX_ALIAS_PREFIX + "-" + DATE;
if (!System.getProperty("indexName", "").isEmpty()) {
indexName = System.getProperty("indexName");
Expand All @@ -137,6 +144,7 @@ public static void main(String... args)
LOG.info("... so the alias is: '" + INDEX_ALIAS_PREFIX + aliasSuffix + "'");
esIndexer.setIndexAliasSuffix(aliasSuffix);
setProductionIndexerConfigs(indexName);
loadNotationMap();
LOG.info("Going to index");
loadQidMap();
qidMap.values().stream()
Expand All @@ -151,13 +159,12 @@ public static void main(String... args)

/**
* Loads the manually created QID map.
*
*/
public static void loadQidMap() {
LOG.info("going to load QID csv from " + qidCsvFn + "...");
LOG.info("going to load QID csv from " + QID_CSV_FN + "...");
String line = null;
try {
lineReader = new BufferedReader(new FileReader(qidCsvFn));
lineReader = new BufferedReader(new FileReader(QID_CSV_FN));
line = lineReader.readLine();
while (line != null) {
try {
Expand All @@ -176,6 +183,35 @@ public static void loadQidMap() {
LOG.info("... loaded " + qidMap.size() + " entries from QID csv.");
}

/**
* Loads the notations from the nwbib-spatial.tsv file into the notation map,
* keyed by QID. Entries without a notation are skipped.
*/
public static void loadNotationMap() {
LOG.info("going to load 'notation' from " + NOTATION_CSV_FN + "...");
String line = null;
try {
lineReader = new BufferedReader(new FileReader(NOTATION_CSV_FN));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In case you didn't know, there's a new-ish way of reading lines that I like since it's very concise. Basic idea:

for (String line : Files.readAllLines(Paths.get(NOTATION_CSV_FN))) {
  String[] nwbibSpatialTsv = line.split("\\|");
  //...
}

This may be useful next time.

line = lineReader.readLine();
while (line != null) {
try {
String[] nwbibSpatialTsv = line.split("\\|");
if (!nwbibSpatialTsv[1].isEmpty())
notationMap.put(
nwbibSpatialTsv[0]
.replaceAll("https://nwbib.de/spatial#Q(.*)\t.*", "Q$1"),
nwbibSpatialTsv[1]);
} catch (Exception e) {
LOG.warn("Missing QID in " + line);
}
line = lineReader.readLine();
}
} catch (Exception e) {
LOG.warn(e.getMessage() + "\n" + line);
}
LOG.info("... loaded " + notationMap.size()
+ " entries with notations from nwbib-spatial.tsv.");
}

static void setProductionIndexerConfigs(final String INDEX_NAME) {
esIndexer.setClustername("weywot");
esIndexer.setHostname("weywot5.hbz-nrw.de");
Expand Down Expand Up @@ -297,6 +333,7 @@ public static void filterWikidataEntitiesDump2EsGeodata(
final String FILE_NAME) {
LOG.info("Load wikidata json-dump: " + FILE_NAME);
JsonNode jnode = jsonFile2JsonNode(FILE_NAME);
loadNotationMap();
stream(jnode).map(transform2lobidWikidata()) //
.forEach(index2Es());
}
Expand Down Expand Up @@ -384,6 +421,8 @@ public static Function<JsonNode, Pair<String, JsonNode>> transform2lobidWikidata
root.set("type", conceptNode);
root.put("label",
node.findPath("labels").findPath("de").findPath("value").asText());
if (notationMap.containsKey(id))
root.put("notation", notationMap.get(id));
root.set("source", sourceNode);
if (!geoNode.isMissingNode()
&& !geoNode.findPath("latitude").isMissingNode()) {
Expand Down
84 changes: 44 additions & 40 deletions src/main/resources/morph-hbz01-to-lobid.xml
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@
<data source="$[field]f1.p" name="a">
<regexp match=".*" format="http://id.loc.gov/vocabulary/relators/hnr"/>
</data>
<data source="$[field]e[1234].p" name="a"> <!-- see #150 -->
<data source="$[field]e[1234].p" name="a"> <!-- see #150 -->
<regexp match=".*" format="http://id.loc.gov/vocabulary/relators/prf"/>
</data>
<!-- default -->
Expand Down Expand Up @@ -2256,8 +2256,8 @@
</data>
<combine name="http://www.w3.org/2000/01/rdf-schema#label" value="${a}">
<data source="521-1.[at]" name="a">
<replace pattern="^http" with="Siehe: http"/>
</data>
<replace pattern="^http" with="Siehe: http"/>
</data>
</combine>
</entity>
<!-- ####################### -->
Expand Down Expand Up @@ -6556,7 +6556,7 @@
</combine>
<data source="@nwbibComplexSubjectRswkLabel907" name="http://www.w3.org/2000/01/rdf-schema#label"/>
<call-macro name="nwbibSetLobidSubject" field="907"/>
<call-macro name="nwbibSetLobidSubject" field="907"/> <!-- important to keep right order -->
<call-macro name="nwbibSetLobidSubject" field="907"/> <!-- important to keep right order -->
<call-macro name="subjectRswkId" field="907" indicator2="1"/>
<call-macro name="setConcatComplexSubjectRswkId" field="907-1"/>
<call-macro name="subjectRswkIdSetDctSubject" field="907" indicator2="1"/>
Expand All @@ -6582,7 +6582,7 @@
<call-macro name="setNwbibRdfSubject" field="912"/>
<data source="@nwbibComplexSubjectRswkLabel912" name="http://www.w3.org/2000/01/rdf-schema#label"/>
<call-macro name="nwbibSetLobidSubject" field="912"/>
<call-macro name="nwbibSetLobidSubject" field="912"/> <!-- important to keep right order -->
<call-macro name="nwbibSetLobidSubject" field="912"/> <!-- important to keep right order -->
<call-macro name="subjectRswkId" field="912" indicator2="1"/>
<call-macro name="setConcatComplexSubjectRswkId" field="912-1"/>
<call-macro name="subjectRswkIdSetDctSubject" field="912" indicator2="1"/>
Expand All @@ -6609,7 +6609,7 @@
<call-macro name="setNwbibRdfSubject" field="917"/>
<data source="@nwbibComplexSubjectRswkLabel917" name="http://www.w3.org/2000/01/rdf-schema#label"/>
<call-macro name="nwbibSetLobidSubject" field="917"/>
<call-macro name="nwbibSetLobidSubject" field="917"/> <!-- important to keep right order -->
<call-macro name="nwbibSetLobidSubject" field="917"/> <!-- important to keep right order -->
<call-macro name="subjectRswkId" field="917" indicator2="1"/>
<call-macro name="setConcatComplexSubjectRswkId" field="917-1"/>
<call-macro name="subjectRswkIdSetDctSubject" field="917" indicator2="1"/>
Expand All @@ -6636,7 +6636,7 @@
<call-macro name="setNwbibRdfSubject" field="922"/>
<data source="@nwbibComplexSubjectRswkLabel922" name="http://www.w3.org/2000/01/rdf-schema#label"/>
<call-macro name="nwbibSetLobidSubject" field="922"/>
<call-macro name="nwbibSetLobidSubject" field="922"/> <!-- important to keep right order -->
<call-macro name="nwbibSetLobidSubject" field="922"/> <!-- important to keep right order -->
<call-macro name="subjectRswkId" field="922" indicator2="1"/>
<call-macro name="setConcatComplexSubjectRswkId" field="922-1"/>
<call-macro name="subjectRswkIdSetDctSubject" field="922" indicator2="1"/>
Expand All @@ -6663,7 +6663,7 @@
<call-macro name="setNwbibRdfSubject" field="927"/>
<data source="@nwbibComplexSubjectRswkLabel927" name="http://www.w3.org/2000/01/rdf-schema#label"/>
<call-macro name="nwbibSetLobidSubject" field="927"/>
<call-macro name="nwbibSetLobidSubject" field="927"/> <!-- important to keep right order -->
<call-macro name="nwbibSetLobidSubject" field="927"/> <!-- important to keep right order -->
<call-macro name="subjectRswkId" field="927" indicator2="1"/>
<call-macro name="setConcatComplexSubjectRswkId" field="927-1"/>
<call-macro name="subjectRswkIdSetDctSubject" field="927" indicator2="1"/>
Expand Down Expand Up @@ -6744,7 +6744,7 @@
<call-macro name="setNwbibRdfSubject" field="942"/>
<data source="@nwbibComplexSubjectRswkLabel942" name="http://www.w3.org/2000/01/rdf-schema#label"/>
<call-macro name="nwbibSetLobidSubject" field="942"/>
<call-macro name="nwbibSetLobidSubject" field="942"/> <!-- important to keep right order -->
<call-macro name="nwbibSetLobidSubject" field="942"/> <!-- important to keep right order -->
<call-macro name="subjectRswkId" field="942" indicator2="1"/>
<call-macro name="setConcatComplexSubjectRswkId" field="942-1"/>
<call-macro name="subjectRswkIdSetDctSubject" field="942" indicator2="1"/>
Expand Down Expand Up @@ -7360,7 +7360,7 @@
<data source="@700uri" name="~rdf:subject"/>
<data source="@700uri" name="http://www.w3.org/2004/02/skos/core#notation">
<lookup in="nwbib-spatial"/>
<regexp match=".*,(\d{2}),.*" format="${1}"/>
<regexp match=".*\|(\d.*)\|.*" format="${1}"/>
</data>
<data source="@700uri" name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type">
<constant value="http://www.w3.org/2004/02/skos/core#Concept" />
Expand All @@ -7370,46 +7370,50 @@
</data>
<combine name="@700uri2Qid" value="${a}" flushWith="@700uri">
<choose name="a" flushWith="@700uri">
<data source="@700uri" name="a">
<data source="@700uri" name="a">
<lookup in="nwbib-spatial"/>
<regexp match=".*,(http.*)" format="${1}"/>
<regexp match=".*\|(http.*)" format="${1}"/>
</data>
<data source="@700uri" name="a">
<data source="@700uri" name="a">
<regexp match="https://nwbib.de/spatial#(.*)" format="http://www.wikidata.org/entity/${1}" />
</data>
</choose>
</combine>
<data source="@700uri2Qid" name="http://www.w3.org/2000/01/rdf-schema#label">
<lookup in="wd_itemLabelTypesCoordinates"/>
<regexp match="(.*?),.*" format="${1}"/>
</data>
<lookup in="wd_itemLabelTypesCoordinates"/>
<regexp match="(.*?),.*" format="${1}"/>
</data>
<data source="@700uri2Qid" name="http://xmlns.com/foaf/0.1/focus" />
<!-- build structure using tsv of wd_itemLabelTypesCoordinates -->
<!-- build structure using tsv of wd_itemLabelTypesCoordinates -->
<data source="@700uri2Qid" name="~rdf:subject"/>
<data source="@700uri2Qid" name="@wd_itemLabelTypesCoordinates">
<lookup in="wd_itemLabelTypesCoordinates"/>
</data>
<data source="@wd_itemLabelTypesCoordinates" name="@wd_Types">
<replace pattern="&quot;" with="" />
<regexp match=".*,(http.*),Point.*" format="${1}"/>
</data>
<combine name="@wd_Type" value="${a}" >
<data source="@wd_Types" name="a" >
<split delimiter=","/>
</data>
</combine>
<combine name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" value="${a}" reset="true" >
<data source="@wd_Type" name="a">
<replace pattern=" " with="" />
</data>
</combine>
<lookup in="wd_itemLabelTypesCoordinates"/>
</data>
<data source="@wd_itemLabelTypesCoordinates" name="@wd_Types">
<replace pattern="&quot;" with="" />
<regexp match=".*,(http.*),Point.*" format="${1}"/>
</data>
<combine name="@wd_Type" value="${a}" >
<data source="@wd_Types" name="a" >
<split delimiter=","/>
</data>
</combine>
<combine name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" value="${a}" reset="true" >
<data source="@wd_Type" name="a">
<replace pattern=" " with="" />
</data>
</combine>
<data source="@700uri2Qid" name="http://www.w3.org/2000/01/rdf-schema#label">
<lookup in="wd_itemLabelTypesCoordinates"/>
<regexp match="^(.*?)," format="${1}"/>
</data>
<entity name="http://schema.org/geo" flushWith="@700uri2Qid" reset="true">
<data source="@wd_itemLabelTypesCoordinates" name="http://schema.org/longitude">
<regexp match=".*,Point\((.*) .*\)" format="${1}"/>
</data>
<data source="@wd_itemLabelTypesCoordinates" name="http://schema.org/latitude">
<regexp match=".*,Point\(.* (.*)\)" format="${1}"/>
</data>
<data source="@wd_itemLabelTypesCoordinates" name="http://schema.org/longitude">
<regexp match=".*,Point\((.*) .*\)" format="${1}"/>
</data>
<data source="@wd_itemLabelTypesCoordinates" name="http://schema.org/latitude">
<regexp match=".*,Point\(.* (.*)\)" format="${1}"/>
</data>
</entity>
<call-macro name="setBackRdfSubject" field="@700uri"/>
<!--edoweb -->
Expand Down Expand Up @@ -7657,7 +7661,7 @@
</data>
</combine>
<data source="@nwbibSpatialSubject" name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type">
<constant value="http://www.w3.org/2004/02/skos/core#Concept"/>
<constant value="http://www.w3.org/2004/02/skos/core#Concept"/>
</data>
<combine name="~rdf:subject" value="$[ns-lobid-resource]${id}#!" flushWith="@nwbibSpatialSubject">
<data source="@id" name="id"/>
Expand Down
Loading