-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathupdate_external_resources.py
159 lines (121 loc) · 4.77 KB
/
update_external_resources.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from shapely import wkt
# Default annotations CSV; passed to main() when this file is run as a script.
ANNOTATION_IDENTIFIERS = "data/annotations_linking.csv"
# Process-wide memo used when the caller does not pass its own ``cache``.
_WIKIDATA_CACHE = {}


def query_wikidata(uri, endpoint="https://query.wikidata.org/sparql", cache=None):
    """Look up label, description and coordinates of an entity on Wikidata.

    Args:
        uri: Entity URI; it is substituted verbatim into the SPARQL query.
        endpoint: URL of the SPARQL endpoint to query.
        cache: Optional dict mapping URIs to previously returned tuples. It is
            consulted before querying and updated afterwards. When omitted, a
            module-level dict shared by all such calls is used.

    Returns:
        A ``(label, description, latitude, longitude)`` tuple taken from the
        first result row. ``description``, ``latitude`` and ``longitude`` are
        ``None`` when the row does not bind them. When the query yields no
        rows at all, every element is ``None``.
    """
    # Compare against None explicitly: an empty dict supplied by the caller is
    # falsy but must still be the one that gets filled.
    if cache is None:
        cache = _WIKIDATA_CACHE

    if uri in cache:
        return cache[uri]

    q = """
    SELECT DISTINCT ?uri ?uriLabel ?uriDescription ?latitude ?longitude WHERE {
      ?uri wdt:P31|wdt:P279 [] .
      OPTIONAL {
        ?uri p:P625 ?coordinate.
        ?coordinate ps:P625 ?coord.
        ?coordinate psv:P625 ?coordinate_node.
        ?coordinate_node wikibase:geoLongitude ?longitude.
        ?coordinate_node wikibase:geoLatitude ?latitude.
      }
      VALUES ?uri { <URIHIER> }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "nl,en,de". }
    }
    """.replace(
        "URIHIER", uri
    )

    print(uri)

    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery(q)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    bindings = results["results"]["bindings"]
    if not bindings:
        # The query can legitimately match nothing (the first triple pattern
        # requires a P31 or P279 statement). Report it instead of failing with
        # an IndexError, which would abort the caller's whole batch.
        import warnings

        warnings.warn(f"No Wikidata result for {uri}")
        record = (None, None, None, None)
    else:
        first = bindings[0]
        record = (
            first["uriLabel"]["value"],
            first.get("uriDescription", {}).get("value"),
            first.get("latitude", {}).get("value"),
            first.get("longitude", {}).get("value"),
        )

    # Misses are cached too, so the same URI is not queried again.
    cache[uri] = record
    return record
# Process-wide memo used when the caller does not pass its own ``cache``.
_ADAMLINK_CACHE = {}


def query_adamlink(
    uri,
    endpoint="https://api.lod.uba.uva.nl/datasets/ATM/ATM-KG/services/ATM-KG/sparql",
    cache=None,
):
    """Look up label, description and coordinates of a resource via SPARQL.

    Args:
        uri: Resource URI; it is substituted verbatim into the SPARQL query.
        endpoint: URL of the SPARQL endpoint to query.
        cache: Optional dict mapping URIs to previously returned tuples. It is
            consulted before querying and updated afterwards. When omitted, a
            module-level dict shared by all such calls is used.

    Returns:
        A ``(label, description, latitude, longitude)`` tuple taken from the
        first result row. ``description``, ``latitude`` and ``longitude`` are
        ``None`` when the row does not bind them. If the row carries a
        ``geometryWKT`` value, the coordinates are those of the geometry's
        centroid. When the query yields no rows at all, every element is
        ``None``.
    """
    # Compare against None explicitly: an empty dict supplied by the caller is
    # falsy but must still be the one that gets filled.
    if cache is None:
        cache = _ADAMLINK_CACHE

    if uri in cache:
        return cache[uri]

    q = """
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    prefix schema: <https://schema.org/>
    prefix geo: <http://www.opengis.net/ont/geosparql#>
    PREFIX bif: <http://www.openlinksw.com/schemas/bif#>
    SELECT ?uri ?label ?description ?geometryWKT ?longitude ?latitude WHERE {
      ?uri a [] ;
           rdfs:label ?label .
      OPTIONAL {
        ?uri schema:description ?description .
      }
      OPTIONAL {
        # Address
        ?uri schema:geoContains/geo:asWKT ?geometry .
        BIND(bif:st_x(?geometry) AS ?longitude)
        BIND(bif:st_y(?geometry) AS ?latitude)
      }
      OPTIONAL {
        # Street / Building
        ?uri geo:hasGeometry/geo:asWKT ?geometryWKT .
        # No geof function here?
      }
      VALUES ?uri { <URIHIER> }
    }
    """.replace(
        "URIHIER", uri
    )

    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery(q)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    bindings = results["results"]["bindings"]
    if not bindings:
        # The query can match nothing (it requires a type and an rdfs:label).
        # Report it instead of failing with an IndexError, which would abort
        # the caller's whole batch.
        import warnings

        warnings.warn(f"No Adamlink result for {uri}")
        record = (None, None, None, None)
    else:
        first = bindings[0]
        label = first["label"]["value"]
        description = first.get("description", {}).get("value")
        geometry_wkt = first.get("geometryWKT", {}).get("value")
        latitude = first.get("latitude", {}).get("value")
        longitude = first.get("longitude", {}).get("value")

        # A full geometry takes precedence over the bound point coordinates;
        # it is reduced to its centroid.
        if geometry_wkt:
            geometry = wkt.loads(geometry_wkt)
            latitude = geometry.centroid.y
            longitude = geometry.centroid.x

        record = (label, description, latitude, longitude)

    # Misses are cached too, so the same URI is not queried again.
    cache[uri] = record
    return record
def main(annotations_file):
df = pd.read_csv(annotations_file)
cache = dict()
for index, row in df.iterrows():
if pd.isna(row["uri"]):
continue
elif pd.notna(row["label"]):
continue
# Diary writers have their own biography in the project
# elif row["uri"] in (
# "http://www.wikidata.org/entity/Q113810404",
# "http://www.wikidata.org/entity/Q123396315",
# "http://www.wikidata.org/entity/Q124972258",
# "http://www.wikidata.org/entity/Q124987744",
# "http://www.wikidata.org/entity/Q108534152",
# "http://www.wikidata.org/entity/Q125020291",
# "http://www.wikidata.org/entity/Q65965451",
# ):
# continue
uri = row["uri"]
if "wikidata" in uri:
label, description, latitude, longitude = query_wikidata(uri, cache=cache)
elif "adamlink" in uri:
label, description, latitude, longitude = query_adamlink(uri, cache=cache)
else:
label, description, latitude, longitude = "", "", "", ""
df.at[index, "label"] = label
df.at[index, "description"] = description
df.at[index, "latitude"] = round(float(latitude), 6) if latitude else latitude
df.at[index, "longitude"] = (
round(float(longitude), 6) if longitude else longitude
)
# Save!
df.to_csv(annotations_file, index=False)
# Script entry point: process the default annotations file.
if __name__ == "__main__":
    main(ANNOTATION_IDENTIFIERS)