Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(utils): implement new type rdf importer #1549

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions apis_core/apis_entities/abc.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from pathlib import Path

from django.db import models

#########################
Expand All @@ -23,6 +25,13 @@ class Meta:
def __str__(self):
    """Return the forename and surname separated by a single space."""
    return f"{self.forename} {self.surname}"

@classmethod
def rdf_configs(cls):
    """
    Return the paths of this model's RDF triple config files.

    The paths point into the ``triple_configs`` directory located next to
    this module; the order of the list is fixed.
    """
    module_dir = Path(__file__).parent
    return [
        module_dir / relative_path
        for relative_path in (
            "triple_configs/E21_PersonFromDNB.toml",
            "triple_configs/E21_PersonFromWikidata.toml",
        )
    ]


class E53_Place(models.Model):
label = models.CharField(blank=True, default="", max_length=4096)
Expand All @@ -35,6 +44,14 @@ class Meta:
def __str__(self):
    """Return the ``label`` field as the string representation."""
    return self.label

@classmethod
def rdf_configs(cls):
    """
    Return the paths of this model's RDF triple config files.

    The paths point into the ``triple_configs`` directory located next to
    this module; the order of the list is fixed.
    """
    module_dir = Path(__file__).parent
    return [
        module_dir / relative_path
        for relative_path in (
            "triple_configs/E53_PlaceFromDNB.toml",
            "triple_configs/E53_PlaceFromGeonames.toml",
            "triple_configs/E53_PlaceFromWikidata.toml",
        )
    ]


class E74_Group(models.Model):
label = models.CharField(blank=True, default="", max_length=4096)
Expand All @@ -44,3 +61,10 @@ class Meta:

def __str__(self):
    """Return the ``label`` field as the string representation."""
    return self.label

@classmethod
def rdf_configs(cls):
    """
    Return the paths of this model's RDF triple config files.

    The paths point into the ``triple_configs`` directory located next to
    this module; the order of the list is fixed.
    """
    module_dir = Path(__file__).parent
    return [
        module_dir / relative_path
        for relative_path in (
            "triple_configs/E74_GroupFromDNB.toml",
            "triple_configs/E74_GroupFromWikidata.toml",
        )
    ]
13 changes: 13 additions & 0 deletions apis_core/apis_entities/triple_configs/E21_PersonFromDNB.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[[filters]]
"rdf:type" = "gndo:DifferentiatedPerson"

[attributes]
forename = ["gndo:forename", "gndo:preferredNameEntityForThePerson/gndo:forename"]
alternative_names = "gndo:variantNameForThePerson"
surname = ["gndo:surname", "gndo:preferredNameEntityForThePerson/gndo:surname"]
start_date_written = "gndo:dateOfBirth"
end_date_written = "gndo:dateOfDeath"
same_as = "owl:sameAs"
profession = "gndo:professionOrOccupation"

relations = ["gndo:placeOfDeath", "gndo:placeOfBirth"]
12 changes: 12 additions & 0 deletions apis_core/apis_entities/triple_configs/E21_PersonFromWikidata.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[[filters]]
"wdt:P31" = "wd:Q5"

[attributes]
forename = "wdt:P735/rdfs:label"
surname = "wdt:P734/rdfs:label"
date_of_birth = "wdt:P569"
date_of_death = "wdt:P570"
same_as = ["owl:sameAs", "wdtn:P227", "wdtn:P1566", "wdtn:P214", "wdtn:P244"]

#TODO:
#relations = ["wd:P20", "wd:P19"]
21 changes: 21 additions & 0 deletions apis_core/apis_entities/triple_configs/E53_PlaceFromDNB.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[[filters]]
"rdf:type" = "gndo:TerritorialCorporateBodyOrAdministrativeUnit"


[attributes]
label = "gndo:preferredNameForThePlaceOrGeographicName"
longitude = '''
SELECT ?longitude
WHERE {
?subject geo:hasGeometry/geo:asWKT ?point .
BIND(REPLACE(str(?point), "Point \\( \\+?(-?\\d+.\\d+).*", "$1") as ?longitude)
}
'''
latitude = '''
SELECT ?latitude
WHERE {
?subject geo:hasGeometry/geo:asWKT ?point .
BIND(REPLACE(str(?point), "^Point\\s*\\(\\s*[+-]?\\d+\\.\\d+\\s+([+-]?\\d+\\.\\d+)\\s*\\)$", "$1") as ?latitude)
}
'''
same_as = "owl:sameAs"
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[[filters]]
"rdf:type" = "gn:Feature"


[attributes]
label = ["gn:name", "gn:officialName", "gn:alternateName"]
latitude = "wgs84_pos:lat"
longitude = "wgs84_pos:long"
same_as = ["rdfs:seeAlso", "gn:wikipediaArticle"]
22 changes: 22 additions & 0 deletions apis_core/apis_entities/triple_configs/E53_PlaceFromWikidata.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[[filters]]
# metropolis (large and significant city or urban area usually with millions of inhabitants)
"wdt:P31" = "wd:Q200250"


[attributes]
label = ["rdfs:label", "wdt:P1448/rdfs:label"]
# The coordinate patterns accept an optional sign and an optional fractional
# part. When the pattern does not match, SPARQL REPLACE returns the input
# unchanged, so a WKT literal such as "Point(-74 40.7)" would otherwise be
# stored verbatim as the longitude/latitude.
longitude = '''
SELECT ?longitude
WHERE {
?subject wdt:P625 ?geo1 .
BIND(REPLACE(str(?geo1), "Point\\(([-+]?\\d+\\.?\\d*).*$", "$1") as ?longitude)
}
'''
latitude = '''
SELECT ?latitude
WHERE {
?subject wdt:P625 ?geo1 .
BIND(REPLACE(str(?geo1), "Point\\(([-+]?\\d+\\.?\\d*) ([-+]?\\d+\\.?\\d*).*$", "$2") as ?latitude)
}
'''
same_as = ["owl:sameAs", "wdtn:P227", "wdtn:P1566", "wdtn:P214", "wdtn:P244"]
8 changes: 8 additions & 0 deletions apis_core/apis_entities/triple_configs/E74_GroupFromDNB.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[[filters]]
"rdf:type" = "gndo:CorporateBody"

[attributes]
label = ["gndo:preferredNameForTheCorporateBody", "gndo:variantNameForTheCorporateBody"]
start_date = "gndo:dateOfEstablishment"
end_date = "gndo:dateOfTermination"
# Spelled `same_as` (not `sameas`) like in all other triple configs, so the
# owl:sameAs links end up under the same result key for every source.
same_as = "owl:sameAs"
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[[filters]]
"wdt:P31" = "wd:Q414147"

[attributes]
label = ["schema:about/rdfs:label", "rdfs:label"]
same_as = ["owl:sameAs", "wdtn:P227", "wdtn:P1566", "wdtn:P214", "wdtn:P244"]
82 changes: 82 additions & 0 deletions apis_core/utils/rdf2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# SPDX-FileCopyrightText: 2025 Birger Schacht
# SPDX-License-Identifier: MIT

import logging
from collections import defaultdict
from pathlib import Path

import tomllib
from AcdhArcheAssets.uri_norm_rules import get_normalized_uri
from django.apps import apps
from rdflib import RDF, BNode, Graph

logger = logging.getLogger(__name__)


def find_matching_config(graph: Graph) -> dict | None:
    """
    Return the first RDF import config whose filters match ``graph``.

    Every installed model that has an ``rdf_configs`` attribute is asked for
    its TOML config paths. A config matches as soon as one of its
    ``[[filters]]`` tables matches; a filter table matches when *all* of its
    ``predicate = object`` pairs (both written as CURIEs) occur in the graph
    as triples with an arbitrary subject.

    Args:
        graph: The parsed RDF graph. Its namespace manager is used to expand
            the CURIEs found in the filters.

    Returns:
        The parsed config dict with the matching model class stored under the
        ``"model"`` key, or ``None`` if no config matches.
    """
    expand = graph.namespace_manager.expand_curie
    for model in apps.get_models():
        if not hasattr(model, "rdf_configs"):
            continue
        for path in model.rdf_configs():
            # TOML is UTF-8 by specification. Reading in binary mode through
            # ``tomllib.load`` avoids decoding with the platform's locale
            # encoding, which ``Path.read_text()`` would use by default.
            with Path(path).open("rb") as config_file:
                config = tomllib.load(config_file)
            for _filter in config.get("filters", []):
                try:
                    patterns = [
                        (None, expand(predicate), expand(obj))
                        for predicate, obj in _filter.items()
                    ]
                except ValueError as err:
                    # A CURIE that cannot be expanded for this graph means
                    # the filter cannot apply to it; skip the filter, but
                    # leave a trace instead of swallowing the error silently.
                    logger.debug(
                        "Skipping filter %s from %s: %s", _filter, path, err
                    )
                    continue
                # ``None`` as subject acts as a wildcard in the membership
                # test; the generator stops at the first missing triple.
                if all(pattern in graph for pattern in patterns):
                    logger.debug("Using %s for parsing graph", path)
                    config["model"] = model
                    return config
    return None


def _is_select_query(expression: str) -> bool:
    """Tell whether ``expression`` is a full SELECT query, not a property path."""
    # Tolerate leading whitespace, any whitespace after the keyword and
    # lowercase spelling; all of these are easy to produce in a TOML string.
    tokens = expression.split(None, 1)
    return bool(tokens) and tokens[0].upper() == "SELECT"


def _collect_values(graph: Graph, expression: str) -> list:
    """Run ``expression`` against ``graph`` and return the Python values found."""
    if _is_select_query(expression):
        rows = graph.query(expression)
    else:
        # A bare property path is wrapped into a minimal SELECT query.
        rows = graph.query(
            "SELECT ?object WHERE { ?subject " + expression + " ?object }"
        )

    values = []
    for row in rows:
        obj = row[0]
        if obj is None:
            # NOTE(review): assumes rdflib reports an unbound variable (e.g.
            # a BIND that raised an error) as ``None`` - confirm. There is
            # nothing to convert in that case.
            continue
        if isinstance(obj, BNode):
            # For blank nodes, collect the objects attached to them instead,
            # leaving out the ``rdf:Seq`` marker.
            values.extend(
                value.toPython()
                for value in graph.objects(subject=obj)
                if value != RDF.Seq
            )
        else:
            values.append(obj.toPython())
    return values


def get_something_from_uri(uri: str) -> dict | None:
    """
    Parse the RDF found at ``uri`` and extract the configured attributes.

    The URI is passed through ``get_normalized_uri`` and parsed into a graph.
    The first config matching the graph (see ``find_matching_config``)
    decides which values are extracted: every entry of the config's
    ``attributes`` table maps an attribute name to one or more expressions,
    each of which is either a SPARQL property path or a complete ``SELECT``
    query whose first result column is used.

    Args:
        uri: Location of the RDF resource to import.

    Returns:
        ``None`` if no config matches the graph. Otherwise a dict with
        ``"model"`` (the model class of the matching config), ``"relations"``
        (mapping each expression of the ``relations`` attribute to the list
        of values it produced) and one list of values per other attribute.
    """
    uri = get_normalized_uri(uri)
    graph = Graph()
    graph.parse(uri)

    config = find_matching_config(graph)
    if not config:
        return None

    result = defaultdict(list)
    result["model"] = config["model"]
    result["relations"] = defaultdict(list)

    for attribute, expressions in config.get("attributes", {}).items():
        if isinstance(expressions, str):
            expressions = [expressions]
        for expression in expressions:
            values = _collect_values(graph, expression)
            if attribute == "relations":
                # Relations are kept per expression so that the origin of
                # each value stays known.
                result["relations"][expression].extend(values)
            else:
                result[attribute].extend(values)
    return dict(result)