Skip to content

Commit

Permalink
Add translations to prefLabel #41
Browse files Browse the repository at this point in the history
Delete unused, duplicate notation 10 (label "Außerhalb der Studienbereichsgliederung/Sonstige Fächer")
  • Loading branch information
maipet committed Sep 11, 2024
1 parent e11fc5a commit bd7c868
Show file tree
Hide file tree
Showing 2 changed files with 2,167 additions and 1,444 deletions.
81 changes: 76 additions & 5 deletions create_faechersystematik_ttl.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,48 @@
import pandas as pd
import rdflib.term
from rdflib import Graph, Literal, RDF, URIRef, Namespace, DCTERMS
import logging

def extract_preflabel_translations(current_ttl):
    """Collect the English and Ukrainian prefLabels of each concept in a Turtle file.

    Args:
        current_ttl: Location of the Turtle file; handed unchanged to
            ``Graph.parse`` with ``format="ttl"``.

    Returns:
        A list with one single-entry dict per query result row, shaped
        ``{notation: {"label_en": ..., "label_uk": ...}}``. The notation is
        the concept IRI with
        ``https://w3id.org/kim/hochschulfaechersystematik/n`` stripped out.
        Concepts that lack either an ``en`` or a ``uk`` prefLabel yield no
        entry, because the query requires both.
    """
    concept_prefix = "https://w3id.org/kim/hochschulfaechersystematik/n"

    g_old = Graph()
    g_old.parse(current_ttl, format="ttl")

    # Only the two language-filtered labels are bound. An additional
    # unfiltered `skos:prefLabel ?label` pattern would multiply every
    # solution by the number of labels on the concept without changing the
    # DISTINCT result.
    qres = g_old.query(
        """
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        SELECT DISTINCT ?label_en ?label_uk ?concept
        WHERE {
            ?concept a skos:Concept ;
                skos:prefLabel ?label_en ;
                skos:prefLabel ?label_uk .
            FILTER(lang(?label_en)="en")
            FILTER(lang(?label_uk)="uk")
        }""")

    return [
        {
            str(row.concept).replace(concept_prefix, ""): {
                "label_en": str(row.label_en),
                "label_uk": str(row.label_uk),
            }
        }
        for row in qres
    ]

def add_pref_labels_lang(level_dict_list, current_pref_labels_dict):
    """Attach translated labels to the records whose notation has a translation.

    Args:
        level_dict_list: List of record dicts, each holding a ``'notation'``
            key. Matching records are updated in place.
        current_pref_labels_dict: Mapping of notation to a dict that may
            carry ``"label_en"`` and ``"label_uk"``. It may hold a single
            notation or many.

    Returns:
        The same ``level_dict_list`` object. Every record whose notation is
        a key of ``current_pref_labels_dict`` gains ``"label_en"`` and
        ``"label_uk"`` (``None`` when the translation dict lacks that key);
        all other records are left untouched.
    """
    for record in level_dict_list:
        notation = record['notation']
        if notation not in current_pref_labels_dict:
            continue
        # Look up the entry for this notation only; looping over all items
        # would overwrite the record with whichever entry comes last.
        translations = current_pref_labels_dict[notation]
        record.update({
            "label_en": translations.get("label_en"),
            "label_uk": translations.get("label_uk"),
        })
    return level_dict_list


# Extract the existing English/Ukrainian prefLabel translations from the
# currently published vocabulary so they can be carried over to the new file.
# NOTE: this runs at module level and fetches the Turtle file from GitHub.
current_hfs_file = "https://github.com/dini-ag-kim/hochschulfaechersystematik/blob/master/hochschulfaechersystematik.ttl?raw=true"
# List of single-entry dicts: {notation: {"label_en": ..., "label_uk": ...}}
lang_preflabel_list = extract_preflabel_translations(current_hfs_file)

# Source CSV files (destatis key tables), one URL per hierarchy level.
# NOTE(review): the 2nd-level URL uses the `/raw/main/` path form while the
# other two use `/blob/main/`; all three append `?raw=true` — presumably they
# resolve to the raw file alike, confirm.
url_1st_level = "https://github.com/dini-ag-kim/destatis-schluesseltabellen/blob/main/studierende/Faechergruppe.csv?raw=true"
url_2nd_level = "https://github.com/dini-ag-kim/destatis-schluesseltabellen/raw/main/studierende/STB.csv?raw=true"
url_3rd_level = "https://github.com/dini-ag-kim/destatis-schluesseltabellen/blob/main/studierende/Studienfach.csv?raw=true"
Expand All @@ -10,16 +51,29 @@
df_2nd_level = pd.read_csv(url_2nd_level, encoding="ISO-8859-1", sep=';', quotechar='"', header=None, engine ='python', dtype=str, usecols=[0, 2, 3], names=["notation", "label", "broader"])
df_3rd_level = pd.read_csv(url_3rd_level, encoding="ISO-8859-1", sep=';', quotechar='"', header=None, engine ='python', dtype=str, usecols=[0, 2, 3], names=["notation", "label", "broader"])

# remove duplicate, unused notation 10 from top level
df_1st_level = df_1st_level[df_1st_level.notation !="10"]

# remove notations subordinate to "10" from 2nd and 3rd level
df_2nd_level = df_2nd_level[df_2nd_level.broader !="10"]
df_3rd_level = df_3rd_level[df_3rd_level.broader !="83"]

df_1st_level['notation'] = df_1st_level['notation'].str.lstrip("0")
df_2nd_level['broader'] = df_2nd_level['broader'].str.lstrip("0")
df_1st_level['notation'] = df_1st_level['notation'].apply(lambda n: "00" if n == "10" else n)
df_2nd_level['broader'] = df_2nd_level['broader'].apply(lambda n: "00" if n == "10" else n)

dict_1st_level = df_1st_level.to_dict("records")
dict_2nd_level = df_2nd_level.to_dict("records")
dict_3rd_level = df_3rd_level.to_dict("records")

# add translations from current hfs to dictionaries
for lang_preflabel_dict in lang_preflabel_list:
dict_1st_level = add_pref_labels_lang(dict_1st_level, lang_preflabel_dict)
dict_2nd_level = add_pref_labels_lang(dict_2nd_level, lang_preflabel_dict)
dict_3rd_level = add_pref_labels_lang(dict_3rd_level, lang_preflabel_dict)

g = Graph()

# namespaces
base = Namespace('https://w3id.org/kim/hochschulfaechersystematik/')
vann = Namespace('http://purl.org/vocab/vann/')
dct = Namespace('http://purl.org/dc/terms/')
Expand All @@ -46,13 +100,23 @@
g.add((URIRef('n%s' % top_level), RDF['type'], skos['Concept']))
g.add((URIRef('n%s' % top_level), skos['topConceptOf'], (URIRef('scheme'))))
g.add((URIRef('n%s' % top_level), skos['prefLabel'], Literal(dict_1st_level[idx]['label'], lang='de')))
g.add((URIRef('n%s' % top_level), skos['notation'], Literal(top_level)))
if dict_1st_level[idx].get('label_en'):
g.add((URIRef('n%s' % top_level), skos['prefLabel'], Literal(dict_1st_level[idx]['label_en'], lang='en')))
g.add((URIRef('n%s' % top_level), skos['prefLabel'], Literal(dict_1st_level[idx]['label_uk'], lang='uk')))
g.add((URIRef('n%s' % top_level), skos['notation'], Literal(top_level)))
else:
logging.warning("No translation for {notation}".format(notation=top_level))
g.add((URIRef('scheme'), skos['hasTopConcept'], (URIRef('n%s' % top_level))))
for idx_2, i_2 in enumerate(dict_2nd_level):
if dict_2nd_level[idx_2]['broader'] == top_level:
level_2_notation = dict_2nd_level[idx_2]['notation']
g.add((URIRef('n%s' % level_2_notation), RDF['type'], skos['Concept']))
g.add((URIRef('n%s' % level_2_notation), skos['prefLabel'], Literal(dict_2nd_level[idx_2]['label'], lang='de')))
if dict_2nd_level[idx_2].get('label_en'):
g.add((URIRef('n%s' % level_2_notation), skos['prefLabel'], Literal(dict_2nd_level[idx_2]['label_en'], lang='en')))
g.add((URIRef('n%s' % level_2_notation), skos['prefLabel'], Literal(dict_2nd_level[idx_2]['label_uk'], lang='uk')))
else:
logging.warning("No translation for {notation}".format(notation=level_2_notation))
g.add((URIRef('n%s' % level_2_notation), skos['broader'], (URIRef('n%s' % dict_2nd_level[idx_2]['broader']))))
g.add((URIRef('n%s' % level_2_notation), skos['notation'], Literal(level_2_notation)))
g.add((URIRef('n%s' % level_2_notation), skos['inScheme'], (URIRef('scheme'))))
Expand All @@ -61,14 +125,21 @@
level_3_notation = dict_3rd_level[idx_3]['notation']
g.add((URIRef('n%s' % level_3_notation), RDF['type'], skos['Concept']))
g.add((URIRef('n%s' % level_3_notation), skos['prefLabel'],Literal(dict_3rd_level[idx_3]['label'], lang='de')))
if dict_3rd_level[idx_3].get('label_en'):
g.add((URIRef('n%s' % level_3_notation), skos['prefLabel'],Literal(dict_3rd_level[idx_3]['label_en'], lang='en')))
g.add((URIRef('n%s' % level_3_notation), skos['prefLabel'],Literal(dict_3rd_level[idx_3]['label_uk'], lang='uk')))
else:
logging.warning("No translation for {notation}".format(notation=level_3_notation))
g.add((URIRef('n%s' % level_3_notation), skos['notation'], Literal(level_3_notation)))
g.add((URIRef('n%s' % level_3_notation), skos['inScheme'], (URIRef('scheme'))))
g.add((URIRef('n%s' % level_3_notation), skos['broader'], (URIRef('n%s' % dict_3rd_level[idx_3]['broader']))))

g.add((URIRef('n0'), RDF['type'], skos['Concept']))
g.add((URIRef('n0'), skos['prefLabel'], Literal('Fachübergreifend', lang='de')))
g.add((URIRef('n0'), skos['prefLabel'], Literal('Interdisciplinary', lang='en')))
g.add((URIRef('n0'), skos['prefLabel'], Literal('Міждисциплінарний', lang='uk')))
g.add((URIRef('n0'), skos['topConceptOf'], (URIRef('scheme'))))
g.add((URIRef('n0'), skos['notation'], Literal('0')))
g.add((URIRef('scheme'), skos['hasTopConcept'], (URIRef('n0'))))
g.bind("dct", DCTERMS)
g.serialize('hochschulfaechersystematik.ttl', format='turtle')

g.serialize('hochschulfaechersystematik.ttl', format='turtle')
Loading

0 comments on commit bd7c868

Please sign in to comment.