From 31906b587b6442013f67b0ea14c2ebe772b9e6eb Mon Sep 17 00:00:00 2001 From: Miguel Caballer Date: Wed, 18 Sep 2024 13:19:12 +0200 Subject: [PATCH] Implements data.europa.eu support --- datahugger/base.py | 2 +- datahugger/config.py | 2 ++ datahugger/services.py | 17 +++++++++++++++++ tests/test_repositories.toml | 4 ++++ 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/datahugger/base.py b/datahugger/base.py index cd7797b..66c76c7 100644 --- a/datahugger/base.py +++ b/datahugger/base.py @@ -11,7 +11,7 @@ from urllib.parse import urlparse import requests -from jsonpath_ng import parse +from jsonpath_ng.ext import parse from scitree import scitree from tqdm import tqdm diff --git a/datahugger/config.py b/datahugger/config.py index 3176c87..76311f7 100644 --- a/datahugger/config.py +++ b/datahugger/config.py @@ -4,6 +4,7 @@ from datahugger.services import DataverseDataset from datahugger.services import DjehutyDataset from datahugger.services import DSpaceDataset +from datahugger.services import DataEuropaDataset from datahugger.services import FigShareDataset from datahugger.services import GitHubDataset from datahugger.services import HuggingFaceDataset @@ -115,6 +116,7 @@ "trolling.uit.no": DataverseDataset, "www.sodha.be": DataverseDataset, "www.uni-hildesheim.de": DataverseDataset, + "data.europa.eu": DataEuropaDataset, } # regexp lookup diff --git a/datahugger/services.py b/datahugger/services.py index a0075e7..dae7df4 100644 --- a/datahugger/services.py +++ b/datahugger/services.py @@ -388,3 +388,20 @@ def _get_attr_hash(self, record): def _get_attr_hash_type(self, record): return self._get_attr_attr(record, self.ATTR_HASH_JSONPATH).split(":")[0] + + +class DataEuropaDataset(DatasetDownloader): + """Downloader for European data repository.""" + + REGEXP_ID = r"data\.europa\.eu\/data\/datasets\/(?P<record_id>.+)" + + # the base entry point of the REST API + API_URL = "https://data.europa.eu/api/hub/repo/" + + API_URL_META = "{api_url}datasets/{record_id}" + 
META_FILES_JSONPATH = '$.@graph[?(@.@type == "dcat:Distribution")]' + + # paths to file attributes + ATTR_FILE_LINK_JSONPATH = "'dcat:accessURL'.@id" + ATTR_NAME_JSONPATH = "'dct:title'" + ATTR_SIZE_JSONPATH = "'dcat:byteSize'.@value" diff --git a/tests/test_repositories.toml b/tests/test_repositories.toml index 1758b46..882ce5b 100644 --- a/tests/test_repositories.toml +++ b/tests/test_repositories.toml @@ -109,3 +109,7 @@ files = "AA_age.tab" [[github]] location = "https://github.com/j535d165/cbsodata" files = "cbsodata-main/README.md" + +[[dataeuropa]] +location = "https://data.europa.eu/data/datasets/65e092e4009f18f050b14216" +files = "consolidation-wattzhub-schema-irve-dynamic-20240918-033000.csv" \ No newline at end of file