Merge branch 'main' into b2share

micafer authored Sep 24, 2024
2 parents 659e2d3 + 62698ef commit 21e89a1
Showing 8 changed files with 83 additions and 4 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -23,7 +23,7 @@ tracker. Pull Requests are very welcome as well.
Datahugger requires Python 3.8 or later.

```
pip install datahugger
pip install git+https://github.com/micafer/datahugger
```

## Getting started
8 changes: 8 additions & 0 deletions datahugger/__main__.py
@@ -58,6 +58,13 @@ def main():
help="Skip files larger than this size. Might not work for all services.",
)

parser.add_argument(
"--filter-files",
default=None,
type=str,
help="A regex pattern to filter files by name.",
)

parser.add_argument(
"-f", "--force-download", dest="force_download", action="store_true"
)
@@ -113,6 +120,7 @@ def main():
        args.url_or_doi,
        args.output_dir,
        max_file_size=args.max_file_size,
        filter_files=args.filter_files,
        force_download=args.force_download,
        unzip=args.unzip,
        checksum=args.checksum,
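
For reference, the new `--filter-files` option is forwarded straight to the `filter_files` keyword of the library call shown above. A minimal sketch of the equivalent Python call, using an illustrative Dryad DOI:

```python
import datahugger

# Roughly equivalent to: datahugger <url_or_doi> <output_dir> --filter-files ".*\.csv"
datahugger.get(
    "https://doi.org/10.5061/dryad.x3ffbg7m8",  # illustrative record; any supported DOI/URL works
    "data",
    filter_files=r".*\.csv",  # only file names matching this regex are downloaded
)
```
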
8 changes: 8 additions & 0 deletions datahugger/api.py
@@ -53,6 +53,7 @@ def parse_resource_identifier(resource, resolve=True):
def info(
    resource,
    max_file_size=None,
    filter_files=None,
    force_download=False,
    unzip=True,
    checksum=False,
@@ -69,6 +70,8 @@ def info(
    max_file_size: int
        The maximum number of bytes for a single file. If exceeded,
        the file is skipped.
    filter_files: str
        A regex pattern to filter files by name.
    force_download: bool
        Force the download of the dataset even if there are already
        files in the destination folder. Default: False.
@@ -97,6 +100,7 @@ def info(
    return service_class(
        handle,
        max_file_size=max_file_size,
        filter_files=filter_files,
        force_download=force_download,
        unzip=unzip,
        checksum=checksum,
@@ -110,6 +114,7 @@ def get(
    resource,
    output_folder,
    max_file_size=None,
    filter_files=None,
    force_download=False,
    unzip=True,
    checksum=False,
@@ -131,6 +136,8 @@ def get(
    max_file_size: int
        The maximum number of bytes for a single file. If exceeded,
        the file is skipped.
    filter_files: str
        A regex pattern to filter files by name.
    force_download: bool
        Force the download of the dataset even if there are already
        files in the destination folder. Default: False.
@@ -156,6 +163,7 @@ def get(
    service = info(
        resource,
        max_file_size=max_file_size,
        filter_files=filter_files,
        force_download=force_download,
        unzip=unzip,
        checksum=checksum,
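
Both entry points now accept the keyword. A hedged sketch of how the two differ, with an illustrative DOI and pattern (the SeaNoe record from the tests further below):

```python
import datahugger

# info() only builds and returns the service object; nothing is downloaded yet.
service = datahugger.info(
    "https://doi.org/10.17882/101042",
    filter_files=r".*\.xlsx",
)

# get() builds the same object and downloads the matching files in one call.
datahugger.get(
    "https://doi.org/10.17882/101042",
    "data",
    filter_files=r".*\.xlsx",
)
```
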
15 changes: 14 additions & 1 deletion datahugger/base.py
@@ -11,7 +11,7 @@
from urllib.parse import urlparse

import requests
from jsonpath_ng import parse
from jsonpath_ng.ext import parse
from scitree import scitree
from tqdm import tqdm

@@ -51,6 +51,7 @@ def __init__(
        self,
        resource,
        max_file_size=None,
        filter_files=None,
        force_download=False,
        progress=True,
        unzip=True,
@@ -61,6 +62,7 @@
        super().__init__()
        self.resource = resource
        self.max_file_size = max_file_size
        self.filter_files = filter_files
        self.force_download = force_download
        self.progress = progress
        self.unzip = unzip
@@ -157,6 +159,16 @@ def download_file(
            if self.progress:
                print(f"{_format_filename(file_name)}: SKIPPED")
            return

        if self.filter_files and not re.match(self.filter_files, file_name):
            logging.info(f"Skipping file by filter {file_link}")
            if self.progress:
                print(f"{_format_filename(file_name)}: SKIPPED")
            return

        if not self.print_only:
            logging.info(f"Downloading file {file_link}")
@@ -284,6 +296,7 @@ def _get_files_recursive(self, url, folder_name=None, base_url=None):

        # get the data from URL
        res = requests.get(url)
        res.raise_for_status()
        response = res.json()

# find path to raw files
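
A quick, self-contained illustration of the filter check added to `download_file` above; the file names are made up. Note that `re.match` anchors at the start of the name but does not require a full match:

```python
import re

pattern = r".*\.csv"  # the kind of pattern --filter-files / filter_files expects
for name in ["results.csv", "results.csv.gz", "notes.txt"]:
    keep = bool(re.match(pattern, name))
    print(f"{name}: {'download' if keep else 'skip'}")
# results.csv: download, results.csv.gz: download, notes.txt: skip
```

Because `re.match` is not `re.fullmatch`, a pattern such as `.*\.csv` also lets `results.csv.gz` through; appending `$` to the pattern enforces an exact extension.
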
4 changes: 4 additions & 0 deletions datahugger/config.py
@@ -5,12 +5,14 @@
from datahugger.services import DataverseDataset
from datahugger.services import DjehutyDataset
from datahugger.services import DSpaceDataset
from datahugger.services import DataEuropaDataset
from datahugger.services import FigShareDataset
from datahugger.services import GitHubDataset
from datahugger.services import HuggingFaceDataset
from datahugger.services import MendeleyDataset
from datahugger.services import OSFDataset
from datahugger.services import PangaeaDataset
from datahugger.services import SeaNoeDataset
from datahugger.services import ZenodoDataset

# fast lookup
@@ -113,10 +115,12 @@
"researchdata.ntu.edu.sg": DataverseDataset,
"rin.lipi.go.id": DataverseDataset,
"ssri.is": DataverseDataset,
"www.seanoe.org": SeaNoeDataset,
"trolling.uit.no": DataverseDataset,
"www.sodha.be": DataverseDataset,
"www.uni-hildesheim.de": DataverseDataset,
"b2share.eudat.eu": B2shareDataset,
"data.europa.eu": DataEuropaDataset,
}

# regexp lookup
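
The new entries extend the host-name lookup that maps a resource URL to a downloader class. A hedged sketch of that lookup (the dictionary name below is a placeholder, not necessarily what `config.py` calls it):

```python
from urllib.parse import urlparse

# Placeholder name for the netloc table shown above.
SERVICES_BY_HOST = {
    "www.seanoe.org": "SeaNoeDataset",
    "data.europa.eu": "DataEuropaDataset",
}

url = "https://data.europa.eu/data/datasets/65e092e4009f18f050b14216"
print(SERVICES_BY_HOST.get(urlparse(url).netloc))  # DataEuropaDataset
```
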
39 changes: 38 additions & 1 deletion datahugger/services.py
@@ -320,7 +320,7 @@ class MendeleyDataset(DatasetDownloader):
class OSFDataset(DatasetDownloader):
    """Downloader for OSF repository."""

    REGEXP_ID = r"osf\.io\/(?P<record_id>.*)/"
    REGEXP_ID = r"osf\.io\/(?P<record_id>[^\/]*)\/{0,1}"

    # the base entry point of the REST API
    API_URL = "https://api.osf.io/v2/nodes/"
@@ -410,3 +410,40 @@ class B2shareDataset(DatasetDownloader):

    def _get_attr_link(self, record, base_url=None):
        return f"{base_url}/files/{self._params['record_id']}/{record['key']}"


class SeaNoeDataset(DatasetDownloader):
    """Downloader for SeaNoe publications."""

    REGEXP_ID = r"https://www.seanoe\.org/data/[0-9]+/(?P<record_id>.*)/"

    # the base entry point of the REST API
    API_URL = "https://www.seanoe.org/api/"

    # the files and metadata about the dataset
    API_URL_META = "{api_url}find-by-id/{record_id}"
    META_FILES_JSONPATH = "files[*]"

    # paths to file attributes
    ATTR_NAME_JSONPATH = "fileName"
    ATTR_FILE_LINK_JSONPATH = "fileUrl"
    ATTR_SIZE_JSONPATH = "size"
    ATTR_HASH_JSONPATH = "checksum"
    ATTR_HASH_TYPE_VALUE = "sha256"
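
Putting the SeaNoe attributes together: the record id is taken from the landing-page URL and substituted into `API_URL_META`. A small hedged walk-through; the landing URL below is illustrative, and only the trailing record id (`101042`, from the test further below) is real:

```python
import re

landing = "https://www.seanoe.org/data/00000/101042/"  # first path segment is a placeholder
match = re.search(r"https://www.seanoe\.org/data/[0-9]+/(?P<record_id>.*)/", landing)
meta_url = "{api_url}find-by-id/{record_id}".format(
    api_url="https://www.seanoe.org/api/",
    record_id=match["record_id"],
)
print(meta_url)  # https://www.seanoe.org/api/find-by-id/101042
```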


class DataEuropaDataset(DatasetDownloader):
    """Downloader for the data.europa.eu portal."""

    REGEXP_ID = r"data\.europa\.eu\/data\/datasets\/(?P<record_id>.+)"

    # the base entry point of the REST API
    API_URL = "https://data.europa.eu/api/hub/repo/"

    API_URL_META = "{api_url}datasets/{record_id}"
    META_FILES_JSONPATH = '$.@graph[?(@.@type == "dcat:Distribution")]'

    # paths to file attributes
    ATTR_FILE_LINK_JSONPATH = "'dcat:accessURL'.@id"
    ATTR_NAME_JSONPATH = "'dct:title'"
    ATTR_SIZE_JSONPATH = "'dcat:byteSize'.@value"
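
`META_FILES_JSONPATH` for data.europa.eu uses a filter expression, which is why `base.py` now imports `parse` from `jsonpath_ng.ext` instead of `jsonpath_ng`: only the extended parser understands `[?(...)]` filters. A minimal illustration against an invented document:

```python
from jsonpath_ng.ext import parse

# Invented, simplified metadata document for illustration only.
doc = {
    "@graph": [
        {"@type": "dcat:Distribution", "dct:title": "observations.csv"},
        {"@type": "dcat:Dataset", "dct:title": "not a downloadable file"},
    ]
}

expr = parse('$.@graph[?(@.@type == "dcat:Distribution")]')
print([m.value["dct:title"] for m in expr.find(doc)])  # ['observations.csv']
```
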
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -15,7 +15,7 @@ classifiers = [
"Programming Language :: Python :: 3.12"
]
license = {text = "MIT"}
dependencies = ["jsonpath_ng", "requests", "requests-cache", "scitree", "tqdm"]
dependencies = ["jsonpath_ng", "pandas", "requests", "requests-cache", "scitree", "tqdm"]
dynamic = ["version"]
requires-python = ">=3.8"

9 changes: 9 additions & 0 deletions tests/test_repositories.toml
@@ -113,3 +113,12 @@ files = "cbsodata-main/README.md"
[[b2share]]
location = "https://b2share.eudat.eu/records/db2ef5890fa44c7a85af366a50de73b9"
files = "2024-02-13.sav"

[[seanoe]]
location = "https://doi.org/10.17882/101042"
files = "111609.xlsx"

[[dataeuropa]]
location = "https://data.europa.eu/data/datasets/65e092e4009f18f050b14216"
files = "consolidation-wattzhub-schema-irve-dynamic-20240918-033000.csv"
