Add support for a regex pattern to filter files by name #92

Open · wants to merge 6 commits into main
Changes from all commits
8 changes: 8 additions & 0 deletions datahugger/__main__.py
@@ -58,6 +58,13 @@ def main():
help="Skip files larger than this size. Might not work for all services.",
)

parser.add_argument(
"--filter-files",
default=None,
type=str,
help="A regex pattern to filter files by name.",
)

parser.add_argument(
"-f", "--force-download", dest="force_download", action="store_true"
)
@@ -113,6 +120,7 @@ def main():
args.url_or_doi,
args.output_dir,
max_file_size=args.max_file_size,
filter_files=args.filter_files,
force_download=args.force_download,
unzip=args.unzip,
checksum=args.checksum,
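With the new flag in place, a name filter can be supplied directly on the command line. A minimal sketch, assuming the package is run as a module and that the positional arguments are the resource URL followed by the output directory; the record URL and pattern come from the test added in this pull request, the output directory name is illustrative:

python -m datahugger https://zenodo.org/records/6614829 output --filter-files ".*\.m"
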
8 changes: 8 additions & 0 deletions datahugger/api.py
@@ -53,6 +53,7 @@ def parse_resource_identifier(resource, resolve=True):
def info(
resource,
max_file_size=None,
filter_files=None,
force_download=False,
unzip=True,
checksum=False,
@@ -69,6 +70,8 @@ def info(
max_file_size: int
The maximum number of bytes for a single file. If exceeded,
the file is skipped.
filter_files: str
A regex pattern to filter files by name.
force_download: bool
Force the download of the dataset even if there are already
files in the destination folder. Default: False.
@@ -97,6 +100,7 @@ def info(
return service_class(
handle,
max_file_size=max_file_size,
filter_files=filter_files,
force_download=force_download,
unzip=unzip,
checksum=checksum,
@@ -110,6 +114,7 @@ def get(
resource,
output_folder,
max_file_size=None,
filter_files=None,
force_download=False,
unzip=True,
checksum=False,
@@ -131,6 +136,8 @@ def get(
max_file_size: int
The maximum number of bytes for a single file. If exceeded,
the file is skipped.
filter_files: str
A regex pattern to filter files by name.
force_download: bool
Force the download of the dataset even if there are already
files in the destination folder. Default: False.
@@ -156,6 +163,7 @@ def get(
service = info(
resource,
max_file_size=max_file_size,
filter_files=filter_files,
force_download=force_download,
unzip=unzip,
checksum=checksum,
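Through the Python API, the same keyword flows from datahugger.get into info and on to the service class. A minimal usage sketch, reusing the Zenodo record and regex from the test added in this pull request; the output folder name is illustrative:

import datahugger

# Only files whose names match the regex are downloaded; all others are skipped.
datahugger.get(
    "https://zenodo.org/records/6614829",
    "output",
    filter_files=r".*\.m",
)
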
8 changes: 8 additions & 0 deletions datahugger/base.py
@@ -51,6 +51,7 @@ def __init__(
self,
resource,
max_file_size=None,
filter_files=None,
force_download=False,
progress=True,
unzip=True,
@@ -61,6 +62,7 @@
super().__init__()
self.resource = resource
self.max_file_size = max_file_size
self.filter_files = filter_files
self.force_download = force_download
self.progress = progress
self.unzip = unzip
@@ -158,6 +160,12 @@ def download_file(
print(f"{_format_filename(file_name)}: SKIPPED")
return

if self.filter_files and not re.match(self.filter_files, file_name):
logging.info(f"Skipping file by filter {file_link}")
if self.progress:
print(f"{_format_filename(file_name)}: SKIPPED")
return

if not self.print_only:
logging.info(f"Downloading file {file_link}")
res = requests.get(file_link, stream=True)
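Note that the check above uses re.match, which anchors the pattern at the start of the file name: a bare extension pattern such as r"\.m$" would not match "quasiperiod.m", while r".*\.m" does (and also matches names like "audio.mp3"). A small sketch of the behaviour, with illustrative file names:

import re

pattern = r".*\.m"
for name in ["quasiperiod.m", "readme.txt"]:
    # download_file skips any file whose name does not match the pattern
    kept = bool(re.match(pattern, name))
    print(name, "kept" if kept else "skipped")
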
10 changes: 10 additions & 0 deletions tests/test_repositories_plus.py
@@ -1,3 +1,5 @@
from pathlib import Path

import pytest

import datahugger
@@ -14,3 +16,11 @@ def test_huggingface(tmpdir):
def test_huggingface_without_params(tmpdir):
with pytest.raises(ValueError):
datahugger.get("https://huggingface.co/datasets/wikitext", tmpdir)


def test_filter(tmpdir):
datahugger.get("https://zenodo.org/records/6614829", tmpdir, filter_files=r".*\.m")

files = [file for file in Path(tmpdir).iterdir()]
assert len(files) == 1
assert files[0].name == "quasiperiod.m"