Merge branch 'main' into b2share

micafer authored Sep 24, 2024
2 parents 659e2d3 + 62698ef commit 21e89a1
Showing 8 changed files with 83 additions and 4 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -23,7 +23,7 @@ tracker. Pull Requests are very welcome as well.
Datahugger requires Python 3.8 or later.

```
pip install datahugger
pip install git+https://github.com/micafer/datahugger
```

## Getting started
8 changes: 8 additions & 0 deletions datahugger/__main__.py
@@ -58,6 +58,13 @@ def main():
help="Skip files larger than this size. Might not work for all services.",
)

parser.add_argument(
"--filter-files",
default=None,
type=str,
help="A regex pattern to filter files by name.",
)

parser.add_argument(
"-f", "--force-download", dest="force_download", action="store_true"
)
@@ -113,6 +120,7 @@ def main():
        args.url_or_doi,
        args.output_dir,
        max_file_size=args.max_file_size,
        filter_files=args.filter_files,
        force_download=args.force_download,
        unzip=args.unzip,
        checksum=args.checksum,
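
For reference, the new `--filter-files` option is forwarded straight to the `filter_files` keyword of the library call shown above. A minimal sketch of the equivalent Python call, using an illustrative Dryad DOI:

```python
import datahugger

# Roughly equivalent to: datahugger <url_or_doi> <output_dir> --filter-files ".*\.csv"
datahugger.get(
    "https://doi.org/10.5061/dryad.x3ffbg7m8",  # illustrative record; any supported DOI/URL works
    "data",
    filter_files=r".*\.csv",  # only file names matching this regex are downloaded
)
```
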
8 changes: 8 additions & 0 deletions datahugger/api.py
@@ -53,6 +53,7 @@ def parse_resource_identifier(resource, resolve=True):
def info(
    resource,
    max_file_size=None,
    filter_files=None,
    force_download=False,
    unzip=True,
    checksum=False,
@@ -69,6 +70,8 @@ def info(
    max_file_size: int
        The maximum number of bytes for a single file. If exceeded,
        the file is skipped.
    filter_files: str
        A regex pattern to filter files by name.
    force_download: bool
        Force the download of the dataset even if there are already
        files in the destination folder. Default: False.
@@ -97,6 +100,7 @@ def info(
    return service_class(
        handle,
        max_file_size=max_file_size,
        filter_files=filter_files,
        force_download=force_download,
        unzip=unzip,
        checksum=checksum,
@@ -110,6 +114,7 @@ def get(
    resource,
    output_folder,
    max_file_size=None,
    filter_files=None,
    force_download=False,
    unzip=True,
    checksum=False,
@@ -131,6 +136,8 @@ def get(
    max_file_size: int
        The maximum number of bytes for a single file. If exceeded,
        the file is skipped.
    filter_files: str
        A regex pattern to filter files by name.
    force_download: bool
        Force the download of the dataset even if there are already
        files in the destination folder. Default: False.
@@ -156,6 +163,7 @@ def get(
    service = info(
        resource,
        max_file_size=max_file_size,
        filter_files=filter_files,
        force_download=force_download,
        unzip=unzip,
        checksum=checksum,
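
Both entry points now accept the keyword. A hedged sketch of how the two differ, with an illustrative DOI and pattern (the SeaNoe record from the tests further below):

```python
import datahugger

# info() only builds and returns the service object; nothing is downloaded yet.
service = datahugger.info(
    "https://doi.org/10.17882/101042",
    filter_files=r".*\.xlsx",
)

# get() builds the same object and downloads the matching files in one call.
datahugger.get(
    "https://doi.org/10.17882/101042",
    "data",
    filter_files=r".*\.xlsx",
)
```
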
15 changes: 14 additions & 1 deletion datahugger/base.py
@@ -11,7 +11,7 @@
from urllib.parse import urlparse

import requests
from jsonpath_ng import parse
from jsonpath_ng.ext import parse
from scitree import scitree
from tqdm import tqdm

@@ -51,6 +51,7 @@ def __init__(
        self,
        resource,
        max_file_size=None,
        filter_files=None,
        force_download=False,
        progress=True,
        unzip=True,
@@ -61,6 +62,7 @@
        super().__init__()
        self.resource = resource
        self.max_file_size = max_file_size
        self.filter_files = filter_files
        self.force_download = force_download
        self.progress = progress
        self.unzip = unzip
@@ -157,6 +159,16 @@ def download_file(
            if self.progress:
                print(f"{_format_filename(file_name)}: SKIPPED")
            return

        if self.filter_files and not re.match(self.filter_files, file_name):
            logging.info(f"Skipping file by filter {file_link}")
            if self.progress:
                print(f"{_format_filename(file_name)}: SKIPPED")
            return

        if not self.print_only:
            logging.info(f"Downloading file {file_link}")
@@ -284,6 +296,7 @@ def _get_files_recursive(self, url, folder_name=None, base_url=None):

        # get the data from URL
        res = requests.get(url)
        res.raise_for_status()
        response = res.json()

# find path to raw files
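
A quick, self-contained illustration of the filter check added to `download_file` above; the file names are made up. Note that `re.match` anchors at the start of the name but does not require a full match:

```python
import re

pattern = r".*\.csv"  # the kind of pattern --filter-files / filter_files expects
for name in ["results.csv", "results.csv.gz", "notes.txt"]:
    keep = bool(re.match(pattern, name))
    print(f"{name}: {'download' if keep else 'skip'}")
# results.csv: download, results.csv.gz: download, notes.txt: skip
```

Because `re.match` is not `re.fullmatch`, a pattern such as `.*\.csv` also lets `results.csv.gz` through; appending `$` to the pattern enforces an exact extension.
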
4 changes: 4 additions & 0 deletions datahugger/config.py
@@ -5,12 +5,14 @@
from datahugger.services import DataverseDataset
from datahugger.services import DjehutyDataset
from datahugger.services import DSpaceDataset
from datahugger.services import DataEuropaDataset
from datahugger.services import FigShareDataset
from datahugger.services import GitHubDataset
from datahugger.services import HuggingFaceDataset
from datahugger.services import MendeleyDataset
from datahugger.services import OSFDataset
from datahugger.services import PangaeaDataset
from datahugger.services import SeaNoeDataset
from datahugger.services import ZenodoDataset

# fast lookup
@@ -113,10 +115,12 @@
"researchdata.ntu.edu.sg": DataverseDataset,
"rin.lipi.go.id": DataverseDataset,
"ssri.is": DataverseDataset,
"www.seanoe.org": SeaNoeDataset,
"trolling.uit.no": DataverseDataset,
"www.sodha.be": DataverseDataset,
"www.uni-hildesheim.de": DataverseDataset,
"b2share.eudat.eu": B2shareDataset,
"data.europa.eu": DataEuropaDataset,
}

# regexp lookup
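
The new entries extend the host-name lookup that maps a resource URL to a downloader class. A hedged sketch of that lookup (the dictionary name below is a placeholder, not necessarily what `config.py` calls it):

```python
from urllib.parse import urlparse

# Placeholder name for the netloc table shown above.
SERVICES_BY_HOST = {
    "www.seanoe.org": "SeaNoeDataset",
    "data.europa.eu": "DataEuropaDataset",
}

url = "https://data.europa.eu/data/datasets/65e092e4009f18f050b14216"
print(SERVICES_BY_HOST.get(urlparse(url).netloc))  # DataEuropaDataset
```
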
39 changes: 38 additions & 1 deletion datahugger/services.py
@@ -320,7 +320,7 @@ class MendeleyDataset(DatasetDownloader):
class OSFDataset(DatasetDownloader):
    """Downloader for OSF repository."""

    REGEXP_ID = r"osf\.io\/(?P<record_id>.*)/"
    REGEXP_ID = r"osf\.io\/(?P<record_id>[^\/]*)\/{0,1}"

    # the base entry point of the REST API
    API_URL = "https://api.osf.io/v2/nodes/"
@@ -410,3 +410,40 @@ class B2shareDataset(DatasetDownloader):

    def _get_attr_link(self, record, base_url=None):
        return f"{base_url}/files/{self._params['record_id']}/{record['key']}"


class SeaNoeDataset(DatasetDownloader):
    """Downloader for SeaNoe publications."""

    REGEXP_ID = r"https://www.seanoe\.org/data/[0-9]+/(?P<record_id>.*)/"

    # the base entry point of the REST API
    API_URL = "https://www.seanoe.org/api/"

    # the files and metadata about the dataset
    API_URL_META = "{api_url}find-by-id/{record_id}"
    META_FILES_JSONPATH = "files[*]"

    # paths to file attributes
    ATTR_NAME_JSONPATH = "fileName"
    ATTR_FILE_LINK_JSONPATH = "fileUrl"
    ATTR_SIZE_JSONPATH = "size"
    ATTR_HASH_JSONPATH = "checksum"
    ATTR_HASH_TYPE_VALUE = "sha256"
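
Putting the SeaNoe attributes together: the record id is taken from the landing-page URL and substituted into `API_URL_META`. A small hedged walk-through; the landing URL below is illustrative, and only the trailing record id (`101042`, from the test further below) is real:

```python
import re

landing = "https://www.seanoe.org/data/00000/101042/"  # first path segment is a placeholder
match = re.search(r"https://www.seanoe\.org/data/[0-9]+/(?P<record_id>.*)/", landing)
meta_url = "{api_url}find-by-id/{record_id}".format(
    api_url="https://www.seanoe.org/api/",
    record_id=match["record_id"],
)
print(meta_url)  # https://www.seanoe.org/api/find-by-id/101042
```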


class DataEuropaDataset(DatasetDownloader):
    """Downloader for the data.europa.eu portal."""

    REGEXP_ID = r"data\.europa\.eu\/data\/datasets\/(?P<record_id>.+)"

    # the base entry point of the REST API
    API_URL = "https://data.europa.eu/api/hub/repo/"

    API_URL_META = "{api_url}datasets/{record_id}"
    META_FILES_JSONPATH = '$.@graph[?(@.@type == "dcat:Distribution")]'

    # paths to file attributes
    ATTR_FILE_LINK_JSONPATH = "'dcat:accessURL'.@id"
    ATTR_NAME_JSONPATH = "'dct:title'"
    ATTR_SIZE_JSONPATH = "'dcat:byteSize'.@value"
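
`META_FILES_JSONPATH` for data.europa.eu uses a filter expression, which is why `base.py` now imports `parse` from `jsonpath_ng.ext` instead of `jsonpath_ng`: only the extended parser understands `[?(...)]` filters. A minimal illustration against an invented document:

```python
from jsonpath_ng.ext import parse

# Invented, simplified metadata document for illustration only.
doc = {
    "@graph": [
        {"@type": "dcat:Distribution", "dct:title": "observations.csv"},
        {"@type": "dcat:Dataset", "dct:title": "not a downloadable file"},
    ]
}

expr = parse('$.@graph[?(@.@type == "dcat:Distribution")]')
print([m.value["dct:title"] for m in expr.find(doc)])  # ['observations.csv']
```
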
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -15,7 +15,7 @@ classifiers = [
"Programming Language :: Python :: 3.12"
]
license = {text = "MIT"}
dependencies = ["jsonpath_ng", "requests", "requests-cache", "scitree", "tqdm"]
dependencies = ["jsonpath_ng", "pandas", "requests", "requests-cache", "scitree", "tqdm"]
dynamic = ["version"]
requires-python = ">=3.8"

9 changes: 9 additions & 0 deletions tests/test_repositories.toml
@@ -113,3 +113,12 @@ files = "cbsodata-main/README.md"
[[b2share]]
location = "https://b2share.eudat.eu/records/db2ef5890fa44c7a85af366a50de73b9"
files = "2024-02-13.sav"

[[seanoe]]
location = "https://doi.org/10.17882/101042"
files = "111609.xlsx"

[[dataeuropa]]
location = "https://data.europa.eu/data/datasets/65e092e4009f18f050b14216"
files = "consolidation-wattzhub-schema-irve-dynamic-20240918-033000.csv"
