diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index dec5926..b6aad02 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,11 +1,34 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: Python package +name: Test repositories on: [push, pull_request] jobs: - build: + test-repositories: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install . + python -m pip install pytest pytest-xdist + - name: Test with pytest + run: | + pytest -n 4 --ignore=tests/test_repositories_plus.py + + test-repositories-plus: runs-on: ubuntu-latest strategy: @@ -26,4 +49,4 @@ jobs: python -m pip install pytest pytest-xdist - name: Test with pytest run: | - pytest -n 4 + pytest -n 4 tests/test_repositories_plus.py diff --git a/datahugger/__main__.py b/datahugger/__main__.py index 522a014..eb97a8d 100644 --- a/datahugger/__main__.py +++ b/datahugger/__main__.py @@ -66,8 +66,7 @@ def main(): parser.add_argument("--no-unzip", dest="unzip", action="store_false") parser.set_defaults(unzip=True) - parser.add_argument("--checksum", dest="checksum", action="store_false") - parser.set_defaults(checksum=False) + parser.add_argument("--checksum", dest="checksum", action="store_true") parser.add_argument("--no-progress", dest="progress", action="store_false") parser.set_defaults(progress=True) diff --git a/datahugger/base.py b/datahugger/base.py index f08bfdb..cd7797b 100644 --- a/datahugger/base.py +++ b/datahugger/base.py @@ -10,7 +10,6 @@ from typing import Union from urllib.parse import urlparse -import pandas as pd import requests from jsonpath_ng import parse from scitree import scitree @@ -224,24 +223,22 @@ def _check_checksums(self, output_folder, files_info): try: checksums = {} - df = pd.DataFrame(files_info) - # loop through the downloaded files in the output_folder for subdir, dirs, files in os.walk(output_folder): logging.info(f"Not using the dirs: {dirs}") for file in files: filepath = os.path.join(subdir, file) - df2 = df[df["name"] == file].reset_index() + + file_comp = list(filter(lambda x: x["name"] == file, files_info)) + try: - hash = df2["hash"][0] - except Exception as e: - logging.info(f"Setting hash to None: {e}") + hash = file_comp[0]["hash"] + hash_type = file_comp[0]["hash_type"] + except IndexError: + logging.info("Setting hash and hash_type to None") hash = None - try: - hash_type = df2["hash_type"][0] - except Exception as e: - logging.info(f"Setting hash_type to None: {e}") hash_type = None + newhash = None with open(filepath, "rb") as f: if hash_type == "md5": @@ -257,10 +254,10 @@ def _check_checksums(self, output_folder, files_info): if hash_type == "sha512": newhash = hashlib.sha512(f.read()).hexdigest() hash_match = hash == newhash + if hash is not None and hash_type is not None: - status = f"---> Checksum match: {hash_match} - {file}" - print(status) - logging.info(status) + print(f"Checksum match: {hash_match} - {file}") + logging.info(f"Checksum match: {hash_match} - {file}") checksums[file] = hash_match try: @@ -398,7 +395,7 @@ def _get( file_hash=f["hash"], file_hash_type=f["hash_type"], ) - # if checksum==True do checking of checksum + if self.checksum: self._check_checksums(output_folder=output_folder, files_info=files_info) diff --git a/pyproject.toml b/pyproject.toml index 646a5c2..b0aecd3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ datahugger = "datahugger.__main__:main" [project.optional-dependencies] all = ["datasets"] -benchmark = ["pandas", "requests", "tabulate"] +benchmark = ["pandas", "tabulate"] lint = ["ruff"] test = ["pytest"] docs = ["mkdocs-material"] diff --git a/tests/test_repositories.py b/tests/test_repositories.py index aca2a6e..304aa87 100644 --- a/tests/test_repositories.py +++ b/tests/test_repositories.py @@ -53,8 +53,6 @@ "https://repositorioinstitucional.ceu.es/handle/10637/2741", "Aquaporin_1_JAMartin_et_al_MedSport_2009.pdf", ), - # huggingface - # ("10.57967/hf/0034", "test.csv"), # Pangaea ("https://doi.org/10.1594/PANGAEA.954547", "Gubbio_age.tab"), ("https://doi.pangaea.de/10.1594/PANGAEA.954543", "AA_age.tab"), @@ -99,16 +97,3 @@ def test_info_without_loading(tmpdir): dh_info = datahugger.info("https://osf.io/wdzh5/") assert dh_get.dataset.files == dh_info.files - - -def test_huggingface(tmpdir): - datahugger.get( - "https://huggingface.co/datasets/wikitext", - tmpdir, - params={"name": "wikitext-2-v1"}, - ) - - -def test_huggingface_without_params(tmpdir): - with pytest.raises(ValueError): - datahugger.get("https://huggingface.co/datasets/wikitext", tmpdir) diff --git a/tests/test_repositories_plus.py b/tests/test_repositories_plus.py new file mode 100644 index 0000000..622d497 --- /dev/null +++ b/tests/test_repositories_plus.py @@ -0,0 +1,16 @@ +import pytest + +import datahugger + + +def test_huggingface(tmpdir): + datahugger.get( + "https://huggingface.co/datasets/wikitext", + tmpdir, + params={"name": "wikitext-2-v1"}, + ) + + +def test_huggingface_without_params(tmpdir): + with pytest.raises(ValueError): + datahugger.get("https://huggingface.co/datasets/wikitext", tmpdir)