From 7b6fedcdb3d1c54c8798741eb811a50bcea702f8 Mon Sep 17 00:00:00 2001
From: Jonathan de Bruin
Date: Mon, 24 Jun 2024 12:49:31 +0200
Subject: [PATCH] Remove pandas dep and fix argument checksum on CLI

---
 datahugger/__main__.py |  3 +--
 datahugger/base.py     | 27 ++++++++++++---------------
 pyproject.toml         |  2 +-
 3 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/datahugger/__main__.py b/datahugger/__main__.py
index 522a014..eb97a8d 100644
--- a/datahugger/__main__.py
+++ b/datahugger/__main__.py
@@ -66,8 +66,7 @@ def main():
     parser.add_argument("--no-unzip", dest="unzip", action="store_false")
     parser.set_defaults(unzip=True)
 
-    parser.add_argument("--checksum", dest="checksum", action="store_false")
-    parser.set_defaults(checksum=False)
+    parser.add_argument("--checksum", dest="checksum", action="store_true")
 
     parser.add_argument("--no-progress", dest="progress", action="store_false")
     parser.set_defaults(progress=True)
diff --git a/datahugger/base.py b/datahugger/base.py
index f08bfdb..cd7797b 100644
--- a/datahugger/base.py
+++ b/datahugger/base.py
@@ -10,7 +10,6 @@
 from typing import Union
 from urllib.parse import urlparse
 
-import pandas as pd
 import requests
 from jsonpath_ng import parse
 from scitree import scitree
@@ -224,24 +223,22 @@ def _check_checksums(self, output_folder, files_info):
         try:
             checksums = {}
 
-            df = pd.DataFrame(files_info)
-
             # loop through the downloaded files in the output_folder
             for subdir, dirs, files in os.walk(output_folder):
                 logging.info(f"Not using the dirs: {dirs}")
                 for file in files:
                     filepath = os.path.join(subdir, file)
-                    df2 = df[df["name"] == file].reset_index()
+
+                    file_comp = list(filter(lambda x: x["name"] == file, files_info))
+
                     try:
-                        hash = df2["hash"][0]
-                    except Exception as e:
-                        logging.info(f"Setting hash to None: {e}")
+                        hash = file_comp[0]["hash"]
+                        hash_type = file_comp[0]["hash_type"]
+                    except IndexError:
+                        logging.info("Setting hash and hash_type to None")
                         hash = None
-                    try:
-                        hash_type = df2["hash_type"][0]
-                    except Exception as e:
-                        logging.info(f"Setting hash_type to None: {e}")
                         hash_type = None
+
                     newhash = None
                     with open(filepath, "rb") as f:
                         if hash_type == "md5":
@@ -257,10 +254,10 @@ def _check_checksums(self, output_folder, files_info):
                         if hash_type == "sha512":
                             newhash = hashlib.sha512(f.read()).hexdigest()
                     hash_match = hash == newhash
+
                     if hash is not None and hash_type is not None:
-                        status = f"---> Checksum match: {hash_match} - {file}"
-                        print(status)
-                        logging.info(status)
+                        print(f"Checksum match: {hash_match} - {file}")
+                        logging.info(f"Checksum match: {hash_match} - {file}")
                         checksums[file] = hash_match
 
             try:
@@ -398,7 +395,7 @@ def _get(
                 file_hash=f["hash"],
                 file_hash_type=f["hash_type"],
             )
-        # if checksum==True do checking of checksum
+
         if self.checksum:
             self._check_checksums(output_folder=output_folder, files_info=files_info)
 
diff --git a/pyproject.toml b/pyproject.toml
index 646a5c2..b0aecd3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,7 @@ datahugger = "datahugger.__main__:main"
 
 [project.optional-dependencies]
 all = ["datasets"]
-benchmark = ["pandas", "requests", "tabulate"]
+benchmark = ["pandas", "tabulate"]
 lint = ["ruff"]
 test = ["pytest"]
 docs = ["mkdocs-material"]