Skip to content

Commit

Permalink
Remove pandas dep and fix argument checksum on CLI
Browse files: browse the repository at this point in the history
  • Loading branch information
J535D165 committed Jun 24, 2024
1 parent 4935c64 commit 7b6fedc
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 18 deletions.
3 changes: 1 addition & 2 deletions datahugger/__main__.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -66,8 +66,7 @@ def main():
parser.add_argument("--no-unzip", dest="unzip", action="store_false")
parser.set_defaults(unzip=True)

- parser.add_argument("--checksum", dest="checksum", action="store_false")
- parser.set_defaults(checksum=False)
+ parser.add_argument("--checksum", dest="checksum", action="store_true")

parser.add_argument("--no-progress", dest="progress", action="store_false")
parser.set_defaults(progress=True)
Expand Down
27 changes: 12 additions & 15 deletions datahugger/base.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -10,7 +10,6 @@
from typing import Union
from urllib.parse import urlparse

- import pandas as pd
import requests
from jsonpath_ng import parse
from scitree import scitree
Expand Down Expand Up @@ -224,24 +223,22 @@ def _check_checksums(self, output_folder, files_info):
try:
checksums = {}

- df = pd.DataFrame(files_info)
-
# loop through the downloaded files in the output_folder
for subdir, dirs, files in os.walk(output_folder):
logging.info(f"Not using the dirs: {dirs}")
for file in files:
filepath = os.path.join(subdir, file)
- df2 = df[df["name"] == file].reset_index()
+
+ file_comp = list(filter(lambda x: x["name"] == file, files_info))
+
try:
- hash = df2["hash"][0]
- except Exception as e:
- logging.info(f"Setting hash to None: {e}")
+ hash = file_comp[0]["hash"]
+ hash_type = file_comp[0]["hash_type"]
+ except IndexError:
+ logging.info("Setting hash and hash_type to None")
hash = None
- try:
- hash_type = df2["hash_type"][0]
- except Exception as e:
- logging.info(f"Setting hash_type to None: {e}")
hash_type = None
+
newhash = None
with open(filepath, "rb") as f:
if hash_type == "md5":
Expand All @@ -257,10 +254,10 @@ def _check_checksums(self, output_folder, files_info):
if hash_type == "sha512":
newhash = hashlib.sha512(f.read()).hexdigest()
hash_match = hash == newhash
+
if hash is not None and hash_type is not None:
- status = f"---> Checksum match: {hash_match} - {file}"
- print(status)
- logging.info(status)
+ print(f"Checksum match: {hash_match} - {file}")
+ logging.info(f"Checksum match: {hash_match} - {file}")
checksums[file] = hash_match

try:
Expand Down Expand Up @@ -398,7 +395,7 @@ def _get(
file_hash=f["hash"],
file_hash_type=f["hash_type"],
)
- # if checksum==True do checking of checksum
+
if self.checksum:
self._check_checksums(output_folder=output_folder, files_info=files_info)

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number | Diff line number | Diff line change
Expand Up @@ -24,7 +24,7 @@ datahugger = "datahugger.__main__:main"

[project.optional-dependencies]
all = ["datasets"]
- benchmark = ["pandas", "requests", "tabulate"]
+ benchmark = ["pandas", "tabulate"]
lint = ["ruff"]
test = ["pytest"]
docs = ["mkdocs-material"]
Expand Down

0 comments on commit 7b6fedc

Please sign in to comment.