From 7b6fedcdb3d1c54c8798741eb811a50bcea702f8 Mon Sep 17 00:00:00 2001
From: Jonathan de Bruin
Date: Mon, 24 Jun 2024 12:49:31 +0200
Subject: [PATCH] Remove pandas dep and fix argument checksum on CLI

---
 datahugger/__main__.py |  3 +--
 datahugger/base.py     | 27 ++++++++++++---------------
 pyproject.toml         |  2 +-
 3 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/datahugger/__main__.py b/datahugger/__main__.py
index 522a014..eb97a8d 100644
--- a/datahugger/__main__.py
+++ b/datahugger/__main__.py
@@ -66,8 +66,7 @@ def main():
     parser.add_argument("--no-unzip", dest="unzip", action="store_false")
     parser.set_defaults(unzip=True)
 
-    parser.add_argument("--checksum", dest="checksum", action="store_false")
-    parser.set_defaults(checksum=False)
+    parser.add_argument("--checksum", dest="checksum", action="store_true")
 
     parser.add_argument("--no-progress", dest="progress", action="store_false")
     parser.set_defaults(progress=True)
diff --git a/datahugger/base.py b/datahugger/base.py
index f08bfdb..cd7797b 100644
--- a/datahugger/base.py
+++ b/datahugger/base.py
@@ -10,7 +10,6 @@
 from typing import Union
 from urllib.parse import urlparse
 
-import pandas as pd
 import requests
 from jsonpath_ng import parse
 from scitree import scitree
@@ -224,24 +223,22 @@ def _check_checksums(self, output_folder, files_info):
         try:
             checksums = {}
 
-            df = pd.DataFrame(files_info)
-
             # loop through the downloaded files in the output_folder
             for subdir, dirs, files in os.walk(output_folder):
                 logging.info(f"Not using the dirs: {dirs}")
                 for file in files:
                     filepath = os.path.join(subdir, file)
-                    df2 = df[df["name"] == file].reset_index()
+
+                    file_comp = list(filter(lambda x: x["name"] == file, files_info))
+
                     try:
-                        hash = df2["hash"][0]
-                    except Exception as e:
-                        logging.info(f"Setting hash to None: {e}")
+                        hash = file_comp[0]["hash"]
+                        hash_type = file_comp[0]["hash_type"]
+                    except IndexError:
+                        logging.info("Setting hash and hash_type to None")
                         hash = None
-                    try:
-                        hash_type = df2["hash_type"][0]
-                    except Exception as e:
-                        logging.info(f"Setting hash_type to None: {e}")
                         hash_type = None
+
                     newhash = None
                     with open(filepath, "rb") as f:
                         if hash_type == "md5":
@@ -257,10 +254,10 @@ def _check_checksums(self, output_folder, files_info):
                         if hash_type == "sha512":
                             newhash = hashlib.sha512(f.read()).hexdigest()
                     hash_match = hash == newhash
+
                     if hash is not None and hash_type is not None:
-                        status = f"---> Checksum match: {hash_match} - {file}"
-                        print(status)
-                        logging.info(status)
+                        print(f"Checksum match: {hash_match} - {file}")
+                        logging.info(f"Checksum match: {hash_match} - {file}")
                         checksums[file] = hash_match
 
             try:
@@ -398,7 +395,7 @@ def _get(
                 file_hash=f["hash"],
                 file_hash_type=f["hash_type"],
             )
-        # if checksum==True do checking of checksum
+
         if self.checksum:
             self._check_checksums(output_folder=output_folder, files_info=files_info)
 
diff --git a/pyproject.toml b/pyproject.toml
index 646a5c2..b0aecd3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,7 @@ datahugger = "datahugger.__main__:main"
 
 [project.optional-dependencies]
 all = ["datasets"]
-benchmark = ["pandas", "requests", "tabulate"]
+benchmark = ["pandas", "tabulate"]
 lint = ["ruff"]
 test = ["pytest"]
 docs = ["mkdocs-material"]