From 7e75482a08707b5ecb0a9e38eca733f9b1a20734 Mon Sep 17 00:00:00 2001
From: Dave Tromp
Date: Wed, 17 Jan 2024 13:32:23 +0100
Subject: [PATCH 1/5] implement checksum checking

---
 datahugger/__main__.py |  4 +++
 datahugger/api.py      |  8 +++++
 datahugger/base.py     | 77 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 89 insertions(+)

diff --git a/datahugger/__main__.py b/datahugger/__main__.py
index f833f0a..522a014 100644
--- a/datahugger/__main__.py
+++ b/datahugger/__main__.py
@@ -66,6 +66,9 @@ def main():
     parser.add_argument("--no-unzip", dest="unzip", action="store_false")
     parser.set_defaults(unzip=True)
 
+    parser.add_argument("--checksum", dest="checksum", action="store_true")
+    parser.set_defaults(checksum=False)
+
     parser.add_argument("--no-progress", dest="progress", action="store_false")
     parser.set_defaults(progress=True)
 
@@ -113,6 +116,7 @@ def main():
         max_file_size=args.max_file_size,
         force_download=args.force_download,
         unzip=args.unzip,
+        checksum=args.checksum,
         progress=args.progress,
         print_only=args.print_only,
         params=args.params,
diff --git a/datahugger/api.py b/datahugger/api.py
index e130e72..a09138e 100644
--- a/datahugger/api.py
+++ b/datahugger/api.py
@@ -55,6 +55,7 @@ def info(
     max_file_size=None,
     force_download=False,
     unzip=True,
+    checksum=False,
     progress=True,
     print_only=False,
     params=None,
@@ -73,6 +74,8 @@
         files in the destination folder. Default: False.
     unzip: bool
         Unzip is the output is a single zip file. Default: True.
+    checksum: bool
+        Checksum will check the checksum of downloaded files. Default: False.
     progress: bool
         Print the progress of the download. Default: True.
     print_only: bool
@@ -96,6 +99,7 @@
         max_file_size=max_file_size,
         force_download=force_download,
         unzip=unzip,
+        checksum=checksum,
         progress=progress,
         print_only=print_only,
         params=params,
@@ -108,6 +112,7 @@ def get(
     max_file_size=None,
     force_download=False,
     unzip=True,
+    checksum=False,
     progress=True,
     print_only=False,
     params=None,
@@ -131,6 +136,8 @@
         files in the destination folder. Default: False.
     unzip: bool
         Unzip is the output is a single zip file. Default: True.
+    checksum: bool
+        Checksum will check the checksum of downloaded files. Default: False.
     progress: bool
         Print the progress of the download. Default: True.
     print_only: bool
@@ -151,6 +158,7 @@ def get(
         max_file_size=max_file_size,
         force_download=force_download,
         unzip=unzip,
+        checksum=checksum,
         progress=progress,
         print_only=print_only,
         params=params,
diff --git a/datahugger/base.py b/datahugger/base.py
index cdb6024..eb2e185 100644
--- a/datahugger/base.py
+++ b/datahugger/base.py
@@ -1,4 +1,5 @@
 import io
+import json
 import logging
 import os
 import re
@@ -11,6 +12,8 @@
 from jsonpath_ng import parse
 from scitree import scitree
 from tqdm import tqdm
+import pandas as pd
+import hashlib
 
 from datahugger.utils import _format_filename
 from datahugger.utils import _get_url
@@ -51,6 +54,7 @@ def __init__(
         force_download=False,
         progress=True,
         unzip=True,
+        checksum=False,
         print_only=False,
         params=None,
     ):
@@ -60,6 +64,7 @@
         self.force_download = force_download
        self.progress = progress
         self.unzip = unzip
+        self.checksum = checksum
         self.print_only = print_only
         self.params = params
 
@@ -201,6 +206,72 @@ def _unpack_single_folder(self, zip_url, output_folder):
                 zip_info.filename = os.path.basename(zip_info.filename)
                 z.extract(zip_info, output_folder)
 
+
+    def _check_checksums(self, output_folder, files_info):
+        """Will compare the checksum values in the files_info with the checksums
+        of the downloaded files and will create a file in a new 'generated'
+        folder with the results.
+
+        Args:
+            output_folder (str): output_folder to push the data to
+            files_info (list): information on all the files
+
+        Example file output:
+            file name: generated/checksums.json
+            file content:
+                {"BTCBRL_final.csv": true}
+        """
+        try:
+            checksums = {}
+
+            df = pd.DataFrame(files_info)
+
+            # loop through the downloaded files in the output_folder
+            for subdir, dirs, files in os.walk(output_folder):
+                for file in files:
+                    filepath = os.path.join(subdir, file)
+                    df2 = df[df['name'] == file].reset_index()
+                    try:
+                        hash = df2['hash'][0]
+                    except:
+                        hash = None
+                    try:
+                        hash_type = df2['hash_type'][0]
+                    except:
+                        hash_type = None
+                    newhash = None
+                    with open(filepath, "rb") as f:
+                        if hash_type == 'md5':
+                            newhash = hashlib.md5(f.read()).hexdigest()
+                        if hash_type == 'sha1':
+                            newhash = hashlib.sha1(f.read()).hexdigest()
+                        if hash_type == 'sha224':
+                            newhash = hashlib.sha224(f.read()).hexdigest()
+                        if hash_type == 'sha256':
+                            newhash = hashlib.sha256(f.read()).hexdigest()
+                        if hash_type == 'sha384':
+                            newhash = hashlib.sha384(f.read()).hexdigest()
+                        if hash_type == 'sha512':
+                            newhash = hashlib.sha512(f.read()).hexdigest()
+                    hash_match = (hash == newhash)
+                    if hash is not None and hash_type is not None:
+                        status = f"---> Checksum match: {hash_match} - {file}"
+                        print(status)
+                        logging.info(status)
+                        checksums[file] = hash_match
+
+            try:
+                timestamp = str(time.time()).split('.')[0]
+            except:
+                timestamp = ""
+            generated_path = f"{output_folder}/generated"
+            if not os.path.isdir(generated_path):
+                os.mkdir(generated_path)
+            with open(f"{generated_path}/checksums{timestamp}.json", "w") as f:
+                json.dump(checksums, f)
+        except Exception as e:
+            logging.error(f"Failed at checksum: {e}")
+
     def _pre_files(self):
         pass
 
@@ -312,7 +383,9 @@ def _get(
             self._unpack_single_folder(self.files[0]["link"], output_folder)
             return
 
+        files_info = []
         for f in self.files:
+            files_info.append(f)
             self.download_file(
                 f["link"],
                 output_folder,
@@ -321,6 +394,10 @@ def _get(
                 file_hash=f["hash"],
                 file_hash_type=f["hash_type"],
             )
+        # if checksum==True do checking of checksum
+        if self.checksum:
+            self._check_checksums(output_folder=output_folder,
+                                  files_info=files_info)
 
     def download(
         self,
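
PATCH 1 wires the new option end to end: a `--checksum` flag in the CLI, a `checksum` keyword on `datahugger.info`/`datahugger.get`, and the verification itself in the base downloader. A minimal usage sketch of the Python side, assuming the Dryad dataset from the docstring example (the one containing BTCBRL_final.csv); the identifier is illustrative, not part of the patch:

    import datahugger

    # Download a dataset and verify every file against its published hash.
    # With checksum=True, a report mapping each file name to True/False is
    # written to <output_folder>/generated/checksums<timestamp>.json
    datahugger.get("10.5061/dryad.x3ffbg7m8", "data", checksum=True)

The CLI equivalent would use the flag added to datahugger/__main__.py above, e.g. `datahugger 10.5061/dryad.x3ffbg7m8 data --checksum`.
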
From 6557f796b479bda8acac249bc0f8c3cec3fb92a5 Mon Sep 17 00:00:00 2001
From: Dave Tromp
Date: Wed, 17 Jan 2024 14:07:41 +0100
Subject: [PATCH 2/5] add missing import and fix bare excepts

---
 datahugger/base.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/datahugger/base.py b/datahugger/base.py
index eb2e185..af06e1e 100644
--- a/datahugger/base.py
+++ b/datahugger/base.py
@@ -3,6 +3,7 @@
 import logging
 import os
 import re
+import time
 import zipfile
 from pathlib import Path
 from typing import Union
@@ -233,11 +234,13 @@ def _check_checksums(self, output_folder, files_info):
                     df2 = df[df['name'] == file].reset_index()
                     try:
                         hash = df2['hash'][0]
-                    except:
+                    except Exception as e:
+                        logging.info(f"Setting hash to None: {e}")
                         hash = None
                     try:
                         hash_type = df2['hash_type'][0]
-                    except:
+                    except Exception as e:
+                        logging.info(f"Setting hash_type to None: {e}")
                         hash_type = None
                     newhash = None
                     with open(filepath, "rb") as f:

From 354e7be7c3f5a980d53aa3775176bf050b811cf5 Mon Sep 17 00:00:00 2001
From: Dave Tromp
Date: Wed, 17 Jan 2024 14:27:13 +0100
Subject: [PATCH 3/5] fixing some linting issues

---
 datahugger/base.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/datahugger/base.py b/datahugger/base.py
index af06e1e..22ea8ea 100644
--- a/datahugger/base.py
+++ b/datahugger/base.py
@@ -229,6 +229,7 @@ def _check_checksums(self, output_folder, files_info):
 
             # loop through the downloaded files in the output_folder
             for subdir, dirs, files in os.walk(output_folder):
+                logging.info(f"Not using the dirs: {dirs}")
                 for file in files:
                     filepath = os.path.join(subdir, file)
                     df2 = df[df['name'] == file].reset_index()
@@ -265,7 +266,8 @@ def _check_checksums(self, output_folder, files_info):
 
             try:
                 timestamp = str(time.time()).split('.')[0]
-            except:
+            except Exception as e:
+                logging.info(f"Setting timestamp to empty string: {e}")
                 timestamp = ""
             generated_path = f"{output_folder}/generated"
             if not os.path.isdir(generated_path):

From 4203a6ae1df8d8cbf38e52ef749c19c57b423b59 Mon Sep 17 00:00:00 2001
From: Dave Tromp
Date: Wed, 17 Jan 2024 14:37:59 +0100
Subject: [PATCH 4/5] reorganize and reformat base.py

---
 datahugger/base.py | 42 ++++++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/datahugger/base.py b/datahugger/base.py
index 22ea8ea..b3f3a9c 100644
--- a/datahugger/base.py
+++ b/datahugger/base.py
@@ -1,3 +1,4 @@
+import hashlib
 import io
 import json
 import logging
@@ -9,16 +10,13 @@
 from typing import Union
 from urllib.parse import urlparse
 
+import pandas as pd
 import requests
 from jsonpath_ng import parse
 from scitree import scitree
 from tqdm import tqdm
-import pandas as pd
-import hashlib
 
-from datahugger.utils import _format_filename
-from datahugger.utils import _get_url
-from datahugger.utils import _is_url
+from datahugger.utils import _format_filename, _get_url, _is_url
 
 
 class DownloadResult:
@@ -207,7 +205,6 @@ def _unpack_single_folder(self, zip_url, output_folder):
                 zip_info.filename = os.path.basename(zip_info.filename)
                 z.extract(zip_info, output_folder)
 
-
     def _check_checksums(self, output_folder, files_info):
         """Will compare the checksum values in the files_info with the checksums
         of the downloaded files and will create a file in a new 'generated'
@@ -224,7 +221,7 @@ def _check_checksums(self, output_folder, files_info):
         """
         try:
             checksums = {}
-
+
             df = pd.DataFrame(files_info)
 
             # loop through the downloaded files in the output_folder
@@ -232,40 +229,40 @@ def _check_checksums(self, output_folder, files_info):
                 logging.info(f"Not using the dirs: {dirs}")
                 for file in files:
                     filepath = os.path.join(subdir, file)
-                    df2 = df[df['name'] == file].reset_index()
+                    df2 = df[df["name"] == file].reset_index()
                     try:
-                        hash = df2['hash'][0]
+                        hash = df2["hash"][0]
                     except Exception as e:
                         logging.info(f"Setting hash to None: {e}")
                         hash = None
                     try:
-                        hash_type = df2['hash_type'][0]
+                        hash_type = df2["hash_type"][0]
                     except Exception as e:
                         logging.info(f"Setting hash_type to None: {e}")
                         hash_type = None
                     newhash = None
                     with open(filepath, "rb") as f:
-                        if hash_type == 'md5':
+                        if hash_type == "md5":
                             newhash = hashlib.md5(f.read()).hexdigest()
-                        if hash_type == 'sha1':
+                        if hash_type == "sha1":
                             newhash = hashlib.sha1(f.read()).hexdigest()
-                        if hash_type == 'sha224':
+                        if hash_type == "sha224":
                             newhash = hashlib.sha224(f.read()).hexdigest()
-                        if hash_type == 'sha256':
+                        if hash_type == "sha256":
                             newhash = hashlib.sha256(f.read()).hexdigest()
-                        if hash_type == 'sha384':
+                        if hash_type == "sha384":
                             newhash = hashlib.sha384(f.read()).hexdigest()
-                        if hash_type == 'sha512':
+                        if hash_type == "sha512":
                             newhash = hashlib.sha512(f.read()).hexdigest()
-                    hash_match = (hash == newhash)
+                    hash_match = hash == newhash
                     if hash is not None and hash_type is not None:
                         status = f"---> Checksum match: {hash_match} - {file}"
                         print(status)
                         logging.info(status)
                         checksums[file] = hash_match
-
+
             try:
-                timestamp = str(time.time()).split('.')[0]
+                timestamp = str(time.time()).split(".")[0]
             except Exception as e:
                 logging.info(f"Setting timestamp to empty string: {e}")
                 timestamp = ""
@@ -331,7 +328,8 @@ def _get_files_recursive(self, url, folder_name=None, base_url=None):
 
         if next_url:
             result.extend(
-                self._get_files_recursive(next_url, folder_name=folder_name)
+                self._get_files_recursive(
+                    next_url, folder_name=folder_name)
             )
 
         return result
@@ -401,8 +399,8 @@ def _get(
             )
         # if checksum==True do checking of checksum
         if self.checksum:
-            self._check_checksums(output_folder=output_folder,
-                                  files_info=files_info)
+            self._check_checksums(
+                output_folder=output_folder, files_info=files_info)
 
     def download(
         self,

From 9ab71392ec5eb79e1b50bbde00302296a3d6e7f0 Mon Sep 17 00:00:00 2001
From: Jonathan de Bruin
Date: Tue, 26 Mar 2024 22:16:49 +0100
Subject: [PATCH 5/5] Happy lint

---
 datahugger/base.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/datahugger/base.py b/datahugger/base.py
index b3f3a9c..f08bfdb 100644
--- a/datahugger/base.py
+++ b/datahugger/base.py
@@ -16,7 +16,9 @@
 from scitree import scitree
 from tqdm import tqdm
 
-from datahugger.utils import _format_filename, _get_url, _is_url
+from datahugger.utils import _format_filename
+from datahugger.utils import _get_url
+from datahugger.utils import _is_url
 
 
 class DownloadResult:
@@ -328,8 +330,7 @@ def _get_files_recursive(self, url, folder_name=None, base_url=None):
 
         if next_url:
             result.extend(
-                self._get_files_recursive(
-                    next_url, folder_name=folder_name)
+                self._get_files_recursive(next_url, folder_name=folder_name)
             )
 
         return result
@@ -399,8 +400,7 @@ def _get(
             )
         # if checksum==True do checking of checksum
         if self.checksum:
-            self._check_checksums(
-                output_folder=output_folder, files_info=files_info)
+            self._check_checksums(output_folder=output_folder, files_info=files_info)
 
     def download(
         self,
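
A note on the hash dispatch: the chain of `if hash_type == ...` branches in `_check_checksums` covers exactly the algorithms that `hashlib` exposes by name, so the same comparison can be written generically with `hashlib.new`. A standalone sketch of that per-file check — an illustration, not code from the patches above; the function name and the chunked reading are assumptions:

    import hashlib

    # hash_type is any algorithm name hashlib.new accepts, e.g.
    # "md5", "sha1", "sha224", "sha256", "sha384", or "sha512".
    def file_hash_matches(filepath, expected_hash, hash_type):
        if expected_hash is None or hash_type is None:
            return False
        h = hashlib.new(hash_type)
        with open(filepath, "rb") as f:
            # read in chunks so large downloads need not fit in memory
            for chunk in iter(lambda: f.read(8192), b""):
                h.update(chunk)
        return h.hexdigest() == expected_hash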