implement checksum checking
Dave Tromp committed Jan 17, 2024
1 parent 338781b commit 7e75482
Showing 3 changed files with 89 additions and 0 deletions.
4 changes: 4 additions & 0 deletions datahugger/__main__.py
@@ -66,6 +66,9 @@ def main():
parser.add_argument("--no-unzip", dest="unzip", action="store_false")
parser.set_defaults(unzip=True)

parser.add_argument("--checksum", dest="checksum", action="store_false")
parser.set_defaults(checksum=False)

parser.add_argument("--no-progress", dest="progress", action="store_false")
parser.set_defaults(progress=True)

@@ -113,6 +116,7 @@ def main():
max_file_size=args.max_file_size,
force_download=args.force_download,
unzip=args.unzip,
checksum=args.checksum,
progress=args.progress,
print_only=args.print_only,
params=args.params,
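The new flag is off by default and is enabled from the command line. A hypothetical invocation, assuming the URL and output-folder positional arguments the datahugger CLI already defines (the Dryad DOI is only an example dataset):

datahugger https://doi.org/10.5061/dryad.x3ffbg7m8 data --checksum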
8 changes: 8 additions & 0 deletions datahugger/api.py
@@ -55,6 +55,7 @@ def info(
max_file_size=None,
force_download=False,
unzip=True,
checksum=False,
progress=True,
print_only=False,
params=None,
@@ -73,6 +74,8 @@
files in the destination folder. Default: False.
unzip: bool
Unzip if the output is a single zip file. Default: True.
checksum: bool
If True, verify the checksums of the downloaded files. Default: False.
progress: bool
Print the progress of the download. Default: True.
print_only: bool
@@ -96,6 +99,7 @@ def info(
max_file_size=max_file_size,
force_download=force_download,
unzip=unzip,
checksum=checksum,
progress=progress,
print_only=print_only,
params=params,
@@ -108,6 +112,7 @@ def get(
max_file_size=None,
force_download=False,
unzip=True,
checksum=False,
progress=True,
print_only=False,
params=None,
@@ -131,6 +136,8 @@
files in the destination folder. Default: False.
unzip: bool
Unzip if the output is a single zip file. Default: True.
checksum: bool
If True, verify the checksums of the downloaded files. Default: False.
progress: bool
Print the progress of the download. Default: True.
print_only: bool
@@ -151,6 +158,7 @@
max_file_size=max_file_size,
force_download=force_download,
unzip=unzip,
checksum=checksum,
progress=progress,
print_only=print_only,
params=params,
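The same switch is exposed through the Python API, where `checksum` threads from `get()` into the downloader. A minimal sketch of enabling it, assuming only the `get(url, output_folder, ...)` signature shown above (the DOI and folder name are placeholders):

import datahugger

# download a dataset and, once the files are on disk, verify their
# checksums against the hashes reported by the repository
datahugger.get("https://doi.org/10.5061/dryad.x3ffbg7m8", "data", checksum=True)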
77 changes: 77 additions & 0 deletions datahugger/base.py
@@ -1,4 +1,5 @@
import io
import json
import logging
import os
import re
@@ -11,6 +12,8 @@
from jsonpath_ng import parse
from scitree import scitree
from tqdm import tqdm
import hashlib
import pandas as pd

from datahugger.utils import _format_filename
from datahugger.utils import _get_url
@@ -51,6 +54,7 @@ def __init__(
force_download=False,
progress=True,
unzip=True,
checksum=False,
print_only=False,
params=None,
):
@@ -60,6 +64,7 @@
self.force_download = force_download
self.progress = progress
self.unzip = unzip
self.checksum = checksum
self.print_only = print_only
self.params = params

@@ -201,6 +206,72 @@ def _unpack_single_folder(self, zip_url, output_folder):
zip_info.filename = os.path.basename(zip_info.filename)
z.extract(zip_info, output_folder)


def _check_checksums(self, output_folder, files_info):
    """Compare the checksum values in files_info with the checksums of
    the downloaded files and write the results to a file in a new
    'generated' folder.

    Args:
        output_folder (str): folder the files were downloaded to
        files_info (list): information on all the files

    Example file output:
        file name: generated/checksums<timestamp>.json
        file content:
        {"BTCBRL_final.csv": true}
    """
    try:
        checksums = {}

        df = pd.DataFrame(files_info)

        # loop through the downloaded files in the output_folder
        for subdir, dirs, files in os.walk(output_folder):
            for file in files:
                filepath = os.path.join(subdir, file)
                df2 = df[df["name"] == file].reset_index()
                try:
                    expected_hash = df2["hash"][0]
                except (KeyError, IndexError):
                    expected_hash = None
                try:
                    hash_type = df2["hash_type"][0]
                except (KeyError, IndexError):
                    hash_type = None
                if expected_hash is None or hash_type is None:
                    continue
                computed_hash = None
                with open(filepath, "rb") as f:
                    # hashlib.new covers md5, sha1, sha224, sha256, sha384
                    # and sha512 without needing a branch per algorithm
                    if hash_type in hashlib.algorithms_available:
                        computed_hash = hashlib.new(hash_type, f.read()).hexdigest()
                hash_match = expected_hash == computed_hash
                status = f"---> Checksum match: {hash_match} - {file}"
                print(status)
                logging.info(status)
                checksums[file] = hash_match

        timestamp = str(int(time.time()))
        generated_path = os.path.join(output_folder, "generated")
        if not os.path.isdir(generated_path):
            os.mkdir(generated_path)
        with open(os.path.join(generated_path, f"checksums{timestamp}.json"), "w") as f:
            json.dump(checksums, f)
    except Exception as e:
        logging.error(f"Failed at checksum: {e}")

def _pre_files(self):
pass

@@ -312,7 +383,9 @@ def _get(
self._unpack_single_folder(self.files[0]["link"], output_folder)
return

files_info = list(self.files)
for f in self.files:
self.download_file(
f["link"],
output_folder,
@@ -321,6 +394,10 @@
file_hash=f["hash"],
file_hash_type=f["hash_type"],
)
# verify the checksums of the downloaded files if requested
if self.checksum:
    self._check_checksums(output_folder=output_folder, files_info=files_info)

def download(
self,
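Stripped of the bookkeeping, each file check hashes the downloaded bytes with the algorithm the repository reported and compares hex digests; the results land in generated/checksums<timestamp>.json under the output folder. A standalone sketch of both halves, assuming a hash type that hashlib supports and a data/ output folder (both placeholders):

import glob
import hashlib
import json

def checksum_matches(path, expected_hash, hash_type):
    # hash the file with the reported algorithm and compare
    # its hex digest with the repository-supplied value
    with open(path, "rb") as f:
        return hashlib.new(hash_type, f.read()).hexdigest() == expected_hash

# inspect the most recent report written by _check_checksums
latest = sorted(glob.glob("data/generated/checksums*.json"))[-1]
with open(latest) as f:
    print(json.load(f))  # e.g. {"BTCBRL_final.csv": true}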
