From 7e75482a08707b5ecb0a9e38eca733f9b1a20734 Mon Sep 17 00:00:00 2001
From: Dave Tromp
Date: Wed, 17 Jan 2024 13:32:23 +0100
Subject: [PATCH 1/5] implement checksum checking

---
 datahugger/__main__.py |  4 +++
 datahugger/api.py      |  8 +++++
 datahugger/base.py     | 77 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 89 insertions(+)

diff --git a/datahugger/__main__.py b/datahugger/__main__.py
index f833f0a..522a014 100644
--- a/datahugger/__main__.py
+++ b/datahugger/__main__.py
@@ -66,6 +66,9 @@ def main():
     parser.add_argument("--no-unzip", dest="unzip", action="store_false")
     parser.set_defaults(unzip=True)
 
+    parser.add_argument("--checksum", dest="checksum", action="store_true")
+    parser.set_defaults(checksum=False)
+
     parser.add_argument("--no-progress", dest="progress", action="store_false")
     parser.set_defaults(progress=True)
 
@@ -113,6 +116,7 @@ def main():
         max_file_size=args.max_file_size,
         force_download=args.force_download,
         unzip=args.unzip,
+        checksum=args.checksum,
         progress=args.progress,
         print_only=args.print_only,
         params=args.params,
diff --git a/datahugger/api.py b/datahugger/api.py
index e130e72..a09138e 100644
--- a/datahugger/api.py
+++ b/datahugger/api.py
@@ -55,6 +55,7 @@ def info(
     max_file_size=None,
     force_download=False,
     unzip=True,
+    checksum=False,
     progress=True,
     print_only=False,
     params=None,
@@ -73,6 +74,8 @@
         files in the destination folder. Default: False.
     unzip: bool
         Unzip is the output is a single zip file. Default: True.
+    checksum: bool
+        Checksum will check the checksum of downloaded files. Default: False.
     progress: bool
         Print the progress of the download. Default: True.
     print_only: bool
@@ -96,6 +99,7 @@
         max_file_size=max_file_size,
         force_download=force_download,
         unzip=unzip,
+        checksum=checksum,
         progress=progress,
         print_only=print_only,
         params=params,
@@ -108,6 +112,7 @@ def get(
     max_file_size=None,
     force_download=False,
     unzip=True,
+    checksum=False,
     progress=True,
     print_only=False,
     params=None,
@@ -131,6 +136,8 @@
         files in the destination folder. Default: False.
     unzip: bool
         Unzip is the output is a single zip file. Default: True.
+    checksum: bool
+        Checksum will check the checksum of downloaded files. Default: False.
     progress: bool
         Print the progress of the download. Default: True.
     print_only: bool
@@ -151,6 +158,7 @@ def get(
         max_file_size=max_file_size,
         force_download=force_download,
         unzip=unzip,
+        checksum=checksum,
         progress=progress,
         print_only=print_only,
         params=params,
diff --git a/datahugger/base.py b/datahugger/base.py
index cdb6024..eb2e185 100644
--- a/datahugger/base.py
+++ b/datahugger/base.py
@@ -1,4 +1,5 @@
 import io
+import json
 import logging
 import os
 import re
@@ -11,6 +12,8 @@
 from jsonpath_ng import parse
 from scitree import scitree
 from tqdm import tqdm
+import pandas as pd
+import hashlib
 
 from datahugger.utils import _format_filename
 from datahugger.utils import _get_url
@@ -51,6 +54,7 @@ def __init__(
         force_download=False,
         progress=True,
         unzip=True,
+        checksum=False,
         print_only=False,
         params=None,
     ):
@@ -60,6 +64,7 @@
         self.force_download = force_download
        self.progress = progress
         self.unzip = unzip
+        self.checksum = checksum
         self.print_only = print_only
         self.params = params
 
@@ -201,6 +206,72 @@ def _unpack_single_folder(self, zip_url, output_folder):
                 zip_info.filename = os.path.basename(zip_info.filename)
                 z.extract(zip_info, output_folder)
 
+
+    def _check_checksums(self, output_folder, files_info):
+        """Will compare the checksum values in the files_info with the checksums
+        of the downloaded files and will create a file in a new 'generated'
+        folder with the results.
+
+        Args:
+            output_folder (str): output_folder to push the data to
+            files_info (list): information on all the files
+
+        Example file output:
+            file name: generated/checksums.json
+            file content:
+                {"BTCBRL_final.csv": true}
+        """
+        try:
+            checksums = {}
+
+            df = pd.DataFrame(files_info)
+
+            # loop through the downloaded files in the output_folder
+            for subdir, dirs, files in os.walk(output_folder):
+                for file in files:
+                    filepath = os.path.join(subdir, file)
+                    df2 = df[df['name'] == file].reset_index()
+                    try:
+                        hash = df2['hash'][0]
+                    except:
+                        hash = None
+                    try:
+                        hash_type = df2['hash_type'][0]
+                    except:
+                        hash_type = None
+                    newhash = None
+                    with open(filepath, "rb") as f:
+                        if hash_type == 'md5':
+                            newhash = hashlib.md5(f.read()).hexdigest()
+                        if hash_type == 'sha1':
+                            newhash = hashlib.sha1(f.read()).hexdigest()
+                        if hash_type == 'sha224':
+                            newhash = hashlib.sha224(f.read()).hexdigest()
+                        if hash_type == 'sha256':
+                            newhash = hashlib.sha256(f.read()).hexdigest()
+                        if hash_type == 'sha384':
+                            newhash = hashlib.sha384(f.read()).hexdigest()
+                        if hash_type == 'sha512':
+                            newhash = hashlib.sha512(f.read()).hexdigest()
+                    hash_match = (hash == newhash)
+                    if hash is not None and hash_type is not None:
+                        status = f"---> Checksum match: {hash_match} - {file}"
+                        print(status)
+                        logging.info(status)
+                        checksums[file] = hash_match
+
+            try:
+                timestamp = str(time.time()).split('.')[0]
+            except:
+                timestamp = ""
+            generated_path = f"{output_folder}/generated"
+            if not os.path.isdir(generated_path):
+                os.mkdir(generated_path)
+            with open(f"{generated_path}/checksums{timestamp}.json", "w") as f:
+                json.dump(checksums, f)
+        except Exception as e:
+            logging.error(f"Failed at checksum: {e}")
+
     def _pre_files(self):
         pass
 
@@ -312,7 +383,9 @@ def _get(
             self._unpack_single_folder(self.files[0]["link"], output_folder)
             return
 
+        files_info = []
         for f in self.files:
+            files_info.append(f)
             self.download_file(
                 f["link"],
                 output_folder,
@@ -321,6 +394,10 @@ def _get(
                 file_hash=f["hash"],
                 file_hash_type=f["hash_type"],
             )
+        # if checksum==True do checking of checksum
+        if self.checksum:
+            self._check_checksums(output_folder=output_folder,
+                                  files_info=files_info)
 
     def download(
         self,
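
PATCH 1 wires the new option end to end: a `--checksum` flag in the CLI, a `checksum` keyword on `datahugger.info`/`datahugger.get`, and the verification itself in the base downloader. A minimal usage sketch of the Python side, assuming the Dryad dataset from the docstring example (the one containing BTCBRL_final.csv); the identifier is illustrative, not part of the patch:

    import datahugger

    # Download a dataset and verify every file against its published hash.
    # With checksum=True, a report mapping each file name to True/False is
    # written to <output_folder>/generated/checksums<timestamp>.json
    datahugger.get("10.5061/dryad.x3ffbg7m8", "data", checksum=True)

The CLI equivalent would use the flag added to datahugger/__main__.py above, e.g. `datahugger 10.5061/dryad.x3ffbg7m8 data --checksum`.
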
From 6557f796b479bda8acac249bc0f8c3cec3fb92a5 Mon Sep 17 00:00:00 2001
From: Dave Tromp
Date: Wed, 17 Jan 2024 14:07:41 +0100
Subject: [PATCH 2/5] add missing import and fix bare excepts

---
 datahugger/base.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/datahugger/base.py b/datahugger/base.py
index eb2e185..af06e1e 100644
--- a/datahugger/base.py
+++ b/datahugger/base.py
@@ -3,6 +3,7 @@
 import logging
 import os
 import re
+import time
 import zipfile
 from pathlib import Path
 from typing import Union
@@ -233,11 +234,13 @@ def _check_checksums(self, output_folder, files_info):
                     df2 = df[df['name'] == file].reset_index()
                     try:
                         hash = df2['hash'][0]
-                    except:
+                    except Exception as e:
+                        logging.info(f"Setting hash to None: {e}")
                         hash = None
                     try:
                         hash_type = df2['hash_type'][0]
-                    except:
+                    except Exception as e:
+                        logging.info(f"Setting hash_type to None: {e}")
                         hash_type = None
                     newhash = None
                     with open(filepath, "rb") as f:

From 354e7be7c3f5a980d53aa3775176bf050b811cf5 Mon Sep 17 00:00:00 2001
From: Dave Tromp
Date: Wed, 17 Jan 2024 14:27:13 +0100
Subject: [PATCH 3/5] fixing some linting issues

---
 datahugger/base.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/datahugger/base.py b/datahugger/base.py
index af06e1e..22ea8ea 100644
--- a/datahugger/base.py
+++ b/datahugger/base.py
@@ -229,6 +229,7 @@ def _check_checksums(self, output_folder, files_info):
 
             # loop through the downloaded files in the output_folder
             for subdir, dirs, files in os.walk(output_folder):
+                logging.info(f"Not using the dirs: {dirs}")
                 for file in files:
                     filepath = os.path.join(subdir, file)
                     df2 = df[df['name'] == file].reset_index()
@@ -265,7 +266,8 @@ def _check_checksums(self, output_folder, files_info):
 
             try:
                 timestamp = str(time.time()).split('.')[0]
-            except:
+            except Exception as e:
+                logging.info(f"Setting timestamp to empty string: {e}")
                 timestamp = ""
             generated_path = f"{output_folder}/generated"
             if not os.path.isdir(generated_path):

From 4203a6ae1df8d8cbf38e52ef749c19c57b423b59 Mon Sep 17 00:00:00 2001
From: Dave Tromp
Date: Wed, 17 Jan 2024 14:37:59 +0100
Subject: [PATCH 4/5] reorganize and reformat base.py

---
 datahugger/base.py | 42 ++++++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/datahugger/base.py b/datahugger/base.py
index 22ea8ea..b3f3a9c 100644
--- a/datahugger/base.py
+++ b/datahugger/base.py
@@ -1,3 +1,4 @@
+import hashlib
 import io
 import json
 import logging
@@ -9,16 +10,13 @@
 from typing import Union
 from urllib.parse import urlparse
 
+import pandas as pd
 import requests
 from jsonpath_ng import parse
 from scitree import scitree
 from tqdm import tqdm
-import pandas as pd
-import hashlib
 
-from datahugger.utils import _format_filename
-from datahugger.utils import _get_url
-from datahugger.utils import _is_url
+from datahugger.utils import _format_filename, _get_url, _is_url
 
 
 class DownloadResult:
@@ -207,7 +205,6 @@ def _unpack_single_folder(self, zip_url, output_folder):
                 zip_info.filename = os.path.basename(zip_info.filename)
                 z.extract(zip_info, output_folder)
 
-
     def _check_checksums(self, output_folder, files_info):
         """Will compare the checksum values in the files_info with the checksums
         of the downloaded files and will create a file in a new 'generated'
@@ -224,7 +221,7 @@ def _check_checksums(self, output_folder, files_info):
         """
         try:
             checksums = {}
-
+
             df = pd.DataFrame(files_info)
 
             # loop through the downloaded files in the output_folder
@@ -232,40 +229,40 @@ def _check_checksums(self, output_folder, files_info):
                 logging.info(f"Not using the dirs: {dirs}")
                 for file in files:
                     filepath = os.path.join(subdir, file)
-                    df2 = df[df['name'] == file].reset_index()
+                    df2 = df[df["name"] == file].reset_index()
                     try:
-                        hash = df2['hash'][0]
+                        hash = df2["hash"][0]
                     except Exception as e:
                         logging.info(f"Setting hash to None: {e}")
                         hash = None
                     try:
-                        hash_type = df2['hash_type'][0]
+                        hash_type = df2["hash_type"][0]
                     except Exception as e:
                         logging.info(f"Setting hash_type to None: {e}")
                         hash_type = None
                     newhash = None
                     with open(filepath, "rb") as f:
-                        if hash_type == 'md5':
+                        if hash_type == "md5":
                             newhash = hashlib.md5(f.read()).hexdigest()
-                        if hash_type == 'sha1':
+                        if hash_type == "sha1":
                             newhash = hashlib.sha1(f.read()).hexdigest()
-                        if hash_type == 'sha224':
+                        if hash_type == "sha224":
                             newhash = hashlib.sha224(f.read()).hexdigest()
-                        if hash_type == 'sha256':
+                        if hash_type == "sha256":
                             newhash = hashlib.sha256(f.read()).hexdigest()
-                        if hash_type == 'sha384':
+                        if hash_type == "sha384":
                             newhash = hashlib.sha384(f.read()).hexdigest()
-                        if hash_type == 'sha512':
+                        if hash_type == "sha512":
                             newhash = hashlib.sha512(f.read()).hexdigest()
-                    hash_match = (hash == newhash)
+                    hash_match = hash == newhash
                     if hash is not None and hash_type is not None:
                         status = f"---> Checksum match: {hash_match} - {file}"
                         print(status)
                         logging.info(status)
                         checksums[file] = hash_match
-
+
             try:
-                timestamp = str(time.time()).split('.')[0]
+                timestamp = str(time.time()).split(".")[0]
             except Exception as e:
                 logging.info(f"Setting timestamp to empty string: {e}")
                 timestamp = ""
@@ -331,7 +328,8 @@ def _get_files_recursive(self, url, folder_name=None, base_url=None):
 
         if next_url:
             result.extend(
-                self._get_files_recursive(next_url, folder_name=folder_name)
+                self._get_files_recursive(
+                    next_url, folder_name=folder_name)
             )
 
         return result
@@ -401,8 +399,8 @@ def _get(
             )
         # if checksum==True do checking of checksum
         if self.checksum:
-            self._check_checksums(output_folder=output_folder,
-                                  files_info=files_info)
+            self._check_checksums(
+                output_folder=output_folder, files_info=files_info)
 
     def download(
         self,

From 9ab71392ec5eb79e1b50bbde00302296a3d6e7f0 Mon Sep 17 00:00:00 2001
From: Jonathan de Bruin
Date: Tue, 26 Mar 2024 22:16:49 +0100
Subject: [PATCH 5/5] Happy lint

---
 datahugger/base.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/datahugger/base.py b/datahugger/base.py
index b3f3a9c..f08bfdb 100644
--- a/datahugger/base.py
+++ b/datahugger/base.py
@@ -16,7 +16,9 @@
 from scitree import scitree
 from tqdm import tqdm
 
-from datahugger.utils import _format_filename, _get_url, _is_url
+from datahugger.utils import _format_filename
+from datahugger.utils import _get_url
+from datahugger.utils import _is_url
 
 
 class DownloadResult:
@@ -328,8 +330,7 @@ def _get_files_recursive(self, url, folder_name=None, base_url=None):
 
         if next_url:
             result.extend(
-                self._get_files_recursive(
-                    next_url, folder_name=folder_name)
+                self._get_files_recursive(next_url, folder_name=folder_name)
             )
 
         return result
@@ -399,8 +400,7 @@ def _get(
             )
         # if checksum==True do checking of checksum
         if self.checksum:
-            self._check_checksums(
-                output_folder=output_folder, files_info=files_info)
+            self._check_checksums(output_folder=output_folder, files_info=files_info)
 
     def download(
         self,
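
A note on the hash dispatch: the chain of `if hash_type == ...` branches in `_check_checksums` covers exactly the algorithms that `hashlib` exposes by name, so the same comparison can be written generically with `hashlib.new`. A standalone sketch of that per-file check — an illustration, not code from the patches above; the function name and the chunked reading are assumptions:

    import hashlib

    # hash_type is any algorithm name hashlib.new accepts, e.g.
    # "md5", "sha1", "sha224", "sha256", "sha384", or "sha512".
    def file_hash_matches(filepath, expected_hash, hash_type):
        if expected_hash is None or hash_type is None:
            return False
        h = hashlib.new(hash_type)
        with open(filepath, "rb") as f:
            # read in chunks so large downloads need not fit in memory
            for chunk in iter(lambda: f.read(8192), b""):
                h.update(chunk)
        return h.hexdigest() == expected_hash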