implement checksum checking
Dave Tromp committed Jan 17, 2024
1 parent 338781b commit 7e75482
Showing 3 changed files with 89 additions and 0 deletions.
4 changes: 4 additions & 0 deletions datahugger/__main__.py
@@ -66,6 +66,9 @@ def main():
parser.add_argument("--no-unzip", dest="unzip", action="store_false")
parser.set_defaults(unzip=True)

parser.add_argument("--checksum", dest="checksum", action="store_false")
parser.set_defaults(checksum=False)

parser.add_argument("--no-progress", dest="progress", action="store_false")
parser.set_defaults(progress=True)

@@ -113,6 +116,7 @@ def main():
max_file_size=args.max_file_size,
force_download=args.force_download,
unzip=args.unzip,
checksum=args.checksum,
progress=args.progress,
print_only=args.print_only,
params=args.params,
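The new flag is off by default and is enabled from the command line. A hypothetical invocation, assuming the URL and output-folder positional arguments the datahugger CLI already defines (the Dryad DOI is only an example dataset):

datahugger https://doi.org/10.5061/dryad.x3ffbg7m8 data --checksum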
8 changes: 8 additions & 0 deletions datahugger/api.py
@@ -55,6 +55,7 @@ def info(
max_file_size=None,
force_download=False,
unzip=True,
checksum=False,
progress=True,
print_only=False,
params=None,
@@ -73,6 +74,8 @@
files in the destination folder. Default: False.
unzip: bool
Unzip if the output is a single zip file. Default: True.
checksum: bool
If True, verify the checksums of the downloaded files. Default: False.
progress: bool
Print the progress of the download. Default: True.
print_only: bool
@@ -96,6 +99,7 @@ def info(
max_file_size=max_file_size,
force_download=force_download,
unzip=unzip,
checksum=checksum,
progress=progress,
print_only=print_only,
params=params,
@@ -108,6 +112,7 @@ def get(
max_file_size=None,
force_download=False,
unzip=True,
checksum=False,
progress=True,
print_only=False,
params=None,
@@ -131,6 +136,8 @@
files in the destination folder. Default: False.
unzip: bool
Unzip if the output is a single zip file. Default: True.
checksum: bool
If True, verify the checksums of the downloaded files. Default: False.
progress: bool
Print the progress of the download. Default: True.
print_only: bool
@@ -151,6 +158,7 @@
max_file_size=max_file_size,
force_download=force_download,
unzip=unzip,
checksum=checksum,
progress=progress,
print_only=print_only,
params=params,
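The same switch is exposed through the Python API, where `checksum` threads from `get()` into the downloader. A minimal sketch of enabling it, assuming only the `get(url, output_folder, ...)` signature shown above (the DOI and folder name are placeholders):

import datahugger

# download a dataset and, once the files are on disk, verify their
# checksums against the hashes reported by the repository
datahugger.get("https://doi.org/10.5061/dryad.x3ffbg7m8", "data", checksum=True)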
77 changes: 77 additions & 0 deletions datahugger/base.py
@@ -1,4 +1,5 @@
import io
import json
import logging
import os
import re
@@ -11,6 +12,8 @@
from jsonpath_ng import parse
from scitree import scitree
from tqdm import tqdm
import hashlib
import pandas as pd

from datahugger.utils import _format_filename
from datahugger.utils import _get_url
@@ -51,6 +54,7 @@ def __init__(
force_download=False,
progress=True,
unzip=True,
checksum=False,
print_only=False,
params=None,
):
@@ -60,6 +64,7 @@
self.force_download = force_download
self.progress = progress
self.unzip = unzip
self.checksum = checksum
self.print_only = print_only
self.params = params

@@ -201,6 +206,72 @@ def _unpack_single_folder(self, zip_url, output_folder):
zip_info.filename = os.path.basename(zip_info.filename)
z.extract(zip_info, output_folder)


def _check_checksums(self, output_folder, files_info):
    """Compare the checksum values in files_info with the checksums of
    the downloaded files and write the results to a file in a new
    'generated' folder.

    Args:
        output_folder (str): folder the files were downloaded to
        files_info (list): information on all the files

    Example file output:
        file name: generated/checksums<timestamp>.json
        file content:
        {"BTCBRL_final.csv": true}
    """
    try:
        checksums = {}

        df = pd.DataFrame(files_info)

        # loop through the downloaded files in the output_folder
        for subdir, dirs, files in os.walk(output_folder):
            for file in files:
                filepath = os.path.join(subdir, file)
                df2 = df[df["name"] == file].reset_index()
                try:
                    expected_hash = df2["hash"][0]
                except (KeyError, IndexError):
                    expected_hash = None
                try:
                    hash_type = df2["hash_type"][0]
                except (KeyError, IndexError):
                    hash_type = None
                if expected_hash is None or hash_type is None:
                    continue
                computed_hash = None
                with open(filepath, "rb") as f:
                    # hashlib.new covers md5, sha1, sha224, sha256, sha384
                    # and sha512 without needing a branch per algorithm
                    if hash_type in hashlib.algorithms_available:
                        computed_hash = hashlib.new(hash_type, f.read()).hexdigest()
                hash_match = expected_hash == computed_hash
                status = f"---> Checksum match: {hash_match} - {file}"
                print(status)
                logging.info(status)
                checksums[file] = hash_match

        timestamp = str(int(time.time()))
        generated_path = os.path.join(output_folder, "generated")
        if not os.path.isdir(generated_path):
            os.mkdir(generated_path)
        with open(os.path.join(generated_path, f"checksums{timestamp}.json"), "w") as f:
            json.dump(checksums, f)
    except Exception as e:
        logging.error(f"Failed at checksum: {e}")

def _pre_files(self):
pass

@@ -312,7 +383,9 @@ def _get(
self._unpack_single_folder(self.files[0]["link"], output_folder)
return

files_info = list(self.files)
for f in self.files:
self.download_file(
f["link"],
output_folder,
@@ -321,6 +394,10 @@
file_hash=f["hash"],
file_hash_type=f["hash_type"],
)
# verify the checksums of the downloaded files if requested
if self.checksum:
    self._check_checksums(output_folder=output_folder, files_info=files_info)

def download(
self,
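Stripped of the bookkeeping, each file check hashes the downloaded bytes with the algorithm the repository reported and compares hex digests; the results land in generated/checksums<timestamp>.json under the output folder. A standalone sketch of both halves, assuming a hash type that hashlib supports and a data/ output folder (both placeholders):

import glob
import hashlib
import json

def checksum_matches(path, expected_hash, hash_type):
    # hash the file with the reported algorithm and compare
    # its hex digest with the repository-supplied value
    with open(path, "rb") as f:
        return hashlib.new(hash_type, f.read()).hexdigest() == expected_hash

# inspect the most recent report written by _check_checksums
latest = sorted(glob.glob("data/generated/checksums*.json"))[-1]
with open(latest) as f:
    print(json.load(f))  # e.g. {"BTCBRL_final.csv": true}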
