implement checksum checking #71

Closed · wants to merge 2 commits
4 changes: 4 additions & 0 deletions datahugger/__main__.py
@@ -66,6 +66,9 @@ def main():
parser.add_argument("--no-unzip", dest="unzip", action="store_false")
parser.set_defaults(unzip=True)

parser.add_argument("--checksum", dest="checksum", action="store_false")
parser.set_defaults(checksum=False)

parser.add_argument("--no-progress", dest="progress", action="store_false")
parser.set_defaults(progress=True)

@@ -113,6 +116,7 @@ def main():
max_file_size=args.max_file_size,
force_download=args.force_download,
unzip=args.unzip,
checksum=args.checksum,
progress=args.progress,
print_only=args.print_only,
params=args.params,
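With `action="store_true"` the new flag becomes a working opt-in switch (with the original `store_false` and a default of `False`, passing `--checksum` could never enable the feature). A minimal usage sketch, assuming the existing `datahugger <url> <output_folder>` command line; the DOI and folder name are illustrative:

```sh
datahugger 10.5061/dryad.x3ffbg7m8 data --checksum
```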
8 changes: 8 additions & 0 deletions datahugger/api.py
@@ -55,6 +55,7 @@ def info(
max_file_size=None,
force_download=False,
unzip=True,
checksum=False,
progress=True,
print_only=False,
params=None,
@@ -73,6 +74,8 @@
files in the destination folder. Default: False.
unzip: bool
Unzip if the output is a single zip file. Default: True.
checksum: bool
If True, verify the checksum of each downloaded file. Default: False.
progress: bool
Print the progress of the download. Default: True.
print_only: bool
@@ -96,6 +99,7 @@ def info(
max_file_size=max_file_size,
force_download=force_download,
unzip=unzip,
checksum=checksum,
progress=progress,
print_only=print_only,
params=params,
@@ -108,6 +112,7 @@ def get(
max_file_size=None,
force_download=False,
unzip=True,
checksum=False,
progress=True,
print_only=False,
params=None,
@@ -131,6 +136,8 @@
files in the destination folder. Default: False.
unzip: bool
Unzip if the output is a single zip file. Default: True.
checksum: bool
If True, verify the checksum of each downloaded file. Default: False.
progress: bool
Print the progress of the download. Default: True.
print_only: bool
@@ -151,6 +158,7 @@
max_file_size=max_file_size,
force_download=force_download,
unzip=unzip,
checksum=checksum,
progress=progress,
print_only=print_only,
params=params,
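The same option through the Python API, as a minimal sketch (the DOI and output folder are illustrative):

```python
import datahugger

# download a dataset, then verify each file's checksum; results are
# written to <output_folder>/generated/checksums<timestamp>.json
datahugger.get("10.5061/dryad.x3ffbg7m8", "data", checksum=True)
```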
80 changes: 80 additions & 0 deletions datahugger/base.py
@@ -1,7 +1,9 @@
import io
import json
import logging
import os
import re
import time
import zipfile
from pathlib import Path
from typing import Union
@@ -11,6 +13,8 @@
from jsonpath_ng import parse
from scitree import scitree
from tqdm import tqdm
import hashlib

from datahugger.utils import _format_filename
from datahugger.utils import _get_url
@@ -51,6 +55,7 @@ def __init__(
force_download=False,
progress=True,
unzip=True,
checksum=False,
print_only=False,
params=None,
):
@@ -60,6 +65,7 @@ def __init__(
self.force_download = force_download
self.progress = progress
self.unzip = unzip
self.checksum = checksum
self.print_only = print_only
self.params = params

@@ -201,6 +207,74 @@ def _unpack_single_folder(self, zip_url, output_folder):
zip_info.filename = os.path.basename(zip_info.filename)
z.extract(zip_info, output_folder)


def _check_checksums(self, output_folder, files_info):
    """Compare the checksum values in files_info against the checksums
    of the downloaded files and write the results to a file in a new
    'generated' folder.

    Args:
        output_folder (str): folder the files were downloaded to
        files_info (list): metadata on all the files, including the
            expected hash and hash type

    Example file output:
        file name: generated/checksums<timestamp>.json
        file content:
        {"BTCBRL_final.csv": true}
    """
    try:
        checksums = {}

        # index the file metadata by file name for fast lookup
        info_by_name = {f.get("name"): f for f in files_info}

        # loop through the downloaded files in the output_folder
        for subdir, dirs, files in os.walk(output_folder):
            for file in files:
                filepath = os.path.join(subdir, file)
                file_info = info_by_name.get(file, {})
                expected_hash = file_info.get("hash")
                hash_type = file_info.get("hash_type")

                # skip files without usable checksum metadata
                if expected_hash is None or hash_type is None:
                    logging.info(f"No checksum metadata for {file}")
                    continue
                if hash_type not in hashlib.algorithms_available:
                    logging.info(f"Unsupported hash type '{hash_type}' for {file}")
                    continue

                with open(filepath, "rb") as f:
                    computed_hash = hashlib.new(hash_type, f.read()).hexdigest()

                hash_match = expected_hash == computed_hash
                status = f"---> Checksum match: {hash_match} - {file}"
                print(status)
                logging.info(status)
                checksums[file] = hash_match

        timestamp = str(int(time.time()))
        generated_path = os.path.join(output_folder, "generated")
        if not os.path.isdir(generated_path):
            os.mkdir(generated_path)
        with open(os.path.join(generated_path, f"checksums{timestamp}.json"), "w") as f:
            json.dump(checksums, f)
    except Exception as e:
        logging.error(f"Failed at checksum: {e}")

def _pre_files(self):
pass

@@ -312,7 +386,9 @@ def _get(
self._unpack_single_folder(self.files[0]["link"], output_folder)
return

for f in self.files:
self.download_file(
f["link"],
output_folder,
@@ -321,6 +397,10 @@
file_hash=f["hash"],
file_hash_type=f["hash_type"],
)
# verify the checksum of each downloaded file when requested
if self.checksum:
self._check_checksums(output_folder=output_folder, files_info=self.files)

def download(
self,
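The verification above relies on `hashlib.new`, which covers every algorithm the original `if` chain handled explicitly. A standalone sketch of the same check, reading in chunks so large downloads are not held in memory at once (file path and digest are illustrative):

```python
import hashlib

def verify_file(filepath: str, expected_hash: str, hash_type: str) -> bool:
    """Return True if the file's digest matches the expected hash."""
    if hash_type not in hashlib.algorithms_available:
        raise ValueError(f"unsupported hash type: {hash_type}")
    digest = hashlib.new(hash_type)
    with open(filepath, "rb") as f:
        # feed the file to the hash object in 8 KiB chunks
        for chunk in iter(lambda: f.read(8192), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_hash

# illustrative call; the digest shown is the md5 of an empty file
print(verify_file("data/BTCBRL_final.csv", "d41d8cd98f00b204e9800998ecf8427e", "md5"))
```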