Skip to content

Commit

Permalink
Adding put files
Browse files Browse the repository at this point in the history
  • Loading branch information
jimtuttle committed Sep 3, 2021
1 parent 305d78f commit 3348fa4
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 31 deletions.
146 changes: 115 additions & 31 deletions s3-replicate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,67 +3,84 @@
import argparse
import pathlib
import logging
from os.path import basename, normpath, exists, join
from os.path import basename, normpath, join
from datetime import datetime
from os import access, W_OK, R_OK
from os import access, W_OK, R_OK, walk
from sys import exit

from hashlib import md5
from deepdiff import DeepDiff as diff
import boto3
from botocore.exceptions import ClientError
from pathlib import Path

def get_arguments():
    """Parse the s3-replicate command-line options.

    Every option string is registered exactly once; argparse raises a
    conflicting-option error if the same flag is added twice.

    Returns:
        argparse.Namespace: with the attributes ``bucket``, ``config``,
        ``directory`` (a ``pathlib.Path``), ``fixity``, ``id``, ``key``,
        ``log``, ``manifest``, ``profile`` and ``verbose``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--bucket', dest='bucket', help='AWS bucket or folder', required=True)
    # The default now matches what the help text promises (it used to be '/tmp',
    # copied from the --log option).
    parser.add_argument('-c', '--config', dest='config',
                        default=str(pathlib.Path.home() / '.aws' / 'config'),
                        help='path to aws config file. E.g., ~/.aws/config. Default is ~/.aws/config')
    parser.add_argument('-d', '--directory', dest='directory', type=pathlib.Path,
                        help='path to digital collections directory. E.g., /some/path', required=True)
    parser.add_argument('-f', '--fixity', dest='fixity', default=False, action='store_true',
                        help='perform fixity validation against manifest')
    # The script's entry point reads args.id and args.key, so both stay required.
    parser.add_argument('-i', '--id', dest='id', help='AWS access key id', required=True)
    parser.add_argument('-k', '--key', dest='key', help='AWS secret access key', required=True)
    parser.add_argument('-l', '--log', dest='log', default='/tmp',
                        help='directory to save logfile. E.g., /some/path. Default is /tmp')
    parser.add_argument('-m', '--manifest', dest='manifest', default='checksums-md5.txt',
                        help='name of manifest file if not "checksums-md5.txt"')
    # Default corrected from 'profile' to 'default', as documented in the help.
    parser.add_argument('-p', '--profile', dest='profile', default='default',
                        help='aws profile name. E.g., default. Default is default.')
    parser.add_argument('-v', '--verbose', dest='verbose', default=False, action='store_true',
                        help='print verbose output to console')
    return parser.parse_args()


def instantiate_logger(logpath, directory, bucket, verbosity=False):
    """Configure root logging to a timestamped file and announce the run.

    The log file is named ``s3-replicate_<dir>_<bucket>_<timestamp>.log``,
    where ``<dir>`` and ``<bucket>`` are the final path components of
    ``directory`` and ``bucket``.

    Args:
        logpath: directory in which the log file is created.
        directory: collections directory being replicated.
        bucket: destination bucket or folder.
        verbosity: when true, also print the start message to the console.
            Defaults to False so three-argument callers keep working.
    """
    timestamp = '{:%Y-%m-%d-%H-%M-%S}'.format(datetime.now())
    dirname = basename(normpath(directory))
    bucketname = basename(normpath(bucket))
    logname = 's3-replicate_{}_{}_{}.log'.format(dirname, bucketname, timestamp)
    logfile = join(logpath, logname)
    logformat = '%(asctime)s - %(levelname)s - %(message)s'
    try:
        logging.basicConfig(filename=logfile, encoding='utf-8', level=logging.INFO,
                            format=logformat)
        print('Logging output to {}'.format(logfile))
    except OSError as err:
        # The log directory may be missing or read-only. Fall back to console
        # logging instead of crashing, so later argument validation can still
        # report the problem.
        logging.basicConfig(level=logging.INFO, format=logformat)
        logging.error('Unable to open log file %s: %s', logfile, err)
    message = 'Replicating files from {} to {}'.format(directory, bucket)
    logging.info(message)
    if verbosity:
        print(message)


def test_arguments(arguments):
    """Validate the parsed arguments and exit with status 1 on a fatal problem.

    Checks, in order: the log directory is writable, the collections
    directory is readable, the manifest is readable (only when fixity
    validation was requested), and the AWS configuration path is readable.
    Every failure is logged and printed; all but the AWS configuration
    check are fatal.

    Args:
        arguments: namespace providing ``log``, ``directory``, ``fixity``,
            ``manifest`` and ``config``.

    Raises:
        SystemExit: with status 1 when a fatal check fails.
    """
    # NOTE: the namespace is deliberately not printed; it holds the AWS secret key.
    error = False
    if not access(arguments.log, W_OK):
        message = 'Log file location: {} not writeable.'.format(arguments.log)
        logging.error(message)
        print(message)
        error = True
    if not access(arguments.directory, R_OK):
        message = 'Collections directory: {} is not readable.'.format(arguments.directory)
        logging.error(message)
        print(message)
        error = True
    if arguments.fixity:
        manifest = join(arguments.directory, arguments.manifest)
        if not access(manifest, R_OK):
            message = 'Manifest file: {} is not readable.'.format(arguments.manifest)
            logging.error(message)
            print(message)
            error = True
    if not access(arguments.config, R_OK):
        # Reported but not fatal, as before.
        message = 'AWS configuration file {} not found or not readable.'.format(arguments.config)
        logging.error(message)
        print(message)
    # todo test AWS config file, aws bucket access
    if error:
        message = 'Exiting due to error condition in passed arguments.'
        logging.error(message)
        print(message)
        # Non-zero status so calling scripts can detect the failure.
        exit(1)


def ignore_file(path):
ignorelist = ('Thumbs.db', '.DS_Store')
def ignore_file(path, ignored):
ignore = False
for ignorename in ignorelist:
for ignorename in ignored:
if ignorename in path:
ignore = True
if ignore:
Expand All @@ -73,26 +90,93 @@ def ignore_file(path):
return False


def get_manifest(manifestfile, workdir, ignored):
    """Read the checksum manifest into a ``{relative path: checksum}`` dict.

    Each non-blank manifest line has the form ``<checksum>,<path>``. Entries
    for which ``ignore_file(path, ignored)`` is true are left out.

    Args:
        manifestfile: manifest file name, relative to ``workdir``.
        workdir: directory containing the manifest.
        ignored: names passed through to ``ignore_file``.

    Returns:
        dict mapping each retained file path to its checksum, both stripped
        of surrounding whitespace.

    Raises:
        ValueError: if a non-blank line contains no comma.
    """
    manifestentries = {}
    linecount = 0
    # A context manager closes the file even if a malformed line raises.
    with open(join(workdir, manifestfile)) as manifestreader:
        for line in manifestreader:
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            linecount += 1
            # Split on the first comma only, so file names containing commas survive.
            checksum, filepath = line.split(',', 1)
            if not ignore_file(filepath, ignored):
                manifestentries[filepath.strip()] = checksum.strip()
    logging.info('Found %s records in manifest file', linecount)
    logging.info('Using %s manifest records after matching ignored files', len(manifestentries))
    return manifestentries


def calculate_hash(p, chunk_size=65536):
    """Return the hexadecimal MD5 digest of the file at ``p``.

    The file is read in binary mode in blocks of ``chunk_size`` bytes, so
    large files are never held in memory whole.

    Args:
        p: path of the file to hash.
        chunk_size: number of bytes read per block; must be at least 1.
            The value does not affect the resulting digest.

    Returns:
        str: the MD5 digest as lowercase hexadecimal.

    Raises:
        ValueError: if ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')
    md5hash = md5()
    with open(p, "rb") as f:
        # iter() with a sentinel keeps reading until read() returns b"" (end of file).
        for byte_block in iter(lambda: f.read(chunk_size), b""):
            md5hash.update(byte_block)
    return md5hash.hexdigest()


def get_filesystem(workdir, verbosity, ignored):
    """Hash every file under ``workdir`` into a ``{relative path: md5}`` dict.

    Files whose bare name appears in ``ignored`` are skipped and counted.
    Progress messages are always logged and, when ``verbosity`` is true,
    also printed.

    Args:
        workdir: root directory to walk recursively.
        verbosity: when true, echo log messages to the console.
        ignored: collection of file names to skip.

    Returns:
        dict mapping each file's path relative to ``workdir`` to the digest
        returned by ``calculate_hash``.
    """
    # Local import: relpath is not part of the module-level os.path import list.
    from os.path import relpath

    message = 'Scanning files at {}'.format(workdir)
    logging.info(message)
    if verbosity:
        print(message)
    fsrecords = {}
    ignoredfiles = 0
    for root, _dirs, files in walk(workdir):
        for f in files:
            fullpath = join(root, f)
            # relpath is safe where splitting on str(workdir) was not: it copes
            # with a trailing separator and with workdir recurring in the path.
            relativepath = relpath(fullpath, workdir)
            if f in ignored:
                ignoredfiles += 1
                message = 'Ignoring file {}'.format(relativepath)
                logging.info(message)
                if verbosity:
                    print(message)
                continue
            fsrecords[relativepath] = calculate_hash(fullpath)
    message = 'Found {} files in {} after ignoring {} files'.format(len(fsrecords), workdir, ignoredfiles)
    logging.info(message)
    if verbosity:
        print(message)
    return fsrecords


def validate_fixity(manifest, directory, verbosity=False, ignored=()):
    """Compare the manifest against the files on disk; exit on any mismatch.

    Args:
        manifest: manifest file name, relative to ``directory``.
        directory: collections directory to validate.
        verbosity: when true, echo progress messages to the console.
        ignored: file names excluded from both sides of the comparison.
            ``verbosity`` and ``ignored`` have defaults so the older
            two-argument call form remains valid.

    Raises:
        SystemExit: with status 1 when manifest and filesystem differ.
    """
    manifestrecords = get_manifest(manifest, directory, ignored)
    fsrecords = get_filesystem(directory, verbosity, ignored)
    if manifestrecords != fsrecords:
        diffs = diff(manifestrecords, fsrecords, ignore_order=True)
        message = 'Exiting due to mismatch. The following differences exist between manifest' \
                  ' and filesystem: {}'.format(diffs)
        logging.error(message)
        print(message)
        # A fixity failure must not look like success to the calling shell.
        exit(1)
    message = 'Filesystem and manifest match.'
    logging.info(message)
    if verbosity:
        print(message)


def put_files(awsid, awskey, workdir, verbosity, bucket):
    """Announce the replication run and set up the AWS client.

    Currently this only logs the start message and calls
    ``set_aws_config``; ``workdir`` is accepted but not yet used.

    Args:
        awsid: AWS access key id (included in the log message).
        awskey: AWS secret access key (never logged).
        workdir: collections directory whose files are to be replicated.
        verbosity: when true, echo the start message to the console.
        bucket: destination bucket or folder.
    """
    # Argument order fixed: the first placeholder is the destination and the
    # second is the user.
    message = 'Initiating file replication to {} as user {}'.format(bucket, awsid)
    logging.info(message)
    if verbosity:
        print(message)
    set_aws_config(awsid, awskey)


def set_aws_config(awsid, awskey):
    """Create an S3 client authenticated with the given access key pair.

    Documentation on creating credentials:
    https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html

    Args:
        awsid: AWS access key id.
        awskey: AWS secret access key.

    Returns:
        The boto3 S3 client. It used to be created and discarded; returning
        it lets callers use it, and callers that ignore the return value
        are unaffected.
    """
    return boto3.client('s3', aws_access_key_id=awsid, aws_secret_access_key=awskey)



if __name__ == "__main__":
    args = get_arguments()
    # The manifest lists checksums for the other files, so it is excluded from
    # the comparison along with OS-generated clutter.
    ignorelist = ('Thumbs.db', '.DS_Store', args.manifest)
    # Logging is configured once, before validation, so validation errors
    # reach the log.
    instantiate_logger(args.log, args.directory, args.bucket, args.verbose)
    test_arguments(args)
    if args.fixity:
        validate_fixity(args.manifest, args.directory, args.verbose, ignorelist)
    put_files(args.id, args.key, args.directory, args.verbose, args.bucket)

Empty file added test/data/Thumbs.db
Empty file.
3 changes: 3 additions & 0 deletions test/data/checksums-md5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
e243bb39c844b3543a7726576c869caf,f1.txt
4349cfeff8e2eb74dffc369bb5fd084e,item1/f2.txt
d41d8cd98f00b204e9800998ecf8427e,Thumbs.db
1 change: 1 addition & 0 deletions test/data/f1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
file 1
1 change: 1 addition & 0 deletions test/data/item1/f2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
file 2

0 comments on commit 3348fa4

Please sign in to comment.