Skip to content

Commit

Permalink
Need to get the mtime from the Last-Modified header before checking for skips!
Browse files (browse the repository at this point in the history)
  • Loading branch information
jake authored and jake committed Mar 27, 2024
1 parent 9db7df5 commit a00a601
Showing 1 changed file with 38 additions and 33 deletions.
71 changes: 38 additions & 33 deletions internetarchive/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,32 +219,6 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
raise OSError(f'{destdir} is not a directory!')
file_path = os.path.join(destdir, file_path)

if not return_responses and os.path.exists(file_path.encode('utf-8')):
if ignore_existing:
msg = f'skipping {file_path}, file already exists.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif checksum:
with open(file_path, 'rb') as fp:
md5_sum = utils.get_md5(fp)

if md5_sum == self.md5:
msg = f'skipping {file_path}, file already exists based on checksum.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif not fileobj:
st = os.stat(file_path.encode('utf-8'))
if (st.st_mtime == self.mtime) and (st.st_size == self.size):
msg = f'skipping {file_path}, file already exists based on length and date.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return

parent_dir = os.path.dirname(file_path)
try:
if parent_dir != '' and return_responses is not True:
Expand All @@ -255,8 +229,44 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
timeout=timeout,
auth=self.auth,
params=params)

# Get timestamp from Last-Modified header
dt = parsedate_to_datetime(response.headers['Last-Modified'])
last_mod_mtime = dt.timestamp()

response.raise_for_status()
if return_responses:

# Check if we should skip...
if not return_responses and os.path.exists(file_path.encode('utf-8')):
if ignore_existing:
msg = f'skipping {file_path}, file already exists.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif checksum:
with open(file_path, 'rb') as fp:
md5_sum = utils.get_md5(fp)

if md5_sum == self.md5:
msg = f'skipping {file_path}, file already exists based on checksum.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif not fileobj:
st = os.stat(file_path.encode('utf-8'))
if st.st_mtime == last_mod_mtime:
if self.name == f'{self.identifier}_files.xml' \
or (st.st_size == self.size):
msg = (f'skipping {file_path}, file already exists based on '
'length and date.')
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return

elif return_responses:
return response

if verbose:
Expand Down Expand Up @@ -299,15 +309,10 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
raise exc

# Set mtime with timestamp from Last-Modified header
try:
dt = parsedate_to_datetime(response.headers['Last-Modified'])
mtime = dt.timestamp()
except KeyError:
mtime = 0
if not no_change_timestamp:
# If we want to set the timestamp to that of the original archive...
with suppress(OSError): # Probably file-like object, e.g. sys.stdout.
os.utime(file_path.encode('utf-8'), (0, mtime))
os.utime(file_path.encode('utf-8'), (0,last_mod_mtime))

msg = f'downloaded {self.identifier}/{self.name} to {file_path}'
log.info(msg)
Expand Down

0 comments on commit a00a601

Please sign in to comment.