Skip to content

Commit

Permalink
warc/1.1 support! add ability to more easily write WARC/1.1 records (…
Browse files Browse the repository at this point in the history
…addresses #37) (reading already possible)

- use full millis precision for WARC-Date when using WARC/1.1
- timeutils: iso_date_to_datetime() supports parsing millis param
- timeutils: datetime_to_iso_date() supports 'use_millis' param which includes a millis fraction (as part of ISO 8601)
- record_http: pass extra args to base warcwriter, supports 'warc_version' param
- warc version: can be '1.0' or '1.1', converted to 'WARC/1.0' and 'WARC/1.1' respectively
- tests: test warc 1.1 writing directly, through record_http, also add test for utils.open()
- warcwriter: curr_warc_date() returns a second-precision (default) or millis-precision date based on the current WARC version
  • Loading branch information
ikreymer committed Oct 7, 2018
1 parent 3f6e2d7 commit 313ef7f
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 23 deletions.
37 changes: 29 additions & 8 deletions test/test_record_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ class TestRecordHttpBin(object):
def setup_class(cls):
from httpbin import app as httpbin_app

cls.temp_dir = tempfile.mkdtemp('warctest')

server = make_server('localhost', 0, httpbin_app)
addr, cls.port = server.socket.getsockname()

Expand All @@ -37,6 +39,10 @@ def run():
thread.start()
time.sleep(0.1)

@classmethod
def teardown_class(cls):
    """Remove the temporary directory shared by the tests of this class."""
    # Imported locally so this method does not rely on a module-level import.
    import shutil

    # rmtree instead of os.rmdir: a test that fails before reaching its own
    # os.remove() leaves a file behind, and os.rmdir raises OSError on a
    # non-empty directory, adding a second error on top of the real failure.
    shutil.rmtree(cls.temp_dir)

def test_get_no_record(self):
url = 'http://localhost:{0}/get?foo=bar'.format(self.port)
res = requests.get(url, headers={'Host': 'httpbin.org'})
Expand Down Expand Up @@ -156,9 +162,7 @@ def skip_filter(request, response, warc_writer):
assert warc_writer.get_contents() == b''

def test_record_to_temp_file_append(self):
temp_dir = tempfile.mkdtemp('warctest')

full_path = os.path.join(temp_dir, 'example.warc.gz')
full_path = os.path.join(self.temp_dir, 'example.warc.gz')

url = 'http://localhost:{0}/get?foo=bar'.format(self.port)

Expand Down Expand Up @@ -190,12 +194,9 @@ def test_record_to_temp_file_append(self):
assert request.rec_headers['WARC-Target-URI'] == url

os.remove(full_path)
os.rmdir(temp_dir)

def test_error_record_to_temp_file_no_append_no_overwrite(self):
temp_dir = tempfile.mkdtemp('warctest')

full_path = os.path.join(temp_dir, 'example2.warc.gz')
full_path = os.path.join(self.temp_dir, 'example2.warc.gz')

url = 'http://localhost:{0}/get?foo=bar'.format(self.port)

Expand All @@ -207,6 +208,26 @@ def test_error_record_to_temp_file_no_append_no_overwrite(self):
res = requests.get(url)

os.remove(full_path)
os.rmdir(temp_dir)

def test_warc_1_1(self):
    """Record one request as WARC/1.1 and check the version and WARC-Date."""
    full_path = os.path.join(self.temp_dir, 'example3.warc')

    url = 'http://localhost:{0}/get?foo=bar'.format(self.port)

    try:
        with record_http(full_path, append=False, warc_version='1.1', gzip=False):
            requests.get(url)

        with open(full_path, 'rb') as stream:
            # response
            ai = ArchiveIterator(stream)
            response = next(ai)
            assert response.rec_headers.protocol == 'WARC/1.1'
            warc_date = response.rec_headers['WARC-Date']

            # ISO date with fractional millis
            # NOTE(review): datetime.isoformat() omits the fraction when
            # microsecond == 0, so these two checks can presumably fail on a
            # rare exact-second timestamp -- confirm whether that matters.
            assert '.' in warc_date
            assert len(warc_date) == 27
    finally:
        # Clean up even when the request or an assertion fails, so the shared
        # temp directory is left empty for the class-level teardown.
        if os.path.exists(full_path):
            os.remove(full_path)


16 changes: 16 additions & 0 deletions test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import pytest
from collections import Counter
from io import BytesIO
import os
import tempfile

import warcio.utils as utils
from . import get_test_file
Expand Down Expand Up @@ -64,3 +66,17 @@ def test_to_native_str(self):
# not string, leave as is
assert utils.to_native_str(10) == 10

def test_open_exclusive(self):
    """Open a new file with utils.open() in 'xt' mode, then expect OSError
    when the same path is opened in 'xt' mode again."""
    temp_dir = tempfile.mkdtemp('warctest')
    full_name = os.path.join(temp_dir, 'foo.txt')

    try:
        with utils.open(full_name, 'xt') as fh:
            fh.write('test')

        with pytest.raises(OSError):
            with utils.open(full_name, 'xt') as fh:
                fh.write('test')
    finally:
        # Remove the file and directory even if an assertion above fails,
        # so a failing run does not leak temp files.
        if os.path.exists(full_name):
            os.remove(full_name)
        os.rmdir(temp_dir)


26 changes: 24 additions & 2 deletions test/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,11 @@ def _make_warc_id(cls, id_=None):
return '<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>'

@classmethod
def _make_warc_date(cls):
return '2000-01-01T00:00:00Z'
def _make_warc_date(cls, use_millis=False):
if not use_millis:
return '2000-01-01T00:00:00Z'
else:
return '2000-01-01T00:00:00.123456Z'


# ============================================================================
Expand Down Expand Up @@ -566,6 +569,25 @@ def test_request_response_concur(self, is_gzip):
assert resp_id != req_id
assert resp_id == req.rec_headers.get_header('WARC-Concurrent-To')

def test_response_warc_1_1(self, is_gzip):
    """Write one sample response with a WARC/1.1 writer and read it back."""
    writer = BufferWARCWriter(gzip=is_gzip, warc_version='WARC/1.1')
    writer.write_record(sample_response(writer))

    records = [rec for rec in ArchiveIterator(writer.get_stream())]
    assert len(records) == 1

    headers = records[0].rec_headers
    assert headers.protocol == 'WARC/1.1'

    # ISO datetime with fractional millis
    warc_date = headers['WARC-Date']
    assert '.' in warc_date
    assert len(warc_date) == 27

def _conv_to_streaming_record(self, record_buff, rec_type):
# strip-off the two empty \r\n\r\n added at the end of uncompressed record
record_buff = record_buff[:-4]
Expand Down
5 changes: 3 additions & 2 deletions warcio/record_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,11 +169,12 @@ def done(self):
# ============================================================================

@contextmanager
def record_http(warc_writer, filter_func=None, append=True):
def record_http(warc_writer, filter_func=None, append=True,
**kwargs):
out = None
if isinstance(warc_writer, str):
out = open(warc_writer, 'ab' if append else 'xb')
warc_writer = WARCWriter(out)
warc_writer = WARCWriter(out, **kwargs)

try:
recorder = RequestRecorder(warc_writer, filter_func)
Expand Down
43 changes: 38 additions & 5 deletions warcio/timeutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,41 @@
PAD_14_DOWN = '10000101000000'
PAD_14_UP = '29991231235959'
PAD_6_UP = '299912'
PAD_MICRO = '000000'


def iso_date_to_datetime(string):
"""
>>> iso_date_to_datetime('2013-12-26T10:11:12Z')
datetime.datetime(2013, 12, 26, 10, 11, 12)
>>> iso_date_to_datetime('2013-12-26T10:11:12Z')
>>> iso_date_to_datetime('2013-12-26T10:11:12.456789Z')
datetime.datetime(2013, 12, 26, 10, 11, 12, 456789)
>>> iso_date_to_datetime('2013-12-26T10:11:12.30Z')
datetime.datetime(2013, 12, 26, 10, 11, 12, 300000)
>>> iso_date_to_datetime('2013-12-26T10:11:12.00001Z')
datetime.datetime(2013, 12, 26, 10, 11, 12, 10)
>>> iso_date_to_datetime('2013-12-26T10:11:12.000001Z')
datetime.datetime(2013, 12, 26, 10, 11, 12, 1)
>>> iso_date_to_datetime('2013-12-26T10:11:12.0000001Z')
datetime.datetime(2013, 12, 26, 10, 11, 12)
"""
>>> iso_date_to_datetime('2013-12-26T10:11:12.000000Z')
datetime.datetime(2013, 12, 26, 10, 11, 12)
"""

nums = DATE_TIMESPLIT.split(string)
if nums[-1] == '':
nums = nums[:-1]

if len(nums) == 7:
nums[6] = nums[6][:6]
nums[6] += PAD_MICRO[len(nums[6]):]

the_datetime = datetime.datetime(*(int(num) for num in nums))
return the_datetime

Expand Down Expand Up @@ -65,16 +85,29 @@ def datetime_to_http_date(the_datetime):
usegmt=True)


def datetime_to_iso_date(the_datetime):
def datetime_to_iso_date(the_datetime, use_millis=False):
"""
>>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12))
'2013-12-26T10:11:12Z'
>>> datetime_to_iso_date( datetime.datetime(2013, 12, 26, 10, 11, 12))
>>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12, 456789))
'2013-12-26T10:11:12Z'
>>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12), use_millis=True)
'2013-12-26T10:11:12Z'
>>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12, 456789), use_millis=True)
'2013-12-26T10:11:12.456789Z'
>>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12, 1), use_millis=True)
'2013-12-26T10:11:12.000001Z'
"""

return the_datetime.strftime(ISO_DT)
if not use_millis:
return the_datetime.strftime(ISO_DT)
else:
return the_datetime.isoformat() + 'Z'


def datetime_to_timestamp(the_datetime):
Expand Down
30 changes: 24 additions & 6 deletions warcio/warcwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ class BaseWARCWriter(object):

REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/identical-payload-digest'

WARC_VERSION = 'WARC/1.0'
WARC_1_0 = 'WARC/1.0'
WARC_1_1 = 'WARC/1.1'

# default warc version
WARC_VERSION = WARC_1_0

NO_PAYLOAD_DIGEST_TYPES = ('warcinfo', 'revisit')
NO_BLOCK_DIGEST_TYPES = ('warcinfo')
Expand All @@ -38,9 +42,19 @@ def __init__(self, gzip=True, *args, **kwargs):

self.parser = StatusAndHeadersParser([], verify=False)

self.warc_version = kwargs.get('warc_version', self.WARC_VERSION)
self.warc_version = self._parse_warc_version(kwargs.get('warc_version'))
self.header_filter = kwargs.get('header_filter')

def _parse_warc_version(self, version):
if not version:
return self.WARC_VERSION

version = str(version)
if version.startswith('WARC/'):
return version

return 'WARC/' + version

@classmethod
def _iter_stream(cls, stream):
while True:
Expand Down Expand Up @@ -134,7 +148,7 @@ def create_warcinfo_record(self, filename, info):
warc_headers.add_header('WARC-Record-ID', self._make_warc_id())
if filename:
warc_headers.add_header('WARC-Filename', filename)
warc_headers.add_header('WARC-Date', self._make_warc_date())
warc_headers.add_header('WARC-Date', self.curr_warc_date())

warcinfo = BytesIO()
for name, value in six.iteritems(info):
Expand Down Expand Up @@ -217,7 +231,7 @@ def _init_warc_headers(self, uri, record_type, warc_headers_dict):
warc_headers.replace_header('WARC-Target-URI', uri)

if not warc_headers.get_header('WARC-Date'):
warc_headers.add_header('WARC-Date', self._make_warc_date())
warc_headers.add_header('WARC-Date', self.curr_warc_date())

return warc_headers

Expand Down Expand Up @@ -294,13 +308,17 @@ def _write_warc_record(self, out, record):

out.flush()

def curr_warc_date(self):
    """Return a WARC-Date value suited to this writer's WARC version.

    Fractional seconds are requested from ``_make_warc_date`` when
    ``self.warc_version`` compares greater than or equal to ``WARC_1_1``.
    """
    # Both operands are 'WARC/x.y' strings here, so this is a plain
    # string comparison.
    return self._make_warc_date(use_millis=self.WARC_1_1 <= self.warc_version)

@classmethod
def _make_warc_id(cls):
return StatusAndHeadersParser.make_warc_id()

@classmethod
def _make_warc_date(cls):
return datetime_to_iso_date(datetime.datetime.utcnow())
def _make_warc_date(cls, use_millis=False):
return datetime_to_iso_date(datetime.datetime.utcnow(), use_millis=use_millis)

@classmethod
def _create_temp_file(cls):
Expand Down

0 comments on commit 313ef7f

Please sign in to comment.