From 313ef7f56edd64a3e460927d6fe9f95febb7cebf Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 6 Oct 2018 22:57:54 -0700 Subject: [PATCH] warc/1.1 support! add ability to more easily write WARC/1.1 records (addresses #37) (reading already possible) - use full millis precision for WARC-Date when using WARC/1.1 - timeutils: iso_date_to_datetime() supports parsing millis param - timeutils: datetime_to_iso_date() supports 'use_millis' param which includes a millis fraction (as part of ISO 8601) - record_http: pass extra args to base warcwriter, supports 'warc_version' param - warc version: can be '1.0' or '1.1', converted to 'WARC/1.0' and 'WARC/1.1' respectively - tests: test warc 1.1 writing directly, through record_http, also add test for utils.open() - warcwriter: curr_warc_date() returns second precision (default) or millis precision based on current WARC version --- test/test_record_http.py | 37 ++++++++++++++++++++++++++-------- test/test_utils.py | 16 +++++++++++++++ test/test_writer.py | 26 ++++++++++++++++++++++-- warcio/record_http.py | 5 +++-- warcio/timeutils.py | 43 +++++++++++++++++++++++++++++++++++----- warcio/warcwriter.py | 30 ++++++++++++++++++++++------ 6 files changed, 134 insertions(+), 23 deletions(-) diff --git a/test/test_record_http.py b/test/test_record_http.py index 68ce994c..1c2198af 100644 --- a/test/test_record_http.py +++ b/test/test_record_http.py @@ -23,6 +23,8 @@ class TestRecordHttpBin(object): def setup_class(cls): from httpbin import app as httpbin_app + cls.temp_dir = tempfile.mkdtemp('warctest') + server = make_server('localhost', 0, httpbin_app) addr, cls.port = server.socket.getsockname() @@ -37,6 +39,10 @@ def run(): thread.start() time.sleep(0.1) + @classmethod + def teardown_class(cls): + os.rmdir(cls.temp_dir) + def test_get_no_record(self): url = 'http://localhost:{0}/get?foo=bar'.format(self.port) res = requests.get(url, headers={'Host': 'httpbin.org'}) @@ -156,9 +162,7 @@ def skip_filter(request, response, 
warc_writer): assert warc_writer.get_contents() == b'' def test_record_to_temp_file_append(self): - temp_dir = tempfile.mkdtemp('warctest') - - full_path = os.path.join(temp_dir, 'example.warc.gz') + full_path = os.path.join(self.temp_dir, 'example.warc.gz') url = 'http://localhost:{0}/get?foo=bar'.format(self.port) @@ -190,12 +194,9 @@ def test_record_to_temp_file_append(self): assert request.rec_headers['WARC-Target-URI'] == url os.remove(full_path) - os.rmdir(temp_dir) def test_error_record_to_temp_file_no_append_no_overwrite(self): - temp_dir = tempfile.mkdtemp('warctest') - - full_path = os.path.join(temp_dir, 'example2.warc.gz') + full_path = os.path.join(self.temp_dir, 'example2.warc.gz') url = 'http://localhost:{0}/get?foo=bar'.format(self.port) @@ -207,6 +208,26 @@ def test_error_record_to_temp_file_no_append_no_overwrite(self): res = requests.get(url) os.remove(full_path) - os.rmdir(temp_dir) + + def test_warc_1_1(self): + full_path = os.path.join(self.temp_dir, 'example3.warc') + + url = 'http://localhost:{0}/get?foo=bar'.format(self.port) + + with record_http(full_path, append=False, warc_version='1.1', gzip=False): + res = requests.get(url) + + with open(full_path, 'rb') as stream: + # response + ai = ArchiveIterator(stream) + response = next(ai) + assert response.rec_headers.protocol == 'WARC/1.1' + warc_date = response.rec_headers['WARC-Date'] + + # ISO date with fractional millis + assert '.' in warc_date + assert len(warc_date) == 27 + + os.remove(full_path) diff --git a/test/test_utils.py b/test/test_utils.py index 9ad26a08..8ed6f08f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -2,6 +2,8 @@ import pytest from collections import Counter from io import BytesIO +import os +import tempfile import warcio.utils as utils from . 
import get_test_file @@ -64,3 +66,17 @@ def test_to_native_str(self): # not string, leave as is assert utils.to_native_str(10) == 10 + def test_open_exclusive(self): + temp_dir = tempfile.mkdtemp('warctest') + full_name = os.path.join(temp_dir, 'foo.txt') + with utils.open(full_name, 'xt') as fh: + fh.write('test') + + with pytest.raises(OSError): + with utils.open(full_name, 'xt') as fh: + fh.write('test') + + os.remove(full_name) + os.rmdir(temp_dir) + + diff --git a/test/test_writer.py b/test/test_writer.py index 2b943c0f..d5b99351 100644 --- a/test/test_writer.py +++ b/test/test_writer.py @@ -21,8 +21,11 @@ def _make_warc_id(cls, id_=None): return '' @classmethod - def _make_warc_date(cls): - return '2000-01-01T00:00:00Z' + def _make_warc_date(cls, use_millis=False): + if not use_millis: + return '2000-01-01T00:00:00Z' + else: + return '2000-01-01T00:00:00.123456Z' # ============================================================================ @@ -566,6 +569,25 @@ def test_request_response_concur(self, is_gzip): assert resp_id != req_id assert resp_id == req.rec_headers.get_header('WARC-Concurrent-To') + def test_response_warc_1_1(self, is_gzip): + writer = BufferWARCWriter(gzip=is_gzip, warc_version='WARC/1.1') + + resp = sample_response(writer) + + writer.write_record(resp) + + stream = writer.get_stream() + + reader = ArchiveIterator(stream) + recs = list(reader) + + assert len(recs) == 1 + assert recs[0].rec_headers.protocol == 'WARC/1.1' + + # ISO datetime with fractional millis + assert '.' 
in recs[0].rec_headers['WARC-Date'] + assert len(recs[0].rec_headers['WARC-Date']) == 27 + def _conv_to_streaming_record(self, record_buff, rec_type): # strip-off the two empty \r\n\r\n added at the end of uncompressed record record_buff = record_buff[:-4] diff --git a/warcio/record_http.py b/warcio/record_http.py index 39426490..c6cdac2a 100644 --- a/warcio/record_http.py +++ b/warcio/record_http.py @@ -169,11 +169,12 @@ def done(self): # ============================================================================ @contextmanager -def record_http(warc_writer, filter_func=None, append=True): +def record_http(warc_writer, filter_func=None, append=True, + **kwargs): out = None if isinstance(warc_writer, str): out = open(warc_writer, 'ab' if append else 'xb') - warc_writer = WARCWriter(out) + warc_writer = WARCWriter(out, **kwargs) try: recorder = RequestRecorder(warc_writer, filter_func) diff --git a/warcio/timeutils.py b/warcio/timeutils.py index e52644be..20af2493 100644 --- a/warcio/timeutils.py +++ b/warcio/timeutils.py @@ -22,6 +22,7 @@ PAD_14_DOWN = '10000101000000' PAD_14_UP = '29991231235959' PAD_6_UP = '299912' +PAD_MICRO = '000000' def iso_date_to_datetime(string): @@ -29,14 +30,33 @@ def iso_date_to_datetime(string): >>> iso_date_to_datetime('2013-12-26T10:11:12Z') datetime.datetime(2013, 12, 26, 10, 11, 12) - >>> iso_date_to_datetime('2013-12-26T10:11:12Z') + >>> iso_date_to_datetime('2013-12-26T10:11:12.456789Z') + datetime.datetime(2013, 12, 26, 10, 11, 12, 456789) + + >>> iso_date_to_datetime('2013-12-26T10:11:12.30Z') + datetime.datetime(2013, 12, 26, 10, 11, 12, 300000) + + >>> iso_date_to_datetime('2013-12-26T10:11:12.00001Z') + datetime.datetime(2013, 12, 26, 10, 11, 12, 10) + + >>> iso_date_to_datetime('2013-12-26T10:11:12.000001Z') + datetime.datetime(2013, 12, 26, 10, 11, 12, 1) + + >>> iso_date_to_datetime('2013-12-26T10:11:12.0000001Z') datetime.datetime(2013, 12, 26, 10, 11, 12) - """ + + >>> 
iso_date_to_datetime('2013-12-26T10:11:12.000000Z') + datetime.datetime(2013, 12, 26, 10, 11, 12) + """ nums = DATE_TIMESPLIT.split(string) if nums[-1] == '': nums = nums[:-1] + if len(nums) == 7: + nums[6] = nums[6][:6] + nums[6] += PAD_MICRO[len(nums[6]):] + the_datetime = datetime.datetime(*(int(num) for num in nums)) return the_datetime @@ -65,16 +85,29 @@ def datetime_to_http_date(the_datetime): usegmt=True) -def datetime_to_iso_date(the_datetime): +def datetime_to_iso_date(the_datetime, use_millis=False): """ >>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12)) '2013-12-26T10:11:12Z' - >>> datetime_to_iso_date( datetime.datetime(2013, 12, 26, 10, 11, 12)) + >>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12, 456789)) + '2013-12-26T10:11:12Z' + + >>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12), use_millis=True) '2013-12-26T10:11:12Z' + + >>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12, 456789), use_millis=True) + '2013-12-26T10:11:12.456789Z' + + >>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12, 1), use_millis=True) + '2013-12-26T10:11:12.000001Z' + """ - return the_datetime.strftime(ISO_DT) + if not use_millis: + return the_datetime.strftime(ISO_DT) + else: + return the_datetime.isoformat() + 'Z' def datetime_to_timestamp(the_datetime): diff --git a/warcio/warcwriter.py b/warcio/warcwriter.py index 42b8d9ec..b940c0b9 100644 --- a/warcio/warcwriter.py +++ b/warcio/warcwriter.py @@ -27,7 +27,11 @@ class BaseWARCWriter(object): REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/identical-payload-digest' - WARC_VERSION = 'WARC/1.0' + WARC_1_0 = 'WARC/1.0' + WARC_1_1 = 'WARC/1.1' + + # default warc version + WARC_VERSION = WARC_1_0 NO_PAYLOAD_DIGEST_TYPES = ('warcinfo', 'revisit') NO_BLOCK_DIGEST_TYPES = ('warcinfo') @@ -38,9 +42,19 @@ def __init__(self, gzip=True, *args, **kwargs): self.parser = StatusAndHeadersParser([], verify=False) - self.warc_version = 
kwargs.get('warc_version', self.WARC_VERSION) + self.warc_version = self._parse_warc_version(kwargs.get('warc_version')) self.header_filter = kwargs.get('header_filter') + def _parse_warc_version(self, version): + if not version: + return self.WARC_VERSION + + version = str(version) + if version.startswith('WARC/'): + return version + + return 'WARC/' + version + @classmethod def _iter_stream(cls, stream): while True: @@ -134,7 +148,7 @@ def create_warcinfo_record(self, filename, info): warc_headers.add_header('WARC-Record-ID', self._make_warc_id()) if filename: warc_headers.add_header('WARC-Filename', filename) - warc_headers.add_header('WARC-Date', self._make_warc_date()) + warc_headers.add_header('WARC-Date', self.curr_warc_date()) warcinfo = BytesIO() for name, value in six.iteritems(info): @@ -217,7 +231,7 @@ def _init_warc_headers(self, uri, record_type, warc_headers_dict): warc_headers.replace_header('WARC-Target-URI', uri) if not warc_headers.get_header('WARC-Date'): - warc_headers.add_header('WARC-Date', self._make_warc_date()) + warc_headers.add_header('WARC-Date', self.curr_warc_date()) return warc_headers @@ -294,13 +308,17 @@ def _write_warc_record(self, out, record): out.flush() + def curr_warc_date(self): + use_millis = (self.warc_version >= self.WARC_1_1) + return self._make_warc_date(use_millis=use_millis) + @classmethod def _make_warc_id(cls): return StatusAndHeadersParser.make_warc_id() @classmethod - def _make_warc_date(cls): - return datetime_to_iso_date(datetime.datetime.utcnow()) + def _make_warc_date(cls, use_millis=False): + return datetime_to_iso_date(datetime.datetime.utcnow(), use_millis=use_millis) @classmethod def _create_temp_file(cls):