Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WARC 1.1 timestamp precision support #46

Merged
merged 3 commits into from
Oct 9, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 49 additions & 10 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,13 @@ Format <https://en.wikipedia.org/wiki/Web_ARChive>`__ commonly used in
web archives. Supports Python 2.7+ and Python 3.3+ (using
`six <https://pythonhosted.org/six/>`__, the only external dependency)

warcio supports reading and writing of WARC files compliant with both the `WARC 1.0 <http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf>`__
and `WARC 1.1 <http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1-1_latestdraft.pdf>`__ ISO standards.

Install with: ``pip install warcio``

This library is a spin-off of the WARC reading and writing component of
the `pywb <https://github.com/ikreymer/pywb>`__ high-fidelity replay
the `pywb <https://github.com/webrecorder/pywb>`__ high-fidelity replay
library, a key component of
`Webrecorder <https://github.com/webrecorder/webrecorder>`__

Expand All @@ -28,11 +31,13 @@ Reading WARC Records
--------------------

A key feature of the library is to be able to iterate over a stream of
WARC records using the ``ArchiveIterator``
WARC records using the ``ArchiveIterator``.

It includes the following features:

It includes the following features: - Reading a WARC/ARC stream - On the
fly ARC to WARC record conversion - Decompressing and de-chunking HTTP
payload content stored in WARC/ARC files.
- Reading a WARC 1.0, WARC 1.1 or ARC stream
- On the fly ARC to WARC record conversion
- Decompressing and de-chunking HTTP payload content stored in WARC/ARC files.

For example, the following prints the URL for each WARC ``response``
record:
Expand Down Expand Up @@ -146,10 +151,8 @@ The WARC ``example.warc.gz`` will contain two records (the response is written f
Customizing WARC Writing
~~~~~~~~~~~~~~~~~~~~~~~~

The library provides a simple and extensible interface for writing WARC
records conformant to WARC 1.0 ISO standard
`(see draft) <http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf>`__
Parts of WARC 1.1 spec are also being implemented.
The library provides a simple and extensible interface for writing
standards-compliant WARC files.

The library comes with a basic ``WARCWriter`` class for writing to a
single WARC file and ``BufferWARCWriter`` for writing to an in-memory
Expand All @@ -171,8 +174,44 @@ the above example can be written as:
warc_writer = WARCWriter(fh)
with record_http(warc_writer):
requests.get('https://example.com/')
requests.get('http://example.com/abc')

WARC/1.1 Support
~~~~~~~~~~~~~~~~

By default, warcio creates WARC 1.0 records for maximum compatibility with existing tools.
To create WARC/1.1 records, simply specify the warc version as follows:

.. code:: python

with record_http('example.warc.gz', warc_version='1.1'):
...


.. code:: python

WARCWriter(fh, warc_version='1.1')
...

When using WARC 1.1, the main difference is that the ``WARC-Date`` timestamp header
will be written with microsecond precision, while WARC 1.0 only supports second precision.

WARC 1.0:

.. code::

WARC/1.0
...
WARC-Date: 2018-12-26T10:11:12Z

WARC 1.1:

.. code::

WARC/1.1
...
WARC-Date: 2018-12-26T10:11:12.456789Z



Filtering Recording
~~~~~~~~~~~~~~~~~~~
Expand Down
37 changes: 29 additions & 8 deletions test/test_record_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ class TestRecordHttpBin(object):
def setup_class(cls):
from httpbin import app as httpbin_app

cls.temp_dir = tempfile.mkdtemp('warctest')

server = make_server('localhost', 0, httpbin_app)
addr, cls.port = server.socket.getsockname()

Expand All @@ -37,6 +39,10 @@ def run():
thread.start()
time.sleep(0.1)

@classmethod
def teardown_class(cls):
    """Delete the class-wide temporary directory stored in ``cls.temp_dir``.

    The whole tree is removed rather than calling ``os.rmdir``: if a test
    fails before it deletes its own output file, ``os.rmdir`` would raise on
    the non-empty directory, leaking it and reporting a second, unrelated
    error on top of the real test failure.
    """
    # Local import: only this cleanup step needs shutil.
    import shutil

    # Best-effort cleanup; a failure to delete scratch files should not
    # turn an otherwise passing test class into an error.
    shutil.rmtree(cls.temp_dir, ignore_errors=True)

def test_get_no_record(self):
url = 'http://localhost:{0}/get?foo=bar'.format(self.port)
res = requests.get(url, headers={'Host': 'httpbin.org'})
Expand Down Expand Up @@ -156,9 +162,7 @@ def skip_filter(request, response, warc_writer):
assert warc_writer.get_contents() == b''

def test_record_to_temp_file_append(self):
temp_dir = tempfile.mkdtemp('warctest')

full_path = os.path.join(temp_dir, 'example.warc.gz')
full_path = os.path.join(self.temp_dir, 'example.warc.gz')

url = 'http://localhost:{0}/get?foo=bar'.format(self.port)

Expand Down Expand Up @@ -190,12 +194,9 @@ def test_record_to_temp_file_append(self):
assert request.rec_headers['WARC-Target-URI'] == url

os.remove(full_path)
os.rmdir(temp_dir)

def test_error_record_to_temp_file_no_append_no_overwrite(self):
temp_dir = tempfile.mkdtemp('warctest')

full_path = os.path.join(temp_dir, 'example2.warc.gz')
full_path = os.path.join(self.temp_dir, 'example2.warc.gz')

url = 'http://localhost:{0}/get?foo=bar'.format(self.port)

Expand All @@ -207,6 +208,26 @@ def test_error_record_to_temp_file_no_append_no_overwrite(self):
res = requests.get(url)

os.remove(full_path)
os.rmdir(temp_dir)

def test_warc_1_1(self):
    """Record one request as uncompressed WARC/1.1 and check the first record.

    The first record read back must carry the ``WARC/1.1`` protocol and a
    ``WARC-Date`` with fractional seconds.
    """
    full_path = os.path.join(self.temp_dir, 'example3.warc')

    url = 'http://localhost:{0}/get?foo=bar'.format(self.port)

    try:
        with record_http(full_path, append=False, warc_version='1.1', gzip=False):
            requests.get(url)

        with open(full_path, 'rb') as stream:
            # response
            ai = ArchiveIterator(stream)
            response = next(ai)
            assert response.rec_headers.protocol == 'WARC/1.1'
            warc_date = response.rec_headers['WARC-Date']

            # ISO 8601 date with fractional seconds (microseconds):
            # 'YYYY-MM-DDTHH:MM:SS.ffffffZ' is 27 characters long
            assert '.' in warc_date
            assert len(warc_date) == 27
    finally:
        # Always clean up, so a failed assertion does not leave a file
        # behind in the shared temp directory.
        if os.path.exists(full_path):
            os.remove(full_path)


16 changes: 16 additions & 0 deletions test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import pytest
from collections import Counter
from io import BytesIO
import os
import tempfile

import warcio.utils as utils
from . import get_test_file
Expand Down Expand Up @@ -64,3 +66,17 @@ def test_to_native_str(self):
# not string, leave as is
assert utils.to_native_str(10) == 10

def test_open_exclusive(self):
    """Check that ``utils.open`` in ``'xt'`` mode refuses an existing file.

    The first open creates the file; the second open of the same path in
    exclusive mode is expected to raise ``OSError``.
    """
    temp_dir = tempfile.mkdtemp('warctest')
    full_name = os.path.join(temp_dir, 'foo.txt')
    try:
        with utils.open(full_name, 'xt') as fh:
            fh.write('test')

        with pytest.raises(OSError):
            with utils.open(full_name, 'xt') as fh:
                fh.write('test')
    finally:
        # Clean up even when the test fails, so no temp files are leaked.
        if os.path.exists(full_name):
            os.remove(full_name)
        os.rmdir(temp_dir)


26 changes: 24 additions & 2 deletions test/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,11 @@ def _make_warc_id(cls, id_=None):
return '<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>'

@classmethod
def _make_warc_date(cls):
return '2000-01-01T00:00:00Z'
def _make_warc_date(cls, use_micros=False):
if not use_micros:
return '2000-01-01T00:00:00Z'
else:
return '2000-01-01T00:00:00.123456Z'


# ============================================================================
Expand Down Expand Up @@ -566,6 +569,25 @@ def test_request_response_concur(self, is_gzip):
assert resp_id != req_id
assert resp_id == req.rec_headers.get_header('WARC-Concurrent-To')

def test_response_warc_1_1(self, is_gzip):
    """Write a sample response with a WARC/1.1 writer and read it back.

    Exactly one record is expected, with the ``WARC/1.1`` protocol and a
    ``WARC-Date`` that includes fractional seconds.
    """
    warc_writer = BufferWARCWriter(gzip=is_gzip, warc_version='WARC/1.1')
    warc_writer.write_record(sample_response(warc_writer))

    records = list(ArchiveIterator(warc_writer.get_stream()))
    assert len(records) == 1

    headers = records[0].rec_headers
    assert headers.protocol == 'WARC/1.1'

    # ISO 8601 date with fractional seconds (microseconds)
    assert '.' in headers['WARC-Date']
    assert len(headers['WARC-Date']) == 27

def _conv_to_streaming_record(self, record_buff, rec_type):
# strip-off the two empty \r\n\r\n added at the end of uncompressed record
record_buff = record_buff[:-4]
Expand Down
5 changes: 3 additions & 2 deletions warcio/record_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,11 +169,12 @@ def done(self):
# ============================================================================

@contextmanager
def record_http(warc_writer, filter_func=None, append=True):
def record_http(warc_writer, filter_func=None, append=True,
**kwargs):
out = None
if isinstance(warc_writer, str):
out = open(warc_writer, 'ab' if append else 'xb')
warc_writer = WARCWriter(out)
warc_writer = WARCWriter(out, **kwargs)

try:
recorder = RequestRecorder(warc_writer, filter_func)
Expand Down
43 changes: 38 additions & 5 deletions warcio/timeutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,41 @@
PAD_14_DOWN = '10000101000000'
PAD_14_UP = '29991231235959'
PAD_6_UP = '299912'
PAD_MICRO = '000000'


def iso_date_to_datetime(string):
    """Parse an ISO 8601 UTC date string into a naive ``datetime``.

    An optional fractional-seconds part is accepted. It is truncated to six
    digits and right-padded with zeros, so it is always read as microseconds.

    >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)

    >>> iso_date_to_datetime('2013-12-26T10:11:12.456789Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12, 456789)

    >>> iso_date_to_datetime('2013-12-26T10:11:12.30Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12, 300000)

    >>> iso_date_to_datetime('2013-12-26T10:11:12.00001Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12, 10)

    >>> iso_date_to_datetime('2013-12-26T10:11:12.000001Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12, 1)

    >>> iso_date_to_datetime('2013-12-26T10:11:12.0000001Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)

    >>> iso_date_to_datetime('2013-12-26T10:11:12.000000Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)
    """

    nums = DATE_TIMESPLIT.split(string)
    # Drop the empty trailing field produced by the split.
    if nums[-1] == '':
        nums = nums[:-1]

    if len(nums) == 7:
        # Seventh field is the fraction: keep at most six digits, then
        # right-pad with zeros so '30' is read as 300000 microseconds.
        nums[6] = nums[6][:6]
        nums[6] += PAD_MICRO[len(nums[6]):]

    the_datetime = datetime.datetime(*(int(num) for num in nums))
    return the_datetime

Expand Down Expand Up @@ -65,16 +85,29 @@ def datetime_to_http_date(the_datetime):
usegmt=True)


def datetime_to_iso_date(the_datetime, use_micros=False):
    """Format a ``datetime`` as an ISO 8601 date string ending in ``Z``.

    :param the_datetime: the datetime to format.
    :param use_micros: when false (default), format with ``ISO_DT``; when
        true, use ``datetime.isoformat()``, which adds the six-digit
        fractional part only if ``microsecond`` is non-zero.

    >>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12))
    '2013-12-26T10:11:12Z'

    >>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12, 456789))
    '2013-12-26T10:11:12Z'

    >>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12), use_micros=True)
    '2013-12-26T10:11:12Z'

    >>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12, 456789), use_micros=True)
    '2013-12-26T10:11:12.456789Z'

    >>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12, 1), use_micros=True)
    '2013-12-26T10:11:12.000001Z'

    """

    if not use_micros:
        return the_datetime.strftime(ISO_DT)

    # NOTE(review): isoformat() would also emit a UTC offset for an aware
    # datetime; callers presumably pass naive UTC values - confirm.
    return the_datetime.isoformat() + 'Z'


def datetime_to_timestamp(the_datetime):
Expand Down
30 changes: 24 additions & 6 deletions warcio/warcwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ class BaseWARCWriter(object):

REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/identical-payload-digest'

WARC_VERSION = 'WARC/1.0'
WARC_1_0 = 'WARC/1.0'
WARC_1_1 = 'WARC/1.1'

# default warc version
WARC_VERSION = WARC_1_0

NO_PAYLOAD_DIGEST_TYPES = ('warcinfo', 'revisit')
NO_BLOCK_DIGEST_TYPES = ('warcinfo')
Expand All @@ -38,9 +42,19 @@ def __init__(self, gzip=True, *args, **kwargs):

self.parser = StatusAndHeadersParser([], verify=False)

self.warc_version = kwargs.get('warc_version', self.WARC_VERSION)
self.warc_version = self._parse_warc_version(kwargs.get('warc_version'))
self.header_filter = kwargs.get('header_filter')

def _parse_warc_version(self, version):
if not version:
return self.WARC_VERSION

version = str(version)
if version.startswith('WARC/'):
return version

return 'WARC/' + version

@classmethod
def _iter_stream(cls, stream):
while True:
Expand Down Expand Up @@ -134,7 +148,7 @@ def create_warcinfo_record(self, filename, info):
warc_headers.add_header('WARC-Record-ID', self._make_warc_id())
if filename:
warc_headers.add_header('WARC-Filename', filename)
warc_headers.add_header('WARC-Date', self._make_warc_date())
warc_headers.add_header('WARC-Date', self.curr_warc_date())

warcinfo = BytesIO()
for name, value in six.iteritems(info):
Expand Down Expand Up @@ -217,7 +231,7 @@ def _init_warc_headers(self, uri, record_type, warc_headers_dict):
warc_headers.replace_header('WARC-Target-URI', uri)

if not warc_headers.get_header('WARC-Date'):
warc_headers.add_header('WARC-Date', self._make_warc_date())
warc_headers.add_header('WARC-Date', self.curr_warc_date())

return warc_headers

Expand Down Expand Up @@ -294,13 +308,17 @@ def _write_warc_record(self, out, record):

out.flush()

def curr_warc_date(self):
    """Return a WARC-Date value suited to this writer's WARC version.

    Microsecond precision is requested from ``_make_warc_date`` when
    ``self.warc_version`` compares (as a string) greater than or equal to
    ``WARC_1_1``.
    """
    wants_micros = not (self.warc_version < self.WARC_1_1)
    return self._make_warc_date(use_micros=wants_micros)

@classmethod
def _make_warc_id(cls):
    """Return a new record id from ``StatusAndHeadersParser.make_warc_id``."""
    warc_id = StatusAndHeadersParser.make_warc_id()
    return warc_id

@classmethod
def _make_warc_date(cls, use_micros=False):
    """Return the current UTC time formatted as a WARC-Date string.

    :param use_micros: passed through to ``datetime_to_iso_date`` to request
        microsecond precision.
    """
    return datetime_to_iso_date(datetime.datetime.utcnow(), use_micros=use_micros)

@classmethod
def _create_temp_file(cls):
Expand Down