From 9ea90b172d993908090055cdc47d5a80d35089cb Mon Sep 17 00:00:00 2001 From: "Vanya A. Sergeev" Date: Fri, 14 Oct 2016 18:05:14 -0700 Subject: [PATCH] add option to allow unpacking invalid utf8 strings resolves #2. --- README.md | 24 ++++++++++++++++++++---- test_umsgpack.py | 9 +++++++++ umsgpack.py | 24 ++++++++++++++++++++++-- 3 files changed, 51 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index d63f17b..21dfb49 100644 --- a/README.md +++ b/README.md @@ -142,6 +142,20 @@ OrderedDict([('compact', True), ('schema', 0)]) >>> ``` +### Invalid UTF-8 Strings + +The unpacking functions provide an `allow_invalid_utf8` option to unpack MessagePack strings with invalid UTF-8 into the `umsgpack.InvalidString` type, instead of throwing an exception. The `umsgpack.InvalidString` type is a subclass of `bytes`, and can be used like any other `bytes` object. + +``` python +>>> # Attempt to unpack invalid UTF-8 string +... umsgpack.unpackb(b'\xa4\x80\x01\x02\x03') +... +umsgpack.InvalidStringException: unpacked string is invalid utf-8 +>>> umsgpack.unpackb(b'\xa4\x80\x01\x02\x03', allow_invalid_utf8=True) +b'\x80\x01\x02\x03' +>>> +``` + ### Compatibility Mode The compatibility mode supports the "raw" bytes MessagePack type from the [old specification](https://github.com/msgpack/msgpack/blob/master/spec-old.md). When the module-wide `compatibility` option is enabled, both unicode strings and bytes will be serialized into the "raw" MessagePack type, and the "raw" MessagePack type will be deserialized into bytes. @@ -216,13 +230,15 @@ If a non-byte-string argument is passed to `umsgpack.unpackb()`, it will raise a ``` * `InvalidStringException`: Invalid UTF-8 string encountered during unpacking. - String bytes are strictly decoded with UTF-8. This exception is thrown if UTF-8 decoding of string bytes fails. + String bytes are strictly decoded with UTF-8. This exception is thrown if + UTF-8 decoding of string bytes fails. Use the `allow_invalid_utf8` option + to unpack invalid MessagePack strings into byte strings. ``` python - # Attempt to unpack the string b"\x80\x81" + # Attempt to unpack invalid UTF-8 string >>> umsgpack.unpackb(b"\xa2\x80\x81") ... - umsgpack.InvalidStringException: unpacked string is not utf-8 + umsgpack.InvalidStringException: unpacked string is invalid utf-8 >>> ``` @@ -268,7 +284,7 @@ If a non-byte-string argument is passed to `umsgpack.unpackb()`, it will raise a * Python 3 * `str` type objects are packed into, and unpacked from, the msgpack `string` format * `bytes` type objects are packed into, and unpacked from, the msgpack `binary` format -* The msgpack string format is strictly decoded with UTF-8 -- an exception is thrown if the string bytes cannot be decoded into a valid UTF-8 string +* The msgpack string format is strictly decoded with UTF-8 — an exception is thrown if the string bytes cannot be decoded into a valid UTF-8 string, unless the `allow_invalid_utf8` option is enabled * The msgpack array format is unpacked into a Python list, unless it is the key of a map, in which case it is unpacked into a Python tuple * Python tuples and lists are both packed into the msgpack array format * Python float types are packed into the msgpack float32 or float64 format depending on the system's `sys.float_info` diff --git a/test_umsgpack.py b/test_umsgpack.py index 81ada20..f96f1fe 100644 --- a/test_umsgpack.py +++ b/test_umsgpack.py @@ -224,6 +224,7 @@ # These are the only global variables that should be exported by umsgpack exported_vars_test_vector = [ "Ext", + "InvalidString", "PackException", "UnpackException", "UnsupportedTypeException", @@ -332,6 +333,14 @@ def test_unpack_compatibility(self): umsgpack.compatibility = False + def test_unpack_invalid_string(self): + # Use last unpack exception test vector (an invalid string) + (_, data, _) = unpack_exception_test_vectors[-1] + + obj = umsgpack.unpackb(data, allow_invalid_utf8=True) + self.assertTrue(isinstance(obj, umsgpack.InvalidString)) + self.assertEqual(obj, b"\x80") + def test_unpack_ordered_dict(self): # Use last composite test vector (a map) (_, obj, data) = composite_test_vectors[-1] diff --git a/umsgpack.py b/umsgpack.py index 4129513..2481477 100644 --- a/umsgpack.py +++ b/umsgpack.py @@ -129,6 +129,10 @@ def __str__(self): s += ")" return s +class InvalidString(bytes): + """Subclass of bytes to hold invalid UTF-8 strings.""" + pass + ################################################################################ ### Exceptions ################################################################################ @@ -551,10 +555,13 @@ def _unpack_string(code, fp, options): if compatibility: return _read_except(fp, length) + data = _read_except(fp, length) try: - return bytes.decode(_read_except(fp, length), 'utf-8') + return bytes.decode(data, 'utf-8') except UnicodeDecodeError: - raise InvalidStringException("unpacked string is not utf-8") + if options.get("allow_invalid_utf8"): + return InvalidString(data) + raise InvalidStringException("unpacked string is invalid utf-8") def _unpack_binary(code, fp, options): if code == b'\xc4': @@ -655,6 +662,9 @@ def _unpack2(fp, **options): Kwargs: use_ordered_dict (bool): unpack maps into OrderedDict, instead of unordered dict (default False) + allow_invalid_utf8 (bool): unpack invalid strings into instances of + InvalidString, for access to the bytes + (default False) Returns: A Python object. @@ -690,6 +700,9 @@ def _unpack3(fp, **options): Kwargs: use_ordered_dict (bool): unpack maps into OrderedDict, instead of unordered dict (default False) + allow_invalid_utf8 (bool): unpack invalid strings into instances of + InvalidString, for access to the bytes + (default False) Returns: A Python object. @@ -726,6 +739,9 @@ def _unpackb2(s, **options): Kwargs: use_ordered_dict (bool): unpack maps into OrderedDict, instead of unordered dict (default False) + allow_invalid_utf8 (bool): unpack invalid strings into instances of + InvalidString, for access to the bytes + (default False) Returns: A Python object. @@ -765,6 +781,10 @@ def _unpackb3(s, **options): Kwargs: use_ordered_dict (bool): unpack maps into OrderedDict, instead of unordered dict (default False) + allow_invalid_utf8 (bool): unpack invalid strings into instances of + InvalidString, for access to the bytes + (default False) + Returns: A Python object.