From 9e8fd72db32935e0341be45488dc22fa88ece161 Mon Sep 17 00:00:00 2001 From: anivegesana Date: Sun, 22 May 2022 09:01:15 -0700 Subject: [PATCH] Determinism --- README.md | 10 ++++ dill/_dill.py | 115 +++++++++++++++++++++++++++++++----- dill/settings.py | 1 + tests/test_deterministic.py | 60 +++++++++++++++++++ 4 files changed, 170 insertions(+), 16 deletions(-) create mode 100644 tests/test_deterministic.py diff --git a/README.md b/README.md index 42eead69..ed61db59 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,16 @@ as keyword arguments to several ``dill`` functions: remote system which does not have the original file on disk. Options are *HANDLE_FMODE* for just the handle, *CONTENTS_FMODE* for the file content and *FILE_FMODE* for content and handle. +* with *deterministic=True*, dill will try to make pickles more likely to + be the same if an object is pickled multiple times. Currently, here is + the feature set: + * `set` and `frozenset` will be sorted before being pickled. + * Subclasses of `set` and `frozenset` will not be affected (and will remain nondeterministic) because they can implement their own `__reduce__` functions which don't have to follow the conventions of `set`'s pickling procedure. + * If the elements are incomparable (e.g. `complex`), they will be sorted by their hash instead. This will not create a natural order of elements that is easy to understand, but if the `__hash__` function of the class doesn't depend on `id`, it will be deterministic. + * If using the faster cPickle based pickler outlined in [#485](https://github.com/uqfoundation/dill/issues/485), this feature may be disabled. + * `dict` and subclasses will remain pickled in insertion order. + * Entries in global variable dictionaries will be in order for each function. The dictionary as a whole, however, will be ordered in visitation order by function and will not be sorted in alphabetical order. 
This will mean that the globals dictionaries will be deterministic given that the visitation order of functions is deterministic. + * This feature is guaranteed. * with *ignore=False*, objects reconstructed with types defined in the top-level script environment use the existing type in the environment rather than a possibly different reconstructed type. diff --git a/dill/_dill.py b/dill/_dill.py index 1c3caaed..23d752db 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -273,6 +273,7 @@ def get_file_type(*args, **kwargs): singletontypes = [] from collections import OrderedDict +from itertools import islice import inspect @@ -302,6 +303,8 @@ def __reduce_ex__(self, protocol): from . import _shims from ._shims import Reduce, Getattr +from pickle import EMPTY_SET, MARK, ADDITEMS, POP_MARK, FROZENSET, POP + ### File modes #: Pickles the file handle, preserving mode. The position of the unpickled #: object is as for a new file handle. @@ -323,7 +326,7 @@ def copy(obj, *args, **kwds): ignore = kwds.pop('ignore', Unpickler.settings['ignore']) return loads(dumps(obj, *args, **kwds), ignore=ignore) -def dump(obj, file, protocol=None, byref=None, fmode=None, recurse=None, **kwds):#, strictio=None): +def dump(obj, file, protocol=None, byref=None, fmode=None, recurse=None, deterministic=None, **kwds):#, strictio=None): """ Pickle an object to a file. 
@@ -332,11 +335,11 @@ def dump(obj, file, protocol=None, byref=None, fmode=None, recurse=None, **kwds) from .settings import settings protocol = settings['protocol'] if protocol is None else int(protocol) _kwds = kwds.copy() - _kwds.update(dict(byref=byref, fmode=fmode, recurse=recurse)) + _kwds.update(dict(byref=byref, fmode=fmode, recurse=recurse, deterministic=deterministic)) Pickler(file, protocol, **_kwds).dump(obj) return -def dumps(obj, protocol=None, byref=None, fmode=None, recurse=None, **kwds):#, strictio=None): +def dumps(obj, protocol=None, byref=None, fmode=None, recurse=None, deterministic=None, **kwds):#, strictio=None): """ Pickle an object to a string. @@ -361,7 +364,7 @@ def dumps(obj, protocol=None, byref=None, fmode=None, recurse=None, **kwds):#, s Default values for keyword arguments can be set in :mod:`dill.settings`. """ file = StringIO() - dump(obj, file, protocol, byref, fmode, recurse, **kwds)#, strictio) + dump(obj, file, protocol, byref, fmode, recurse, deterministic, **kwds)#, strictio) return file.getvalue() def load(file, ignore=None, **kwds): @@ -563,6 +566,7 @@ def __init__(self, *args, **kwds): #_strictio = kwds.pop('strictio', None) _fmode = kwds.pop('fmode', None) _recurse = kwds.pop('recurse', None) + _deterministic = kwds.pop('deterministic', None) StockPickler.__init__(self, *args, **kwds) self._main = _main_module self._diff_cache = {} @@ -570,6 +574,7 @@ def __init__(self, *args, **kwds): self._strictio = False #_strictio self._fmode = settings['fmode'] if _fmode is None else _fmode self._recurse = settings['recurse'] if _recurse is None else _recurse + self._deterministic = settings['deterministic'] if _deterministic is None else _deterministic from collections import OrderedDict self._postproc = OrderedDict() @@ -622,6 +627,91 @@ def save_numpy_array(pickler, obj): dump.__doc__ = StockPickler.dump.__doc__ pass + # https://github.com/python/cpython/blob/54b5e4da8a4c6ae527ab238fcd6b9ba0a3ed0fc7/Lib/pickle.py#L1009-L1054 + # 
This code MUST be updated if Python changes their implementation. + def save_set(self, obj): + # This if statement was added to sort the elements of the set before + # pickling in the case that a "deterministic" pickle is required. The + # result is not truly deterministic, but it is more stable than would + # otherwise be possible without sorting. If the elements are + # incomparable, the elements will be sorted by hash instead. Some + # objects use the memory location as the hash, which will result in + # non-deterministic elements regardless. + if getattr(self, '_deterministic', False): + try: + obj_list = obj_maybe_sorted = sorted(obj) + except Exception as e: + w = PicklingWarning("Cannot canonize a set with incomparable members") + w.__cause__ = e + warnings.warn(w) + obj_list = sorted(obj, key=hash) + obj_maybe_sorted = obj_list + else: + obj_list = list(obj) + obj_maybe_sorted = obj + # End determinism code + + save = self.save + write = self.write + + if self.proto < 4: + self.save_reduce(set, (obj_list,), obj=obj) + return + + write(EMPTY_SET) + self.memoize(obj) + + it = iter(obj_maybe_sorted) + while True: + batch = list(islice(it, self._BATCHSIZE)) + n = len(batch) + if n > 0: + write(MARK) + for item in batch: + save(item) + write(ADDITEMS) + if n < self._BATCHSIZE: + return + dispatch[set] = save_set + + def save_frozenset(self, obj): + # Start determinism code. See save_set code for explanation. 
+ if getattr(self, '_deterministic', False): + try: + obj_list = obj_maybe_sorted = sorted(obj) + except Exception as e: + w = PicklingWarning("Cannot canonize a frozenset with incomparable members") + w.__cause__ = e + warnings.warn(w) + obj_list = sorted(obj, key=hash) + obj_maybe_sorted = obj_list + else: + obj_list = list(obj) + obj_maybe_sorted = obj + # End determinism code + + save = self.save + write = self.write + + if self.proto < 4: + self.save_reduce(frozenset, (obj_list,), obj=obj) + return + + write(MARK) + for item in obj_maybe_sorted: + save(item) + + if id(obj) in self.memo: + # If the object is already in the memo, this means it is + # recursive. In this case, throw away everything we put on the + # stack, and fetch the object back from the memo. + write(POP_MARK + self.get(self.memo[id(obj)][0])) + return + + write(FROZENSET) + self.memoize(obj) + dispatch[frozenset] = save_frozenset + class Unpickler(StockUnpickler): """python's Unpickler extended to interpreter sessions and more types""" from .settings import settings @@ -1158,10 +1248,7 @@ def _save_with_postproc(pickler, reduction, is_pickler_dill=None, obj=Getattr.NO else: pickler.save_reduce(*reduction) # pop None created by calling preprocessing step off stack - if PY3: - pickler.write(bytes('0', 'UTF-8')) - else: - pickler.write('0') + pickler.write(POP) #@register(CodeType) #def save_code(pickler, obj): @@ -1582,10 +1669,7 @@ def save_cell(pickler, obj): # The result of this function call will be None pickler.save_reduce(_shims._delattr, (obj, 'cell_contents')) # pop None created by calling _delattr off stack - if PY3: - pickler.write(bytes('0', 'UTF-8')) - else: - pickler.write('0') + pickler.write(POP) log.info("# Ce3") return if is_dill(pickler, child=True): @@ -1930,6 +2014,8 @@ def save_function(pickler, obj): # In the case that the globals are copied, we need to ensure that # the globals dictionary is updated when all objects in the # dictionary are already created. 
+ if getattr(pickler, '_deterministic', False): + globs_copy = dict(sorted(globs_copy.items())) if PY3: glob_ids = {id(g) for g in globs_copy.values()} else: @@ -1992,10 +2078,7 @@ def save_function(pickler, obj): # Change the value of the cell pickler.save_reduce(*possible_postproc) # pop None created by calling preprocessing step off stack - if PY3: - pickler.write(bytes('0', 'UTF-8')) - else: - pickler.write('0') + pickler.write(POP) log.info("# F1") else: diff --git a/dill/settings.py b/dill/settings.py index 4d0226b0..08662e9c 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -22,6 +22,7 @@ 'fmode' : 0, #HANDLE_FMODE 'recurse' : False, 'ignore' : False, + 'deterministic' : False, } del DEFAULT_PROTOCOL diff --git a/tests/test_deterministic.py b/tests/test_deterministic.py new file mode 100644 index 00000000..995a0482 --- /dev/null +++ b/tests/test_deterministic.py @@ -0,0 +1,60 @@ +import collections +import dill +import warnings + +b = 5 +a = 0 +c = 7 + +def test_determinism(): + def f(): + global a, b, c + return a + b + c + + d1 = {'a': 0, 'c': 7, 'b': 5, '__name__': __name__, '__builtins__': __builtins__} + d2 = {'a': 0, 'b': 5, 'c': 7, '__name__': __name__, '__builtins__': __builtins__} + assert dill.dumps(d1) != dill.dumps(d2) + + F1 = dill.dumps(f, recurse=True) + F1D = dill.dumps(f, recurse=True, deterministic=True) + + qual = f.__qualname__ + f = dill._dill.FunctionType(f.__code__, d1, f.__name__, f.__defaults__, f.__closure__) + f.__qualname__ = qual + f.__module__ = '__main__' + + assert f.__globals__ is d1 + + F2 = dill.dumps(f, recurse=True) + F2D = dill.dumps(f, recurse=True, deterministic=True) + + f = dill._dill.FunctionType(f.__code__, d2, f.__name__, f.__defaults__, f.__closure__) + f.__qualname__ = qual + f.__module__ = '__main__' + + assert f.__globals__ is d2 + + F3 = dill.dumps(f, recurse=True) + F3D = dill.dumps(f, recurse=True, deterministic=True) + + # TODO: actually create a test to verify that the globals are sorted. 
The + # globalvars function gets the globals dictionary from the module, not the + # function itself, so they will all have the same global namespace. + # assert F2 != F3 + # assert F1 != F1D + assert F1D == F2D + assert F2D == F3D + + a = {2-1j,2+1j,1+2j,1-2j} + b = a.copy() + b.add(-2) + b.remove(-2) + if not dill._dill.IS_PYPY: + assert list(a) != list(b) + assert dill.dumps(a, deterministic=True) == dill.dumps(b, deterministic=True) + +if __name__ == '__main__': + if dill._dill.PY3: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", dill.PickleWarning) + test_determinism()