Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deterministic mode #501

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,16 @@ as keyword arguments to several ``dill`` functions:
remote system which does not have the original file on disk. Options are
*HANDLE_FMODE* for just the handle, *CONTENTS_FMODE* for the file content
and *FILE_FMODE* for content and handle.
* with *deterministic=True*, dill will try to make pickles more likely to
be the same if an object is pickled multiple times. Currently, here is
the feature set:
* `set` and `frozenset` will be sorted before being pickled.
* Subclasses of `set` and `frozenset` will not be affected (and will remain nondeterministic) because they can implement their own `__reduce__` functions which don't have to follow the conventions of `set`'s pickling procedure.
* If the elements are incomparable (e.g. `complex`), they will be sorted by their hash instead. This will not create a natural order of elements that is easy to understand, but if the `__hash__` function of the class doesn't depend on `id`, it will be deterministic.
* If using the faster cPickle based pickler outlined in [#485](https://github.com/uqfoundation/dill/issues/485), this feature may be disabled.
* `dict` and subclasses will remain pickled in insertion order.
* Entries in global variable dictionaries will be in order for each function. The dictionary as a whole, however, will be ordered in visitation order by function and will not be sorted in alphabetical order. This will mean that the globals dictionaries will be deterministic given that the visitation order of functions is deterministic.
* This feature is guaranteed.
* with *ignore=False*, objects reconstructed with types defined in the
top-level script environment use the existing type in the environment
rather than a possibly different reconstructed type.
Expand Down
115 changes: 99 additions & 16 deletions dill/_dill.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ def get_file_type(*args, **kwargs):
singletontypes = []

from collections import OrderedDict
from itertools import islice

import inspect

Expand Down Expand Up @@ -302,6 +303,8 @@ def __reduce_ex__(self, protocol):
from . import _shims
from ._shims import Reduce, Getattr

from pickle import EMPTY_SET, MARK, ADDITEMS, POP_MARK, FROZENSET, POP

### File modes
#: Pickles the file handle, preserving mode. The position of the unpickled
#: object is as for a new file handle.
Expand All @@ -323,7 +326,7 @@ def copy(obj, *args, **kwds):
ignore = kwds.pop('ignore', Unpickler.settings['ignore'])
return loads(dumps(obj, *args, **kwds), ignore=ignore)

def dump(obj, file, protocol=None, byref=None, fmode=None, recurse=None, **kwds):#, strictio=None):
def dump(obj, file, protocol=None, byref=None, fmode=None, recurse=None, deterministic=None, **kwds):#, strictio=None):
"""
Pickle an object to a file.

Expand All @@ -332,11 +335,11 @@ def dump(obj, file, protocol=None, byref=None, fmode=None, recurse=None, **kwds)
from .settings import settings
protocol = settings['protocol'] if protocol is None else int(protocol)
_kwds = kwds.copy()
_kwds.update(dict(byref=byref, fmode=fmode, recurse=recurse))
_kwds.update(dict(byref=byref, fmode=fmode, recurse=recurse, deterministic=deterministic))
Pickler(file, protocol, **_kwds).dump(obj)
return

def dumps(obj, protocol=None, byref=None, fmode=None, recurse=None, **kwds):#, strictio=None):
def dumps(obj, protocol=None, byref=None, fmode=None, recurse=None, deterministic=None, **kwds):#, strictio=None):
"""
Pickle an object to a string.

Expand All @@ -361,7 +364,7 @@ def dumps(obj, protocol=None, byref=None, fmode=None, recurse=None, **kwds):#, s
Default values for keyword arguments can be set in :mod:`dill.settings`.
"""
file = StringIO()
dump(obj, file, protocol, byref, fmode, recurse, **kwds)#, strictio)
dump(obj, file, protocol, byref, fmode, recurse, deterministic, **kwds)#, strictio)
return file.getvalue()

def load(file, ignore=None, **kwds):
Expand Down Expand Up @@ -563,13 +566,15 @@ def __init__(self, *args, **kwds):
#_strictio = kwds.pop('strictio', None)
_fmode = kwds.pop('fmode', None)
_recurse = kwds.pop('recurse', None)
_deterministic = kwds.pop('deterministic', None)
StockPickler.__init__(self, *args, **kwds)
self._main = _main_module
self._diff_cache = {}
self._byref = settings['byref'] if _byref is None else _byref
self._strictio = False #_strictio
self._fmode = settings['fmode'] if _fmode is None else _fmode
self._recurse = settings['recurse'] if _recurse is None else _recurse
self._deterministic = settings['deterministic'] if _deterministic is None else _deterministic
from collections import OrderedDict
self._postproc = OrderedDict()

Expand Down Expand Up @@ -622,6 +627,91 @@ def save_numpy_array(pickler, obj):
dump.__doc__ = StockPickler.dump.__doc__
pass

# https://github.com/python/cpython/blob/54b5e4da8a4c6ae527ab238fcd6b9ba0a3ed0fc7/Lib/pickle.py#L1009-L1054
# This code MUST be updated if Python changes their implementation.
def save_set(self, obj):
    """Pickle a ``set``, sorting its elements first in deterministic mode.

    When ``self._deterministic`` is truthy the elements are written in
    ``sorted()`` order so that repeated pickles of equal sets are more
    likely to be byte-identical. If sorting raises, a ``PicklingWarning``
    is issued and the elements are ordered by ``hash`` instead.

    For protocols below 4 the set is saved through ``save_reduce`` as
    ``set(list_of_elements)``. Otherwise ``EMPTY_SET`` is written, the
    object is memoized, and the elements are emitted in batches of at most
    ``self._BATCHSIZE``, each framed by ``MARK`` ... ``ADDITEMS``.
    """
    # The result is not truly deterministic, but it is more stable than
    # would otherwise be possible without sorting. Some objects use the
    # memory location as the hash, which will result in non-deterministic
    # elements regardless of the hash-based fallback.
    deterministic = getattr(self, '_deterministic', False)
    if deterministic:
        try:
            elements = sorted(obj)
        except Exception as e:
            # Deliberately broad: any failure while comparing members falls
            # back to hash ordering instead of aborting the pickle.
            w = PicklingWarning("Cannot canonize a set with incomparable members")
            w.__cause__ = e
            warnings.warn(w)
            elements = sorted(obj, key=hash)
    else:
        # No copy here: the batched protocol >= 4 path can iterate the set
        # directly, so a list is only built below if it is really needed.
        elements = obj

    save = self.save
    write = self.write

    if self.proto < 4:
        # sorted() already returned a list; otherwise materialize one now.
        as_list = elements if deterministic else list(obj)
        self.save_reduce(set, (as_list,), obj=obj)
        return

    write(EMPTY_SET)
    self.memoize(obj)

    it = iter(elements)
    while True:
        batch = list(islice(it, self._BATCHSIZE))
        n = len(batch)
        if n > 0:
            write(MARK)
            for item in batch:
                save(item)
            write(ADDITEMS)
        if n < self._BATCHSIZE:
            # A short (or empty) batch means the iterator is exhausted.
            return
dispatch[set] = save_set

def save_frozenset(self, obj):
    """Pickle a ``frozenset``, sorting its elements first in deterministic mode.

    When ``self._deterministic`` is truthy the elements are written in
    ``sorted()`` order; if sorting raises, a ``PicklingWarning`` is issued
    and the elements are ordered by ``hash`` instead (which is only stable
    when the members' hashes do not depend on memory location).

    For protocols below 4 the object is saved through ``save_reduce`` as
    ``frozenset(list_of_elements)``. Otherwise the elements are written
    between ``MARK`` and ``FROZENSET`` and the object is memoized afterwards.
    """
    deterministic = getattr(self, '_deterministic', False)
    if deterministic:
        try:
            elements = sorted(obj)
        except Exception as e:
            # Deliberately broad: any failure while comparing members falls
            # back to hash ordering instead of aborting the pickle.
            w = PicklingWarning("Cannot canonize a frozenset with incomparable members")
            w.__cause__ = e
            warnings.warn(w)
            elements = sorted(obj, key=hash)
    else:
        # No copy here: the protocol >= 4 path iterates the frozenset
        # directly, so a list is only built below if it is really needed.
        elements = obj

    save = self.save
    write = self.write

    if self.proto < 4:
        # sorted() already returned a list; otherwise materialize one now.
        as_list = elements if deterministic else list(obj)
        self.save_reduce(frozenset, (as_list,), obj=obj)
        return

    write(MARK)
    for item in elements:
        save(item)

    if id(obj) in self.memo:
        # If the object is already in the memo, this means it is
        # recursive. In this case, throw away everything we put on the
        # stack, and fetch the object back from the memo.
        write(POP_MARK + self.get(self.memo[id(obj)][0]))
        return

    write(FROZENSET)
    self.memoize(obj)
dispatch[frozenset] = save_frozenset

class Unpickler(StockUnpickler):
"""python's Unpickler extended to interpreter sessions and more types"""
from .settings import settings
Expand Down Expand Up @@ -1158,10 +1248,7 @@ def _save_with_postproc(pickler, reduction, is_pickler_dill=None, obj=Getattr.NO
else:
pickler.save_reduce(*reduction)
# pop None created by calling preprocessing step off stack
if PY3:
pickler.write(bytes('0', 'UTF-8'))
else:
pickler.write('0')
pickler.write(POP)

#@register(CodeType)
#def save_code(pickler, obj):
Expand Down Expand Up @@ -1582,10 +1669,7 @@ def save_cell(pickler, obj):
# The result of this function call will be None
pickler.save_reduce(_shims._delattr, (obj, 'cell_contents'))
# pop None created by calling _delattr off stack
if PY3:
pickler.write(bytes('0', 'UTF-8'))
else:
pickler.write('0')
pickler.write(POP)
log.info("# Ce3")
return
if is_dill(pickler, child=True):
Expand Down Expand Up @@ -1930,6 +2014,8 @@ def save_function(pickler, obj):
# In the case that the globals are copied, we need to ensure that
# the globals dictionary is updated when all objects in the
# dictionary are already created.
if getattr(pickler, '_deterministic', False):
globs_copy = dict(sorted(globs_copy.items()))
if PY3:
glob_ids = {id(g) for g in globs_copy.values()}
else:
Expand Down Expand Up @@ -1992,10 +2078,7 @@ def save_function(pickler, obj):
# Change the value of the cell
pickler.save_reduce(*possible_postproc)
# pop None created by calling preprocessing step off stack
if PY3:
pickler.write(bytes('0', 'UTF-8'))
else:
pickler.write('0')
pickler.write(POP)

log.info("# F1")
else:
Expand Down
1 change: 1 addition & 0 deletions dill/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
'fmode' : 0, #HANDLE_FMODE
'recurse' : False,
'ignore' : False,
'deterministic' : False,
}

del DEFAULT_PROTOCOL
Expand Down
60 changes: 60 additions & 0 deletions tests/test_deterministic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import collections
import dill
import warnings

b = 5
a = 0
c = 7

def test_determinism():
    """Check that ``deterministic=True`` yields identical pickles.

    Two things are exercised: a function rebuilt over differently ordered
    globals dictionaries, and two equal sets of incomparable (complex)
    members.
    """
    def f():
        # Reads the module-level a, b and c, not any local of the test.
        global a, b, c
        return a + b + c

    # Same entries, different insertion order.
    d1 = {'a': 0, 'c': 7, 'b': 5, '__name__': __name__, '__builtins__': __builtins__}
    d2 = {'a': 0, 'b': 5, 'c': 7, '__name__': __name__, '__builtins__': __builtins__}
    assert dill.dumps(d1) != dill.dumps(d2)

    F1 = dill.dumps(f, recurse=True)
    F1D = dill.dumps(f, recurse=True, deterministic=True)

    # Rebuild f on top of d1 so that its __globals__ is that exact dict.
    qual = f.__qualname__
    f = dill._dill.FunctionType(f.__code__, d1, f.__name__, f.__defaults__, f.__closure__)
    f.__qualname__ = qual
    f.__module__ = '__main__'

    assert f.__globals__ is d1

    F2 = dill.dumps(f, recurse=True)
    F2D = dill.dumps(f, recurse=True, deterministic=True)

    # And once more on top of d2.
    f = dill._dill.FunctionType(f.__code__, d2, f.__name__, f.__defaults__, f.__closure__)
    f.__qualname__ = qual
    f.__module__ = '__main__'

    assert f.__globals__ is d2

    F3 = dill.dumps(f, recurse=True)
    F3D = dill.dumps(f, recurse=True, deterministic=True)

    # TODO: actually create a test to verify that the globals are sorted. The
    # globalvars function gets the globals dictionary from the module, not the
    # function itself, so they will all have the same global namespace.
    # assert F2 != F3
    # assert F1 != F1D
    assert F1D == F2D
    assert F2D == F3D

    # Two equal sets of complex numbers; the add/remove on the copy is meant
    # to perturb its iteration order. Named set_a/set_b so they do not shadow
    # the module-level a and b that f reads.
    set_a = {2-1j,2+1j,1+2j,1-2j}
    set_b = set_a.copy()
    set_b.add(-2)
    set_b.remove(-2)
    if not dill._dill.IS_PYPY:
        assert list(set_a) != list(set_b)
    # Complex members cannot be ordered, so pickling them deterministically
    # is expected to emit a warning. Silence it here so the test behaves the
    # same under a test runner as it does when run as a script.
    # NOTE(review): assumes the emitted warning is a dill.PickleWarning
    # subclass, as the __main__ runner does -- confirm.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", dill.PickleWarning)
        assert dill.dumps(set_a, deterministic=True) == dill.dumps(set_b, deterministic=True)

# Script entry point: run the test only on Python 3, hiding dill's pickle
# warnings while it runs.
if __name__ == '__main__' and dill._dill.PY3:
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", dill.PickleWarning)
        test_determinism()