From a650f626fdc3d8e97db4823cf9c5dcfac5fa437a Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sun, 5 Jun 2022 14:14:31 -0300 Subject: [PATCH 001/109] Move dump_session(), load_session() and everything related to session.py --- dill/__init__.py | 5 +- dill/_dill.py | 134 +------------------------------------ dill/session.py | 149 ++++++++++++++++++++++++++++++++++++++++++ tests/test_session.py | 2 +- 4 files changed, 154 insertions(+), 136 deletions(-) create mode 100644 dill/session.py diff --git a/dill/__init__.py b/dill/__init__.py index ac93ff6a..b28a1ac8 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -283,12 +283,13 @@ """ -from ._dill import dump, dumps, load, loads, dump_session, load_session, \ +from ._dill import dump, dumps, load, loads, \ Pickler, Unpickler, register, copy, pickle, pickles, check, \ HIGHEST_PROTOCOL, DEFAULT_PROTOCOL, PicklingError, UnpicklingError, \ HANDLE_FMODE, CONTENTS_FMODE, FILE_FMODE, PickleError, PickleWarning, \ PicklingWarning, UnpicklingWarning -from . import source, temp, detect +from .session import dump_session, load_session +from . import detect, session, source, temp # get global settings from .settings import settings diff --git a/dill/_dill.py b/dill/_dill.py index ee1d985f..f0d55115 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -15,7 +15,7 @@ Test against "all" python types (Std. Lib. CH 1-15 @ 2.7) by mmckerns. Test against CH16+ Std. Lib. ... TBD. 
""" -__all__ = ['dump','dumps','load','loads','dump_session','load_session', +__all__ = ['dump','dumps','load','loads', 'Pickler','Unpickler','register','copy','pickle','pickles', 'check','HIGHEST_PROTOCOL','DEFAULT_PROTOCOL','PicklingError', 'UnpicklingError','HANDLE_FMODE','CONTENTS_FMODE','FILE_FMODE', @@ -396,138 +396,6 @@ def loads(str, ignore=None, **kwds): ### End: Shorthands ### -### Pickle the Interpreter Session -SESSION_IMPORTED_AS_TYPES = (ModuleType, ClassType, TypeType, Exception, - FunctionType, MethodType, BuiltinMethodType) - -def _module_map(): - """get map of imported modules""" - from collections import defaultdict, namedtuple - modmap = namedtuple('Modmap', ['by_name', 'by_id', 'top_level']) - modmap = modmap(defaultdict(list), defaultdict(list), {}) - items = 'items' if PY3 else 'iteritems' - for modname, module in getattr(sys.modules, items)(): - if not isinstance(module, ModuleType): - continue - if '.' not in modname: - modmap.top_level[id(module)] = modname - for objname, modobj in module.__dict__.items(): - modmap.by_name[objname].append((modobj, modname)) - modmap.by_id[id(modobj)].append((modobj, objname, modname)) - return modmap - -def _lookup_module(modmap, name, obj, main_module): - """lookup name or id of obj if module is imported""" - for modobj, modname in modmap.by_name[name]: - if modobj is obj and sys.modules[modname] is not main_module: - return modname, name - if isinstance(obj, SESSION_IMPORTED_AS_TYPES): - for modobj, objname, modname in modmap.by_id[id(obj)]: - if sys.modules[modname] is not main_module: - return modname, objname - return None, None - -def _stash_modules(main_module): - modmap = _module_map() - newmod = ModuleType(main_module.__name__) - - imported = [] - imported_as = [] - imported_top_level = [] # keep separeted for backwards compatibility - original = {} - items = 'items' if PY3 else 'iteritems' - for name, obj in getattr(main_module.__dict__, items)(): - if obj is main_module: - original[name] = 
newmod # self-reference - continue - - # Avoid incorrectly matching a singleton value in another package (ex.: __doc__). - if any(obj is singleton for singleton in (None, False, True)) or \ - isinstance(obj, ModuleType) and _is_builtin_module(obj): # always saved by ref - original[name] = obj - continue - - source_module, objname = _lookup_module(modmap, name, obj, main_module) - if source_module: - if objname == name: - imported.append((source_module, name)) - else: - imported_as.append((source_module, objname, name)) - else: - try: - imported_top_level.append((modmap.top_level[id(obj)], name)) - except KeyError: - original[name] = obj - - if len(original) < len(main_module.__dict__): - newmod.__dict__.update(original) - newmod.__dill_imported = imported - newmod.__dill_imported_as = imported_as - newmod.__dill_imported_top_level = imported_top_level - return newmod - else: - return main_module - -def _restore_modules(unpickler, main_module): - try: - for modname, name in main_module.__dict__.pop('__dill_imported'): - main_module.__dict__[name] = unpickler.find_class(modname, name) - for modname, objname, name in main_module.__dict__.pop('__dill_imported_as'): - main_module.__dict__[name] = unpickler.find_class(modname, objname) - for modname, name in main_module.__dict__.pop('__dill_imported_top_level'): - main_module.__dict__[name] = __import__(modname) - except KeyError: - pass - -#NOTE: 06/03/15 renamed main_module to main -def dump_session(filename='/tmp/session.pkl', main=None, byref=False, **kwds): - """pickle the current state of __main__ to a file""" - from .settings import settings - protocol = settings['protocol'] - if main is None: main = _main_module - if hasattr(filename, 'write'): - f = filename - else: - f = open(filename, 'wb') - try: - pickler = Pickler(f, protocol, **kwds) - pickler._original_main = main - if byref: - main = _stash_modules(main) - pickler._main = main #FIXME: dill.settings are disabled - pickler._byref = False # disable pickling 
by name reference - pickler._recurse = False # disable pickling recursion for globals - pickler._session = True # is best indicator of when pickling a session - pickler._first_pass = True - pickler._main_modified = main is not pickler._original_main - pickler.dump(main) - finally: - if f is not filename: # If newly opened file - f.close() - return - -def load_session(filename='/tmp/session.pkl', main=None, **kwds): - """update the __main__ module with the state from the session file""" - if main is None: main = _main_module - if hasattr(filename, 'read'): - f = filename - else: - f = open(filename, 'rb') - try: #FIXME: dill.settings are disabled - unpickler = Unpickler(f, **kwds) - unpickler._main = main - unpickler._session = True - module = unpickler.load() - unpickler._session = False - main.__dict__.update(module.__dict__) - _restore_modules(unpickler, main) - finally: - if f is not filename: # If newly opened file - f.close() - return - -### End: Pickle the Interpreter - class MetaCatchingDict(dict): def get(self, key, default=None): try: diff --git a/dill/session.py b/dill/session.py new file mode 100644 index 00000000..35485009 --- /dev/null +++ b/dill/session.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python +# +# Author: Mike McKerns (mmckerns @caltech and @uqfoundation) +# Author: Leonardo Gama (@leogama) +# Copyright (c) 2008-2015 California Institute of Technology. +# Copyright (c) 2016-2022 The Uncertainty Quantification Foundation. +# License: 3-clause BSD. The full license text is available at: +# - https://github.com/uqfoundation/dill/blob/master/LICENSE +""" +Pickle and restore the intepreter session. 
+""" + +__all__ = ['dump_session', 'load_session'] + +import logging, sys + +from dill import _dill, Pickler, Unpickler +from ._dill import ModuleType, _import_module, _is_builtin_module, _main_module, PY3 + +SESSION_IMPORTED_AS_TYPES = tuple([Exception] + [getattr(_dill, name) for name in + ('ModuleType', 'TypeType', 'FunctionType', 'MethodType', 'BuiltinMethodType')]) + +log = logging.getLogger('dill') + +def _module_map(): + """get map of imported modules""" + from collections import defaultdict, namedtuple + modmap = namedtuple('Modmap', ['by_name', 'by_id', 'top_level']) + modmap = modmap(defaultdict(list), defaultdict(list), {}) + items = 'items' if PY3 else 'iteritems' + for modname, module in getattr(sys.modules, items)(): + if not isinstance(module, ModuleType): + continue + if '.' not in modname: + modmap.top_level[id(module)] = modname + for objname, modobj in module.__dict__.items(): + modmap.by_name[objname].append((modobj, modname)) + modmap.by_id[id(modobj)].append((modobj, objname, modname)) + return modmap + +def _lookup_module(modmap, name, obj, main_module): + """lookup name or id of obj if module is imported""" + for modobj, modname in modmap.by_name[name]: + if modobj is obj and sys.modules[modname] is not main_module: + return modname, name + if isinstance(obj, SESSION_IMPORTED_AS_TYPES): + for modobj, objname, modname in modmap.by_id[id(obj)]: + if sys.modules[modname] is not main_module: + return modname, objname + return None, None + +def _stash_modules(main_module): + modmap = _module_map() + newmod = ModuleType(main_module.__name__) + + imported = [] + imported_as = [] + imported_top_level = [] # keep separeted for backwards compatibility + original = {} + items = 'items' if PY3 else 'iteritems' + for name, obj in getattr(main_module.__dict__, items)(): + if obj is main_module: + original[name] = newmod # self-reference + continue + + # Avoid incorrectly matching a singleton value in another package (ex.: __doc__). 
+ if any(obj is singleton for singleton in (None, False, True)) or \ + isinstance(obj, ModuleType) and _is_builtin_module(obj): # always saved by ref + original[name] = obj + continue + + source_module, objname = _lookup_module(modmap, name, obj, main_module) + if source_module: + if objname == name: + imported.append((source_module, name)) + else: + imported_as.append((source_module, objname, name)) + else: + try: + imported_top_level.append((modmap.top_level[id(obj)], name)) + except KeyError: + original[name] = obj + + if len(original) < len(main_module.__dict__): + newmod.__dict__.update(original) + newmod.__dill_imported = imported + newmod.__dill_imported_as = imported_as + newmod.__dill_imported_top_level = imported_top_level + return newmod + else: + return main_module + +def _restore_modules(unpickler, main_module): + try: + for modname, name in main_module.__dict__.pop('__dill_imported'): + main_module.__dict__[name] = unpickler.find_class(modname, name) + for modname, objname, name in main_module.__dict__.pop('__dill_imported_as'): + main_module.__dict__[name] = unpickler.find_class(modname, objname) + for modname, name in main_module.__dict__.pop('__dill_imported_top_level'): + main_module.__dict__[name] = __import__(modname) + except KeyError: + pass + +#NOTE: 06/03/15 renamed main_module to main +def dump_session(filename='/tmp/session.pkl', main=None, byref=False, **kwds): + """pickle the current state of __main__ to a file""" + from .settings import settings + protocol = settings['protocol'] + if main is None: main = _main_module + if hasattr(filename, 'write'): + f = filename + else: + f = open(filename, 'wb') + try: + pickler = Pickler(f, protocol, **kwds) + pickler._original_main = main + if byref: + main = _stash_modules(main) + pickler._main = main #FIXME: dill.settings are disabled + pickler._byref = False # disable pickling by name reference + pickler._recurse = False # disable pickling recursion for globals + pickler._session = True # is 
best indicator of when pickling a session + pickler._first_pass = True + pickler._main_modified = main is not pickler._original_main + pickler.dump(main) + finally: + if f is not filename: # If newly opened file + f.close() + return + +def load_session(filename='/tmp/session.pkl', main=None, **kwds): + """update the __main__ module with the state from the session file""" + if main is None: main = _main_module + if hasattr(filename, 'read'): + f = filename + else: + f = open(filename, 'rb') + try: #FIXME: dill.settings are disabled + unpickler = Unpickler(f, **kwds) + unpickler._main = main + unpickler._session = True + module = unpickler.load() + unpickler._session = False + main.__dict__.update(module.__dict__) + _restore_modules(unpickler, main) + finally: + if f is not filename: # If newly opened file + f.close() + return diff --git a/tests/test_session.py b/tests/test_session.py index fd71ea05..c4cbdd06 100644 --- a/tests/test_session.py +++ b/tests/test_session.py @@ -197,7 +197,7 @@ def test_objects(main, copy_dict, byref): main = ModuleType(modname) main.x = 42 - _main = dill._dill._stash_modules(main) + _main = dill.session._stash_modules(main) if _main is not main: print("There are objects to save by referenece that shouldn't be:", _main.__dill_imported, _main.__dill_imported_as, _main.__dill_imported_top_level, From de1943fac5bfab22884c2f4a3f49f3f644b96be9 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Wed, 8 Jun 2022 13:29:14 -0300 Subject: [PATCH 002/109] Add options to exclude objects from dump_session() --- dill/__init__.py | 13 +-- dill/_dill.py | 5 +- dill/_utils.py | 234 +++++++++++++++++++++++++++++++++++++++++++++++ dill/session.py | 117 +++++++++++++++++++----- dill/settings.py | 8 +- 5 files changed, 345 insertions(+), 32 deletions(-) create mode 100644 dill/_utils.py diff --git a/dill/__init__.py b/dill/__init__.py index b28a1ac8..12ad3cec 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -283,16 +283,17 @@ """ -from ._dill import 
dump, dumps, load, loads, \ - Pickler, Unpickler, register, copy, pickle, pickles, check, \ - HIGHEST_PROTOCOL, DEFAULT_PROTOCOL, PicklingError, UnpicklingError, \ - HANDLE_FMODE, CONTENTS_FMODE, FILE_FMODE, PickleError, PickleWarning, \ - PicklingWarning, UnpicklingWarning +from ._dill import ( + Pickler, Unpickler, + dump, dumps, load, loads, copy, check, pickle, pickles, register, + DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, HANDLE_FMODE, CONTENTS_FMODE, FILE_FMODE, + PicklingError, UnpicklingError, PickleError, PicklingWarning, UnpicklingWarning, PickleWarning, + ) from .session import dump_session, load_session from . import detect, session, source, temp # get global settings -from .settings import settings +from .settings import Settings, settings # make sure "trace" is turned off detect.trace(False) diff --git a/dill/_dill.py b/dill/_dill.py index f0d55115..9b062a1d 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -1917,8 +1917,7 @@ def save_function(pickler, obj): _recurse = getattr(pickler, '_recurse', None) _byref = getattr(pickler, '_byref', None) _postproc = getattr(pickler, '_postproc', None) - _main_modified = getattr(pickler, '_main_modified', None) - _original_main = getattr(pickler, '_original_main', __builtin__)#'None' + _original_main = getattr(pickler, '_original_main', None) postproc_list = [] if _recurse: # recurse to get all globals referred to by obj @@ -1935,7 +1934,7 @@ def save_function(pickler, obj): # If the globals is the __dict__ from the module being saved as a # session, substitute it by the dictionary being actually saved. - if _main_modified and globs_copy is _original_main.__dict__: + if _original_main and globs_copy is _original_main.__dict__: globs_copy = getattr(pickler, '_main', _original_main).__dict__ globs = globs_copy # If the globals is a module __dict__, do not save it in the pickle. 
diff --git a/dill/_utils.py b/dill/_utils.py new file mode 100644 index 00000000..1ce67acd --- /dev/null +++ b/dill/_utils.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python +# +# Author: Leonardo Gama (@leogama) +# Copyright (c) 2022 The Uncertainty Quantification Foundation. +# License: 3-clause BSD. The full license text is available at: +# - https://github.com/uqfoundation/dill/blob/master/LICENSE +"""auxiliary internal classes used in multiple submodules, set here to avoid import recursion""" + +__all__ = ['AttrDict', 'ExcludeRules', 'Filter', 'RuleType'] + +import logging +logger = logging.getLogger('dill._utils') + +import inspect +from functools import partialmethod + +class AttrDict(dict): + """syntactic sugar for accessing dictionary items""" + _CAST = object() # singleton + def __init__(self, *args, **kwargs): + data = args[0] if len(args) == 2 and args[1] is self._CAST else dict(*args, **kwargs) + for key, val in tuple(data.items()): + if isinstance(val, dict) and not isinstance(val, AttrDict): + data[key] = AttrDict(val, self._CAST) + super().__setattr__('_data', data) + def _check_attr(self, name): + try: + super().__getattribute__(name) + except AttributeError: + pass + else: + raise AttributeError("'AttrDict' object attribute %r is read-only" % name) + def __getattr__(self, key): + # This is called only if dict.__getattribute__(key) fails. + try: + return self._data[key] + except KeyError: + raise AttributeError("'AttrDict' object has no attribute %r" % key) + def __setattr__(self, key, value): + self._check_attr(key) + if isinstance(value, dict): + self._data[key] = AttrDict(value, self._CAST) + else: + self._data[key] = value + def __delattr__(self, key): + self._check_attr(key) + del self._data[key] + def __proxy__(self, method, *args, **kwargs): + return getattr(self._data, method)(*args, **kwargs) + def __reduce__(self): + return AttrDict, (self._data,) + def copy(self): + # Deep copy. 
+ copy = AttrDict(self._data) + for key, val in tuple(copy.items()): + if isinstance(val, AttrDict): + copy[key] = val.copy() + return copy + +for method, _ in inspect.getmembers(dict, inspect.ismethoddescriptor): + if method not in vars(AttrDict) and method not in {'__getattribute__', '__reduce_ex__'}: + setattr(AttrDict, method, partialmethod(AttrDict.__proxy__, method)) + + +### Namespace filtering +import re +from dataclasses import InitVar, dataclass, field, fields +from collections import abc, namedtuple +from enum import Enum +from functools import partialmethod +from itertools import filterfalse +from re import Pattern +from typing import Callable, Iterable, Set, Tuple, Union + +RuleType = Enum('RuleType', 'EXCLUDE INCLUDE', module=__name__) +NamedObj = namedtuple('NamedObj', 'name value', module=__name__) + +Filter = Union[str, Pattern, int, type, Callable] +Rule = Tuple[RuleType, Union[Filter, Iterable[Filter]]] + +def isiterable(arg): + return isinstance(arg, abc.Iterable) and not isinstance(arg, (str, bytes)) + +@dataclass +class ExcludeFilters: + ids: Set[int] = field(default_factory=set) + names: Set[str] = field(default_factory=set) + regex: Set[Pattern] = field(default_factory=set) + types: Set[type] = field(default_factory=set) + funcs: Set[Callable] = field(default_factory=set) + + @property + def filter_sets(self): + return tuple(field.name for field in fields(self)) + def __bool__(self): + return any(getattr(self, filter_set) for filter_set in self.filter_sets) + def _check(self, filter): + if isinstance(filter, str): + if filter.isidentifier(): + field = 'names' + else: + filter, field = re.compile(filter), 'regex' + elif isinstance(filter, Pattern): + field = 'regex' + elif isinstance(filter, int): + field = 'ids' + elif isinstance(filter, type): + field = 'types' + elif callable(filter): + field = 'funcs' + else: + raise ValueError("invalid filter: %r" % filter) + return filter, getattr(self, field) + def add(self, filter): + filter, 
filter_set = self._check(filter) + filter_set.add(filter) + def discard(self, filter): + filter, filter_set = self._check(filter) + filter_set.discard(filter) + def remove(self, filter): + filter, filter_set = self._check(filter) + filter_set.remove(filter) + def update(self, filters): + for filter in filters: + self.add(filter) + def clear(self): + for filter_set in self.filter_sets: + getattr(self, filter_set).clear() + def add_type(self, type_name): + import types + name_suffix = type_name + 'Type' if not type_name.endswith('Type') else type_name + if hasattr(types, name_suffix): + type_name = name_suffix + type_obj = getattr(types, type_name, None) + if not isinstance(type_obj, type): + named = type_name if type_name == name_suffix else "%r or %r" % (type_name, name_suffix) + raise NameError("could not find a type named %s in module 'types'" % named) + self.types.add(type_obj) + +@dataclass +class ExcludeRules: + exclude: ExcludeFilters = field(init=False, default_factory=ExcludeFilters) + include: ExcludeFilters = field(init=False, default_factory=ExcludeFilters) + rules: InitVar[Iterable[Rule]] = None + + def __post_init__(self, rules): + if rules is not None: + self.update(rules) + + def __proxy__(self, method, filter, *, rule_type=RuleType.EXCLUDE): + if rule_type is RuleType.EXCLUDE: + getattr(self.exclude, method)(filter) + elif rule_type is RuleType.INCLUDE: + getattr(self.include, method)(filter) + else: + raise ValueError("invalid rule type: %r (must be one of %r)" % (rule_type, list(RuleType))) + + add = partialmethod(__proxy__, 'add') + discard = partialmethod(__proxy__, 'discard') + remove = partialmethod(__proxy__, 'remove') + + def update(self, rules): + if isinstance(rules, ExcludeRules): + for filter_set in self.exclude.filter_sets: + getattr(self.exclude, filter_set).update(getattr(rules.exclude, filter_set)) + getattr(self.include, filter_set).update(getattr(rules.include, filter_set)) + else: + # Validate rules. 
+ for rule in rules: + if not isinstance(rule, tuple) or len(rule) != 2: + raise ValueError("invalid rule format: %r" % rule) + for rule_type, filter in rules: + if isiterable(filter): + for f in filter: + self.add(f, rule_type=rule_type) + else: + self.add(filter, rule_type=rule_type) + + def clear(self): + self.exclude.clear() + self.include.clear() + + def filter_namespace(self, namespace, obj=None): + if not self.exclude and not self.include: + return namespace + + # Protect agains dict changes during the call. + namespace_copy = namespace.copy() if obj is None or namespace is vars(obj) else namespace + objects = all_objects = [NamedObj._make(item) for item in namespace_copy.items()] + + for filters in (self.exclude, self.include): + if filters is self.exclude and not filters: + # Treat the rule set as an allowlist. + exclude_objs = objects + continue + elif filters is self.include: + if not filters or not exclude_objs: + break + objects = exclude_objs + + flist = [] + types_list = tuple(filters.types) + # Apply cheaper/broader filters first. 
+ if types_list: + flist.append(lambda obj: isinstance(obj.value, types_list)) + if filters.ids: + flist.append(lambda obj: id(obj.value) in filters.ids) + if filters.names: + flist.append(lambda obj: obj.name in filters.names) + if filters.regex: + flist.append(lambda obj: any(regex.fullmatch(obj.name) for regex in filters.regex)) + flist.extend(filters.funcs) + for f in flist: + objects = filterfalse(f, objects) + + if filters is self.exclude: + include_names = {obj.name for obj in objects} + exclude_objs = [obj for obj in all_objects if obj.name not in include_names] + else: + exclude_objs = list(objects) + + if not exclude_objs: + return namespace + if len(exclude_objs) == len(namespace): + warnings.warn("filtering operation left the namespace empty!", PicklingWarning) + return {} + if logger.isEnabledFor(logging.INFO): + exclude_listing = {obj.name: type(obj.value).__name__ for obj in sorted(exclude_objs)} + exclude_listing = str(exclude_listing).translate({ord(","): "\n", ord("'"): None}) + logger.info("Objects excluded from dump_session():\n%s\n", exclude_listing) + + for obj in exclude_objs: + del namespace_copy[obj.name] + return namespace_copy diff --git a/dill/session.py b/dill/session.py index 35485009..6e561e9e 100644 --- a/dill/session.py +++ b/dill/session.py @@ -10,12 +10,22 @@ Pickle and restore the intepreter session. """ -__all__ = ['dump_session', 'load_session'] +__all__ = ['dump_session', 'load_session', 'ipython_filter', 'ExcludeRules', 'EXCLUDE', 'INCLUDE'] -import logging, sys +import logging, re, sys +from copy import copy from dill import _dill, Pickler, Unpickler -from ._dill import ModuleType, _import_module, _is_builtin_module, _main_module, PY3 +from ._dill import ModuleType, _import_module, _is_builtin_module, _main_module +from ._utils import AttrDict, ExcludeRules, Filter, RuleType +from .settings import settings + +# Classes and abstract classes for type hints. 
+from io import BytesIO +from os import PathLike +from typing import Iterable, NoReturn, Union + +EXCLUDE, INCLUDE = RuleType.EXCLUDE, RuleType.INCLUDE SESSION_IMPORTED_AS_TYPES = tuple([Exception] + [getattr(_dill, name) for name in ('ModuleType', 'TypeType', 'FunctionType', 'MethodType', 'BuiltinMethodType')]) @@ -24,11 +34,9 @@ def _module_map(): """get map of imported modules""" - from collections import defaultdict, namedtuple - modmap = namedtuple('Modmap', ['by_name', 'by_id', 'top_level']) - modmap = modmap(defaultdict(list), defaultdict(list), {}) - items = 'items' if PY3 else 'iteritems' - for modname, module in getattr(sys.modules, items)(): + from collections import defaultdict + modmap = AttrDict(by_name=defaultdict(list), by_id=defaultdict(list), top_level={}) + for modname, module in sys.modules.items(): if not isinstance(module, ModuleType): continue if '.' not in modname: @@ -57,8 +65,7 @@ def _stash_modules(main_module): imported_as = [] imported_top_level = [] # keep separeted for backwards compatibility original = {} - items = 'items' if PY3 else 'iteritems' - for name, obj in getattr(main_module.__dict__, items)(): + for name, obj in vars(main_module).items(): if obj is main_module: original[name] = newmod # self-reference continue @@ -101,36 +108,64 @@ def _restore_modules(unpickler, main_module): except KeyError: pass -#NOTE: 06/03/15 renamed main_module to main -def dump_session(filename='/tmp/session.pkl', main=None, byref=False, **kwds): +def _filter_objects(main, exclude_extra, include_extra, obj=None): + filters = ExcludeRules(getattr(settings, 'session_exclude', None)) + if exclude_extra is not None: + filters.update([(EXCLUDE, exclude_extra)]) + if include_extra is not None: + filters.update([(INCLUDE, include_extra)]) + + namespace = filters.filter_namespace(vars(main), obj=obj) + if namespace is vars(main): + return main + + main = ModuleType(main.__name__) + vars(main).update(namespace) + return main + +def dump_session(filename: 
Union[PathLike, BytesIO] = '/tmp/session.pkl', + main: Union[str, ModuleType] = '__main__', + byref: bool = False, + exclude: Union[Filter, Iterable[Filter]] = None, + include: Union[Filter, Iterable[Filter]] = None, + **kwds) -> NoReturn: """pickle the current state of __main__ to a file""" - from .settings import settings - protocol = settings['protocol'] - if main is None: main = _main_module + protocol = settings.protocol + if isinstance(main, str): + main = _import_module(main) + original_main = main + if byref: + #NOTE: *must* run before _filter_objects() + main = _stash_modules(main) + main = _filter_objects(main, exclude, include, obj=original_main) + + print(list(vars(main))) + if hasattr(filename, 'write'): f = filename else: f = open(filename, 'wb') try: pickler = Pickler(f, protocol, **kwds) - pickler._original_main = main - if byref: - main = _stash_modules(main) pickler._main = main #FIXME: dill.settings are disabled pickler._byref = False # disable pickling by name reference pickler._recurse = False # disable pickling recursion for globals pickler._session = True # is best indicator of when pickling a session pickler._first_pass = True - pickler._main_modified = main is not pickler._original_main + if main is not original_main: + pickler._original_main = original_main pickler.dump(main) finally: if f is not filename: # If newly opened file f.close() return -def load_session(filename='/tmp/session.pkl', main=None, **kwds): +def load_session(filename: Union[PathLike, BytesIO] = '/tmp/session.pkl', + main: ModuleType = None, + **kwds) -> NoReturn: """update the __main__ module with the state from the session file""" - if main is None: main = _main_module + if main is None: + main = _main_module if hasattr(filename, 'read'): f = filename else: @@ -147,3 +182,43 @@ def load_session(filename='/tmp/session.pkl', main=None, **kwds): if f is not filename: # If newly opened file f.close() return + +############# +# IPython # +############# + +def 
ipython_filter(*, keep_input=True, keep_output=False): + """filter factory for IPython sessions (can't be added to settings currently) + + Usage: + >>> from dill.session import * + >>> dump_session(exclude=[ipython_filter()]) + """ + if not __builtins__.get('__IPYTHON__'): + # Return no-op filter if not in IPython. + return (lambda x: False) + + from IPython import get_ipython + ipython_shell = get_ipython() + + # Code snippet adapted from IPython.core.magics.namespace.who_ls() + user_ns = ipython_shell.user_ns + user_ns_hidden = ipython_shell.user_ns_hidden + nonmatching = object() # This can never be in user_ns + interactive_vars = {x for x in user_ns if user_ns[x] is not user_ns_hidden.get(x, nonmatching)} + + # Input and output history. + history_regex = [] + if keep_input: + interactive_vars |= {'_ih', 'In', '_i', '_ii', '_iii'} + history_regex.append(re.compile(r'_i\d+')) + if keep_output: + interactive_vars |= {'_oh', 'Out', '_', '__', '___'} + history_regex.append(re.compile(r'_\d+')) + + def not_interactive_var(obj): + if any(regex.fullmatch(obj.name) for regex in history_regex): + return False + return obj.name not in interactive_vars + + return not_interactive_var diff --git a/dill/settings.py b/dill/settings.py index 4d0226b0..9e3c06c9 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -9,12 +9,15 @@ global settings for Pickler """ +__all__ = ['settings', 'Settings'] + try: from pickle import DEFAULT_PROTOCOL except ImportError: from pickle import HIGHEST_PROTOCOL as DEFAULT_PROTOCOL +from ._utils import AttrDict as Settings, ExcludeRules -settings = { +settings = Settings({ #'main' : None, 'protocol' : DEFAULT_PROTOCOL, 'byref' : False, @@ -22,7 +25,8 @@ 'fmode' : 0, #HANDLE_FMODE 'recurse' : False, 'ignore' : False, -} + 'session_exclude': ExcludeRules(), +}) del DEFAULT_PROTOCOL From 3c87ae9325664e0278bfe2bd389254f18688e9f3 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Fri, 10 Jun 2022 12:49:49 -0300 Subject: [PATCH 003/109] Simplify 
AttrDict and create a separated Settings class --- dill/_utils.py | 34 ++++------------------------------ dill/session.py | 3 --- dill/settings.py | 29 ++++++++++++++++++++++++++++- 3 files changed, 32 insertions(+), 34 deletions(-) diff --git a/dill/_utils.py b/dill/_utils.py index 1ce67acd..cf4dd193 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -11,18 +11,8 @@ import logging logger = logging.getLogger('dill._utils') -import inspect -from functools import partialmethod - class AttrDict(dict): """syntactic sugar for accessing dictionary items""" - _CAST = object() # singleton - def __init__(self, *args, **kwargs): - data = args[0] if len(args) == 2 and args[1] is self._CAST else dict(*args, **kwargs) - for key, val in tuple(data.items()): - if isinstance(val, dict) and not isinstance(val, AttrDict): - data[key] = AttrDict(val, self._CAST) - super().__setattr__('_data', data) def _check_attr(self, name): try: super().__getattribute__(name) @@ -33,33 +23,17 @@ def _check_attr(self, name): def __getattr__(self, key): # This is called only if dict.__getattribute__(key) fails. try: - return self._data[key] + return self[key] except KeyError: raise AttributeError("'AttrDict' object has no attribute %r" % key) def __setattr__(self, key, value): self._check_attr(key) - if isinstance(value, dict): - self._data[key] = AttrDict(value, self._CAST) - else: - self._data[key] = value + self[key] = value def __delattr__(self, key): self._check_attr(key) - del self._data[key] - def __proxy__(self, method, *args, **kwargs): - return getattr(self._data, method)(*args, **kwargs) + del self[key] def __reduce__(self): - return AttrDict, (self._data,) - def copy(self): - # Deep copy. 
- copy = AttrDict(self._data) - for key, val in tuple(copy.items()): - if isinstance(val, AttrDict): - copy[key] = val.copy() - return copy - -for method, _ in inspect.getmembers(dict, inspect.ismethoddescriptor): - if method not in vars(AttrDict) and method not in {'__getattribute__', '__reduce_ex__'}: - setattr(AttrDict, method, partialmethod(AttrDict.__proxy__, method)) + return type(self), (dict(self),) ### Namespace filtering diff --git a/dill/session.py b/dill/session.py index 6e561e9e..68f7be1f 100644 --- a/dill/session.py +++ b/dill/session.py @@ -138,9 +138,6 @@ def dump_session(filename: Union[PathLike, BytesIO] = '/tmp/session.pkl', #NOTE: *must* run before _filter_objects() main = _stash_modules(main) main = _filter_objects(main, exclude, include, obj=original_main) - - print(list(vars(main))) - if hasattr(filename, 'write'): f = filename else: diff --git a/dill/settings.py b/dill/settings.py index 9e3c06c9..dbd8d243 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -15,7 +15,34 @@ from pickle import DEFAULT_PROTOCOL except ImportError: from pickle import HIGHEST_PROTOCOL as DEFAULT_PROTOCOL -from ._utils import AttrDict as Settings, ExcludeRules +from collections.abc import MutableMapping +from ._utils import AttrDict, ExcludeRules + +class Settings(AttrDict): + """allow multiple level attribute access""" + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + for key, value in tuple(self.items()): + if isinstance(value, MutableMapping): + self[key] = Settings(value) + @staticmethod + def _cast_dict(obj): + return Settings(obj) if isinstance(obj, MutableMapping) else obj + def __setitem__(self, key, value): + super().__setitem__(key, self._cast_dict(value)) + def setdefault(self, key, default=None): + super().setdefault(key, self._cast_dict(default)) + def update(self, *args, **kwargs): + super().update(Settings(*args, **kwargs)) + def __setattr__(self, key, value): + super().__setattr__(key, _cast_dict(value)) + def 
copy(self): + # Deep copy. + copy = Settings(self) + for key, value in self.items(): + if isinstance(value, Settings): + copy[key] = value.copy() + return copy settings = Settings({ #'main' : None, From 8341f1967755e7a9040aee6a9c26bb9ccce6ff7f Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sun, 12 Jun 2022 21:23:30 -0300 Subject: [PATCH 004/109] use typing.Pattern instead of re.Pattern for annotations --- dill/_utils.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/dill/_utils.py b/dill/_utils.py index cf4dd193..0d1a00b5 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -43,13 +43,12 @@ def __reduce__(self): from enum import Enum from functools import partialmethod from itertools import filterfalse -from re import Pattern -from typing import Callable, Iterable, Set, Tuple, Union +from typing import Callable, Iterable, Pattern, Set, Tuple, Union RuleType = Enum('RuleType', 'EXCLUDE INCLUDE', module=__name__) NamedObj = namedtuple('NamedObj', 'name value', module=__name__) -Filter = Union[str, Pattern, int, type, Callable] +Filter = Union[str, Pattern[str], int, type, Callable] Rule = Tuple[RuleType, Union[Filter, Iterable[Filter]]] def isiterable(arg): @@ -59,7 +58,7 @@ def isiterable(arg): class ExcludeFilters: ids: Set[int] = field(default_factory=set) names: Set[str] = field(default_factory=set) - regex: Set[Pattern] = field(default_factory=set) + regex: Set[Pattern[str]] = field(default_factory=set) types: Set[type] = field(default_factory=set) funcs: Set[Callable] = field(default_factory=set) @@ -74,7 +73,7 @@ def _check(self, filter): field = 'names' else: filter, field = re.compile(filter), 'regex' - elif isinstance(filter, Pattern): + elif isinstance(filter, re.Pattern): field = 'regex' elif isinstance(filter, int): field = 'ids' From de5c46701bf3f0ecda07dced5e6d79b822c98ece Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 23 Jun 2022 23:22:12 -0300 Subject: [PATCH 005/109] split module imports into separated lines 
--- dill/session.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dill/session.py b/dill/session.py index 68f7be1f..80ca47bb 100644 --- a/dill/session.py +++ b/dill/session.py @@ -12,7 +12,8 @@ __all__ = ['dump_session', 'load_session', 'ipython_filter', 'ExcludeRules', 'EXCLUDE', 'INCLUDE'] -import logging, re, sys +import re +import sys from copy import copy from dill import _dill, Pickler, Unpickler @@ -30,8 +31,6 @@ SESSION_IMPORTED_AS_TYPES = tuple([Exception] + [getattr(_dill, name) for name in ('ModuleType', 'TypeType', 'FunctionType', 'MethodType', 'BuiltinMethodType')]) -log = logging.getLogger('dill') - def _module_map(): """get map of imported modules""" from collections import defaultdict From 6f479e6f20b25840984a41465c2ca512577fb89d Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Wed, 6 Jul 2022 12:30:13 -0300 Subject: [PATCH 006/109] session: size filter --- dill/session.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/dill/session.py b/dill/session.py index 80ca47bb..c8e33a60 100644 --- a/dill/session.py +++ b/dill/session.py @@ -12,9 +12,10 @@ __all__ = ['dump_session', 'load_session', 'ipython_filter', 'ExcludeRules', 'EXCLUDE', 'INCLUDE'] +import random import re import sys -from copy import copy +from statistics import mean from dill import _dill, Pickler, Unpickler from ._dill import ModuleType, _import_module, _is_builtin_module, _main_module @@ -179,6 +180,74 @@ def load_session(filename: Union[PathLike, BytesIO] = '/tmp/session.pkl', f.close() return +import collections +import collections.abc +from sys import getsizeof + +# Cover "true" collections from 'builtins', 'collections' and 'collections.abc'. 
+COLLECTION_TYPES = ( + list, + tuple, + collections.deque, + collections.UserList, + collections.abc.Mapping, + collections.abc.Set, +) + +def _estimate_size(obj, memo): + obj_id = id(obj) + if obj_id in memo: + return 0 + memo.add(obj_id) + size = 0 + try: + if isinstance(obj, ModuleType) and _is_builtin_module(obj): + return 0 + size += getsizeof(obj) + if hasattr(obj, '__dict__'): + size += sum(_estimate_size(k, memo) + _estimate_size(v, memo) for k, v in obj.__dict__.items()) + if (isinstance(obj, str) # common case shortcut + or not isinstance(obj, collections.abc.Collection) # general, single test + or not isinstance(obj, COLLECTION_TYPES) # specific, multiple tests + ): + return size + if isinstance(obj, collections.ChainMap): # collections.Mapping subtype + size += sum(_estimate_size(mapping, memo) for mapping in obj.maps) + elif len(obj) < 1000: + if isinstance(obj, collections.abc.Mapping): + size += sum(_estimate_size(k, memo) + _estimate_size(v, memo) for k, v in obj.items()) + else: + size += sum(_estimate_size(item, memo) for item in obj) + else: + # Use random sample for large collections. 
+ sample = set(random.sample(range(len(obj)), k=100)) + if isinstance(obj, collections.abc.Mapping): + samples_size = (_estimate_size(k, memo) + _estimate_size(v, memo) + for i, (k, v) in enumerate(obj.items()) if i in sample) + else: + samples_size = (_estimate_size(item, memo) for i, item in enumerate(obj) if i in sample) + size += len(obj) * mean(filter(None, samples_size)) + except Exception: + pass + return size + +def size_filter(limit): + match = re.fullmatch(r'(\d+)\s*(B|[KMGT]i?B?)', limit, re.IGNORECASE) + if not match: + raise ValueError("invalid 'limit' value: %r" % limit) + coeff, unit = match.groups() + coeff, unit = int(coeff), unit.lower() + if unit == 'b': + limit = coeff + else: + base = 1024 if unit[1:2] == 'i' else 1000 + exponent = 'kmgt'.index(unit[0]) + 1 + limit = coeff * base**exponent + def exclude_large(obj): + return _estimate_size(obj.value, memo=set()) < limit + return exclude_large + + ############# # IPython # ############# From 2a85d7bd3fcc5eb64a1590f31bd06697435963d7 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Wed, 6 Jul 2022 12:37:24 -0300 Subject: [PATCH 007/109] size filter: option to recurse in size estimate --- dill/session.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/dill/session.py b/dill/session.py index c8e33a60..1f41d849 100644 --- a/dill/session.py +++ b/dill/session.py @@ -194,7 +194,15 @@ def load_session(filename: Union[PathLike, BytesIO] = '/tmp/session.pkl', collections.abc.Set, ) -def _estimate_size(obj, memo): +def _estimate_size(obj, recursive=True): + if recursive: + return _estimate_size_recursively(obj, memo=set()) + try: + return getsizeof(obj) + except Exception: + return 0 + +def _estimate_size_recursively(obj, memo): obj_id = id(obj) if obj_id in memo: return 0 @@ -231,7 +239,7 @@ def _estimate_size(obj, memo): pass return size -def size_filter(limit): +def size_filter(limit, recursive=True): match = re.fullmatch(r'(\d+)\s*(B|[KMGT]i?B?)', limit, 
re.IGNORECASE) if not match: raise ValueError("invalid 'limit' value: %r" % limit) @@ -244,7 +252,7 @@ def size_filter(limit): exponent = 'kmgt'.index(unit[0]) + 1 limit = coeff * base**exponent def exclude_large(obj): - return _estimate_size(obj.value, memo=set()) < limit + return _estimate_size(obj.value, recursive) < limit return exclude_large From 61bf52044a677867900e3d820d1e15a06ec21ad6 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Wed, 6 Jul 2022 12:42:02 -0300 Subject: [PATCH 008/109] use __dict__ --- dill/session.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dill/session.py b/dill/session.py index 1f41d849..633c6683 100644 --- a/dill/session.py +++ b/dill/session.py @@ -65,7 +65,7 @@ def _stash_modules(main_module): imported_as = [] imported_top_level = [] # keep separeted for backwards compatibility original = {} - for name, obj in vars(main_module).items(): + for name, obj in main_module.__dict__.items(): if obj is main_module: original[name] = newmod # self-reference continue @@ -115,12 +115,12 @@ def _filter_objects(main, exclude_extra, include_extra, obj=None): if include_extra is not None: filters.update([(INCLUDE, include_extra)]) - namespace = filters.filter_namespace(vars(main), obj=obj) - if namespace is vars(main): + namespace = filters.filter_namespace(main.__dict__, obj=obj) + if namespace is main.__dict__: return main main = ModuleType(main.__name__) - vars(main).update(namespace) + main.__dict__.update(namespace) return main def dump_session(filename: Union[PathLike, BytesIO] = '/tmp/session.pkl', From 6bf135cea5581cda085551cea73fd16c209ae070 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 9 Jul 2022 20:46:48 -0300 Subject: [PATCH 009/109] mostly naminng changes --- dill/__init__.py | 2 +- dill/_utils.py | 24 +++++++++++++++--------- dill/session.py | 28 ++++++++++++++++------------ dill/settings.py | 34 ++-------------------------------- 4 files changed, 34 insertions(+), 54 deletions(-) diff --git 
a/dill/__init__.py b/dill/__init__.py index 12ad3cec..a59152c8 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -293,7 +293,7 @@ from . import detect, session, source, temp # get global settings -from .settings import Settings, settings +from .settings import settings # make sure "trace" is turned off detect.trace(False) diff --git a/dill/_utils.py b/dill/_utils.py index 0d1a00b5..2b2db13b 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -51,11 +51,16 @@ def __reduce__(self): Filter = Union[str, Pattern[str], int, type, Callable] Rule = Tuple[RuleType, Union[Filter, Iterable[Filter]]] -def isiterable(arg): - return isinstance(arg, abc.Iterable) and not isinstance(arg, (str, bytes)) +def _iter(filters): + if isinstance(filters, str): + return None + try: + return iter(filters) + except TypeError: + return None @dataclass -class ExcludeFilters: +class FilterSet: ids: Set[int] = field(default_factory=set) names: Set[str] = field(default_factory=set) regex: Set[Pattern[str]] = field(default_factory=set) @@ -112,8 +117,8 @@ def add_type(self, type_name): @dataclass class ExcludeRules: - exclude: ExcludeFilters = field(init=False, default_factory=ExcludeFilters) - include: ExcludeFilters = field(init=False, default_factory=ExcludeFilters) + exclude: FilterSet = field(init=False, default_factory=FilterSet) + include: FilterSet = field(init=False, default_factory=FilterSet) rules: InitVar[Iterable[Rule]] = None def __post_init__(self, rules): @@ -138,13 +143,14 @@ def update(self, rules): getattr(self.exclude, filter_set).update(getattr(rules.exclude, filter_set)) getattr(self.include, filter_set).update(getattr(rules.include, filter_set)) else: - # Validate rules. for rule in rules: + # Validate rules. 
if not isinstance(rule, tuple) or len(rule) != 2: raise ValueError("invalid rule format: %r" % rule) for rule_type, filter in rules: - if isiterable(filter): - for f in filter: + filters = _iter(filter) + if filters is not None: + for f in filters: self.add(f, rule_type=rule_type) else: self.add(filter, rule_type=rule_type) @@ -158,7 +164,7 @@ def filter_namespace(self, namespace, obj=None): return namespace # Protect agains dict changes during the call. - namespace_copy = namespace.copy() if obj is None or namespace is vars(obj) else namespace + namespace_copy = namespace.copy() if obj is None or namespace is obj.__dict__ else namespace objects = all_objects = [NamedObj._make(item) for item in namespace_copy.items()] for filters in (self.exclude, self.include): diff --git a/dill/session.py b/dill/session.py index 633c6683..6354c6d7 100644 --- a/dill/session.py +++ b/dill/session.py @@ -10,16 +10,20 @@ Pickle and restore the intepreter session. """ -__all__ = ['dump_session', 'load_session', 'ipython_filter', 'ExcludeRules', 'EXCLUDE', 'INCLUDE'] +__all__ = [ + 'dump_session', 'load_session', 'ipython_filter', 'size_filter', + 'FilterRules', 'EXCLUDE', 'INCLUDE', +] import random import re import sys from statistics import mean +from types import SimpleNamespace from dill import _dill, Pickler, Unpickler from ._dill import ModuleType, _import_module, _is_builtin_module, _main_module -from ._utils import AttrDict, ExcludeRules, Filter, RuleType +from ._utils import FilterRules, Filter, RuleType from .settings import settings # Classes and abstract classes for type hints. 
@@ -35,7 +39,7 @@ def _module_map(): """get map of imported modules""" from collections import defaultdict - modmap = AttrDict(by_name=defaultdict(list), by_id=defaultdict(list), top_level={}) + modmap = SimpleNamespace(by_name=defaultdict(list), by_id=defaultdict(list), top_level={}) for modname, module in sys.modules.items(): if not isinstance(module, ModuleType): continue @@ -108,14 +112,14 @@ def _restore_modules(unpickler, main_module): except KeyError: pass -def _filter_objects(main, exclude_extra, include_extra, obj=None): - filters = ExcludeRules(getattr(settings, 'session_exclude', None)) - if exclude_extra is not None: - filters.update([(EXCLUDE, exclude_extra)]) - if include_extra is not None: - filters.update([(INCLUDE, include_extra)]) +def _filter_objects(main, exclude, include, obj=None): + rules = FilterRules(getattr(settings, 'dump_module', None)) + if exclude is not None: + rules.update([(EXCLUDE, exclude)]) + if include is not None: + rules.update([(INCLUDE, include)]) - namespace = filters.filter_namespace(main.__dict__, obj=obj) + namespace = rules.filter_namespace(main.__dict__, obj=obj) if namespace is main.__dict__: return main @@ -125,7 +129,7 @@ def _filter_objects(main, exclude_extra, include_extra, obj=None): def dump_session(filename: Union[PathLike, BytesIO] = '/tmp/session.pkl', main: Union[str, ModuleType] = '__main__', - byref: bool = False, + refimported: bool = False, exclude: Union[Filter, Iterable[Filter]] = None, include: Union[Filter, Iterable[Filter]] = None, **kwds) -> NoReturn: @@ -134,7 +138,7 @@ def dump_session(filename: Union[PathLike, BytesIO] = '/tmp/session.pkl', if isinstance(main, str): main = _import_module(main) original_main = main - if byref: + if refimported: #NOTE: *must* run before _filter_objects() main = _stash_modules(main) main = _filter_objects(main, exclude, include, obj=original_main) diff --git a/dill/settings.py b/dill/settings.py index dbd8d243..6fdc73c0 100644 --- a/dill/settings.py +++ 
b/dill/settings.py @@ -15,36 +15,7 @@ from pickle import DEFAULT_PROTOCOL except ImportError: from pickle import HIGHEST_PROTOCOL as DEFAULT_PROTOCOL -from collections.abc import MutableMapping -from ._utils import AttrDict, ExcludeRules - -class Settings(AttrDict): - """allow multiple level attribute access""" - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - for key, value in tuple(self.items()): - if isinstance(value, MutableMapping): - self[key] = Settings(value) - @staticmethod - def _cast_dict(obj): - return Settings(obj) if isinstance(obj, MutableMapping) else obj - def __setitem__(self, key, value): - super().__setitem__(key, self._cast_dict(value)) - def setdefault(self, key, default=None): - super().setdefault(key, self._cast_dict(default)) - def update(self, *args, **kwargs): - super().update(Settings(*args, **kwargs)) - def __setattr__(self, key, value): - super().__setattr__(key, _cast_dict(value)) - def copy(self): - # Deep copy. - copy = Settings(self) - for key, value in self.items(): - if isinstance(value, Settings): - copy[key] = value.copy() - return copy - -settings = Settings({ +settings = { #'main' : None, 'protocol' : DEFAULT_PROTOCOL, 'byref' : False, @@ -52,8 +23,7 @@ def copy(self): 'fmode' : 0, #HANDLE_FMODE 'recurse' : False, 'ignore' : False, - 'session_exclude': ExcludeRules(), -}) +} del DEFAULT_PROTOCOL From 656ae30ca9eff587cf45cc2b218621bb5fa01ff1 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 12 Jul 2022 18:25:12 -0300 Subject: [PATCH 010/109] developmental changes --- dill/_utils.py | 206 ++++++++++++++++++++++++++++------------------- dill/session.py | 71 +++++++++------- dill/settings.py | 61 ++++++++++++++ 3 files changed, 222 insertions(+), 116 deletions(-) diff --git a/dill/_utils.py b/dill/_utils.py index 2b2db13b..f6338d31 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -6,51 +6,29 @@ # - https://github.com/uqfoundation/dill/blob/master/LICENSE """auxiliary internal classes used in 
multiple submodules, set here to avoid import recursion""" -__all__ = ['AttrDict', 'ExcludeRules', 'Filter', 'RuleType'] +from __future__ import annotations + +__all__ = ['FilterRules', 'Filter', 'RuleType'] import logging logger = logging.getLogger('dill._utils') -class AttrDict(dict): - """syntactic sugar for accessing dictionary items""" - def _check_attr(self, name): - try: - super().__getattribute__(name) - except AttributeError: - pass - else: - raise AttributeError("'AttrDict' object attribute %r is read-only" % name) - def __getattr__(self, key): - # This is called only if dict.__getattribute__(key) fails. - try: - return self[key] - except KeyError: - raise AttributeError("'AttrDict' object has no attribute %r" % key) - def __setattr__(self, key, value): - self._check_attr(key) - self[key] = value - def __delattr__(self, key): - self._check_attr(key) - del self[key] - def __reduce__(self): - return type(self), (dict(self),) - - -### Namespace filtering import re -from dataclasses import InitVar, dataclass, field, fields -from collections import abc, namedtuple +from dataclasses import dataclass, field, fields +from collections import namedtuple +from collections.abc import MutableSet from enum import Enum from functools import partialmethod -from itertools import filterfalse -from typing import Callable, Iterable, Pattern, Set, Tuple, Union - -RuleType = Enum('RuleType', 'EXCLUDE INCLUDE', module=__name__) -NamedObj = namedtuple('NamedObj', 'name value', module=__name__) +from itertools import chain, filterfalse +from types import ModuleType +from typing import Any, Callable, Dict, Iterable, Pattern, Set, Tuple, Union Filter = Union[str, Pattern[str], int, type, Callable] +RuleType = Enum('RuleType', 'EXCLUDE INCLUDE', module=__name__) Rule = Tuple[RuleType, Union[Filter, Iterable[Filter]]] +NamedObj = namedtuple('NamedObj', 'name value', module=__name__) + def _iter(filters): if isinstance(filters, str): return None @@ -60,26 +38,22 @@ def _iter(filters): 
return None @dataclass -class FilterSet: +class FilterSet(MutableSet): ids: Set[int] = field(default_factory=set) names: Set[str] = field(default_factory=set) - regex: Set[Pattern[str]] = field(default_factory=set) + regexes: Set[Pattern[str]] = field(default_factory=set) types: Set[type] = field(default_factory=set) funcs: Set[Callable] = field(default_factory=set) - - @property - def filter_sets(self): - return tuple(field.name for field in fields(self)) - def __bool__(self): - return any(getattr(self, filter_set) for filter_set in self.filter_sets) - def _check(self, filter): + _fields = None + _rtypemap = None + def _match_type(self, filter): if isinstance(filter, str): if filter.isidentifier(): field = 'names' else: - filter, field = re.compile(filter), 'regex' + filter, field = re.compile(filter), 'regexes' elif isinstance(filter, re.Pattern): - field = 'regex' + field = 'regexes' elif isinstance(filter, int): field = 'ids' elif isinstance(filter, type): @@ -89,42 +63,105 @@ def _check(self, filter): else: raise ValueError("invalid filter: %r" % filter) return filter, getattr(self, field) + # Mandatory MutableSet methods. + @classmethod + def _from_iterable(cls, it): + obj = cls() + obj |= it + return obj + def __contains__(self, filter): + filter, filter_set = self._match_type(filter) + return filter in filter_set + def __iter__(self): + return chain.from_iterable(gettatr(self, field) for field in self._fields) + def __len__(self): + return sum(len(gettatr(self, field)) for field in self._fields) def add(self, filter): - filter, filter_set = self._check(filter) + filter, filter_set = self._match_type(filter) filter_set.add(filter) def discard(self, filter): - filter, filter_set = self._check(filter) + filter, filter_set = self._match_type(filter) filter_set.discard(filter) + # Overwrite generic methods (optimization). 
def remove(self, filter): - filter, filter_set = self._check(filter) + filter, filter_set = self._match_type(filter) filter_set.remove(filter) - def update(self, filters): - for filter in filters: - self.add(filter) def clear(self): - for filter_set in self.filter_sets: - getattr(self, filter_set).clear() + for field in self._fields: + getattr(self, field).clear() + def __or__(self, other): + if not isinstance(other, Iterable): + return NotImplemented + obj = self.copy() + obj |= other + return obj + __ror__ = __or__ + def __ior__(self, filters): + if isinstance(filters, FilterSet): + for field in self._fields: + getattr(self, field) |= getattr(filters, field) + else: + for filter in filters: + self.add(filter) + return self + # Extra methods. + def update(self, filters): + self |= filters + def copy(self): + return FilterSet(*(getattr(self, field).copy() for field in self._fields)) + @classmethod + def get_type(cls, key): + if cls._rtypemap is None: + from ._dill import _reverse_typemap + cls._rtypemap = {(k[:-4] if k.endswith('Type') else k).lower(): v + for k, v in _reverse_typemap.items()} + if key.endswith('Type') + key = key[:-4] + return cls._rtypemap[key.lower()] def add_type(self, type_name): - import types - name_suffix = type_name + 'Type' if not type_name.endswith('Type') else type_name - if hasattr(types, name_suffix): - type_name = name_suffix - type_obj = getattr(types, type_name, None) - if not isinstance(type_obj, type): - named = type_name if type_name == name_suffix else "%r or %r" % (type_name, name_suffix) - raise NameError("could not find a type named %s in module 'types'" % named) - self.types.add(type_obj) - -@dataclass -class ExcludeRules: - exclude: FilterSet = field(init=False, default_factory=FilterSet) - include: FilterSet = field(init=False, default_factory=FilterSet) - rules: InitVar[Iterable[Rule]] = None - - def __post_init__(self, rules): + self.types.add(self.get_type(type_name)) +FilterSet._fields = tuple(field.name for field in 
fields(cls)) + +class _FilterSetDescriptor: + """descriptor for FilterSet members of FilterRules""" + def __set_name__(self, owner, name): + self.name = name + self._name = '_' + name + def __set__(self, obj, value): + # This is the important method. + if isinstance(value, FilterSet): + setattr(obj, self._name, value) + else: + setattr(obj, self._name, FilterSet(value)) + def __get__(self, obj, objtype=None): + try: + return getattr(obj, self._name) + except AttributeError: + raise AttributeError(self.name) from None + def __delete__(self, obj): + try: + delattr(obj, self._name) + except AttributeError: + raise AttributeError(self.name) from None + +class FilterRules: + __slots__ = '_exclude', '_include' + exclude = _FilterSetDescriptor() + include = _FilterSetDescriptor() + def __init__(self, rules: Union[Iterable[Rule], FilterRules] = None): + self._exclude = FilterSet() + self._include = FilterSet() if rules is not None: self.update(rules) - + def __repr__(self): + desc = [" 2 else " " + return sep.join(desc).replace("set()", "{}") + ">" + # Proxy add(), discard(), remove() and clear() to FilterSets. 
def __proxy__(self, method, filter, *, rule_type=RuleType.EXCLUDE): if rule_type is RuleType.EXCLUDE: getattr(self.exclude, method)(filter) @@ -132,16 +169,18 @@ def __proxy__(self, method, filter, *, rule_type=RuleType.EXCLUDE): getattr(self.include, method)(filter) else: raise ValueError("invalid rule type: %r (must be one of %r)" % (rule_type, list(RuleType))) - add = partialmethod(__proxy__, 'add') discard = partialmethod(__proxy__, 'discard') remove = partialmethod(__proxy__, 'remove') - - def update(self, rules): - if isinstance(rules, ExcludeRules): - for filter_set in self.exclude.filter_sets: - getattr(self.exclude, filter_set).update(getattr(rules.exclude, filter_set)) - getattr(self.include, filter_set).update(getattr(rules.include, filter_set)) + def clear(self): + self.exclude.clear() + self.include.clear() + def update(self, rules: Union[Iterable[Rule], FilterRules]): + """Update both FilterSets from a list of (RuleType, Filter) rules.""" + if isinstance(rules, FilterRules): + for field in FilterSet._fields: + getattr(self.exclude, field).update(getattr(rules.exclude, field)) + getattr(self.include, field).update(getattr(rules.include, field)) else: for rule in rules: # Validate rules. 
@@ -155,11 +194,8 @@ def update(self, rules): else: self.add(filter, rule_type=rule_type) - def clear(self): - self.exclude.clear() - self.include.clear() - - def filter_namespace(self, namespace, obj=None): + def filter_vars(self, namespace: Dict[str, Any], obj: ModuleType = None): + """Apply filters to dictionary with names as keys.""" if not self.exclude and not self.include: return namespace @@ -186,8 +222,8 @@ def filter_namespace(self, namespace, obj=None): flist.append(lambda obj: id(obj.value) in filters.ids) if filters.names: flist.append(lambda obj: obj.name in filters.names) - if filters.regex: - flist.append(lambda obj: any(regex.fullmatch(obj.name) for regex in filters.regex)) + if filters.regexes: + flist.append(lambda obj: any(regex.fullmatch(obj.name) for regex in filters.regexes)) flist.extend(filters.funcs) for f in flist: objects = filterfalse(f, objects) diff --git a/dill/session.py b/dill/session.py index e4e50846..50fd0ad7 100644 --- a/dill/session.py +++ b/dill/session.py @@ -124,14 +124,14 @@ def _restore_modules(unpickler, main_module): except KeyError: pass -def _filter_objects(main, exclude, include, obj=None): +def _filter_vars(main, exclude, include, obj=None): rules = FilterRules(getattr(settings, 'dump_module', None)) if exclude is not None: rules.update([(EXCLUDE, exclude)]) if include is not None: rules.update([(INCLUDE, include)]) - namespace = rules.filter_namespace(main.__dict__, obj=obj) + namespace = rules.filter_vars(main.__dict__, obj=obj) if namespace is main.__dict__: return main @@ -141,7 +141,7 @@ def _filter_objects(main, exclude, include, obj=None): def dump_module( filename = str(TEMPDIR/'session.pkl'), - main: Union[str, ModuleType] = '__main__', + module: Union[str, ModuleType] = '__main__', refimported: bool = False, exclude: Union[Filter, Iterable[Filter]] = None, include: Union[Filter, Iterable[Filter]] = None, @@ -158,7 +158,7 @@ def dump_module( Parameters: filename: a path-like object or a writable stream. 
- main: a module object or an importable module name. + module: a module object or an importable module name. refimported: if `True`, all imported objects in the module's namespace are saved by reference. *Note:* this is different from the ``byref`` option of other "dump" functions and is not affected by @@ -178,7 +178,7 @@ def dump_module( >>> import my_mod as m >>> m.var = 'new value' - >>> dill.dump_module('my_mod_session.pkl', main='my_mod') + >>> dill.dump_module('my_mod_session.pkl', module='my_mod') - Save the state of a non-importable, runtime-created module: @@ -186,32 +186,32 @@ def dump_module( >>> runtime = ModuleType('runtime') >>> runtime.food = ['bacon', 'eggs', 'spam'] >>> runtime.process_food = m.process_food - >>> dill.dump_module('runtime_session.pkl', main=runtime, refimported=True) + >>> dill.dump_module('runtime_session.pkl', module=runtime, refimported=True) *Changed in version 0.3.6:* the function ``dump_session()`` was renamed to ``dump_module()``. - *Changed in version 0.3.6:* the parameter ``byref`` was renamed to - ``refimported``. + *Changed in version 0.3.6:* the parameters ``main`` and ``byref`` were + renamed to ``module`` and ``refimported``, respectively. """ - if 'byref' in kwds: - warnings.warn( - "The parameter 'byref' was renamed to 'refimported', use this" - " instead. Note: the underlying dill.Pickler do accept a 'byref'" - " argument, but it has no effect on session saving.", - PendingDeprecationWarning - ) - if refimported: - raise ValueError("both 'refimported' and 'byref' arguments were used.") - refimported = kwds.pop('byref') from .settings import settings protocol = settings['protocol'] + for old_par, par in [('main', 'module'), ('byref', 'refimported')]: + if old_par in kwds: + message = "The parameter %r was renamed to %r, use this instead." % (old_par, par) + if old_par == 'byref': + message += " Note: the underlying dill.Pickler do accept a 'byref'" + " argument, but it has no effect on session saving." 
+ warnings.warn(message, PendingDeprecationWarning) + refimported = kwds.pop('byref', refimported) + module = kwds.pop('main', module) + main = module if isinstance(main, str): main = _import_module(main) original_main = main if refimported: main = _stash_modules(main) - main = _filter_objects(main, exclude, include, obj=original_main) + main = _filter_vars(main, exclude, include, obj=original_main) if hasattr(filename, 'write'): file = filename else: @@ -237,7 +237,7 @@ def dump_module( # Backward compatibility. def dump_session(filename=str(TEMPDIR/'session.pkl'), main=None, byref=False, **kwds): warnings.warn("dump_session() was renamed to dump_module()", PendingDeprecationWarning) - dump_module(filename, main, refimported=byref, **kwds) + dump_module(filename, module=main, refimported=byref, **kwds) dump_session.__doc__ = dump_module.__doc__ class _PeekableReader: @@ -301,7 +301,7 @@ def _identify_module(file, main=None): def load_module( filename = str(TEMPDIR/'session.pkl'), - main: Union[ModuleType, str] = None, + module: Union[ModuleType, str] = None, **kwds ) -> Optional[ModuleType]: """Update :py:mod:`__main__` or another module with the state from the @@ -318,7 +318,7 @@ def load_module( Parameters: filename: a path-like object or a readable stream. - main: an importable module name or a module object. + module: an importable module name or a module object. **kwds: extra keyword arguments passed to :py:class:`Unpickler()`. Raises: @@ -366,11 +366,20 @@ def load_module( *Changed in version 0.3.6:* the function ``load_session()`` was renamed to ``load_module()``. + *Changed in version 0.3.6:* the parameter ``main`` was renamed to + ``module``. + See also: :py:func:`load_module_asdict` to load the contents of a saved session (from :py:mod:`__main__` or any importable module) into a dictionary. 
""" - main_arg = main + if 'main' in kwds: + warnings.warn( + "The parameter 'main' was renamed to 'module', use this instead.", + PendingDeprecationWarning + ) + module = kwds.pop('main') + main = module if hasattr(filename, 'read'): file = filename else: @@ -426,7 +435,7 @@ def load_module( runtime_main = '__runtime__.%s' % main.__name__ sys.modules[runtime_main] = main - module = unpickler.load() + loaded = unpickler.load() finally: if not hasattr(filename, 'read'): # if newly opened file file.close() @@ -434,17 +443,17 @@ def load_module( del sys.modules[runtime_main] except (KeyError, NameError): pass - assert module is main - _restore_modules(unpickler, module) - if module is _main_module or module is main_arg: + assert loaded is main + _restore_modules(unpickler, main) + if main is _main_module or main is module: return None else: - return module + return main # Backward compatibility. def load_session(filename=str(TEMPDIR/'session.pkl'), main=None, **kwds): warnings.warn("load_session() was renamed to load_module().", PendingDeprecationWarning) - load_module(filename, main, **kwds) + load_module(filename, module=main, **kwds) load_session.__doc__ = load_module.__doc__ def load_module_asdict( @@ -500,8 +509,8 @@ def load_module_asdict( >>> new_var in main_vars # would be True if the option 'update' was set False """ - if 'main' in kwds: - raise TypeError("'main' is an invalid keyword argument for load_module_asdict()") + if 'module' in kwds: + raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") if hasattr(filename, 'read'): file = filename else: diff --git a/dill/settings.py b/dill/settings.py index babcdd2d..042a6578 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -9,9 +9,12 @@ global settings for Pickler """ +from __future__ import annotations + __all__ = ['settings'] from pickle import DEFAULT_PROTOCOL +from ._utils import FilterRules, FilterSet settings = { #'main' : None, @@ -25,3 +28,61 @@ del DEFAULT_PROTOCOL 
+class ModuleRules(FilterRules): + __slots__ = 'module', '_parent', '__dict__' + _fields = tuple(x.lstrip('_') for x in FilterRules.__slots__) + def __init__(self, + module: str, + parent: ModuleRules = None, + rules: Union[Iterable[Rule], FilterRules] = None + ): + super().__setattr__('module', module) + super().__setattr__('_parent', parent) + # Don't call super().__init__(). + if rules is not None: + super().__init__(rules) + def __repr__(self): + desc = "DEFAULT" if self.module == 'DEFAULT' else "for %r" % self.module + return "<ModuleRules %s %s>" % (desc, super().__repr__()) + def __setattr__(self, name, value): + if name in FilterRules.__slots__: + # Don't interfere with superclass attributes. + super().__setattr__(name, value) + elif name in self._fields: + if not any(hasattr(self, x) for x in FilterRules.__slots__): + # Initialize other. This is not a placeholder anymore. + other = '_include' if name == 'exclude' else '_exclude' + super().__setattr__(other, FilterSet()) + super().__setattr__(name, value) + else: + # Create a child node for submodule 'name'. + super().__setattr__(name, ModuleRules(parent=self, module=name, rules=value)) + def __setitem__(self, name, value): + if '.' not in name: + setattr(self, name, value) + else: + module, _, submodules = name.partition('.') + if module not in self.__dict__: + # Create a placeholder node, like logging.PlaceHolder. + setattr(self, module, None) + mod_rules = getattr(self, module) + mod_rules[submodules] = value + def __getitem__(self, name): + module, _, submodules = name.partition('.') + mod_rules = getattr(self, module) + if not submodules: + return mod_rules + else: + return mod_rules[submodules] + def get_filters(self, name): + if name not in self._fields: + raise ValueError("invalid name %r (must be one of %r)" % (name, self._fields)) + try: + return getattr(self, name) + except AttributeError: + # 'self' is a placeholder, 'exclude' and 'include' are unset. 
+ if self._parent is None: + raise + return self._parent.get_filters(name) + +settings['dump_module'] = ModuleRules('DEFAULT', rules=()) From 9781f191435ed56f0a205f3d2430d568169beb88 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 12 Jul 2022 18:30:23 -0300 Subject: [PATCH 011/109] move session things to session.py --- dill/_dill.py | 511 ------------------------------------------------ dill/session.py | 507 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 507 insertions(+), 511 deletions(-) create mode 100644 dill/session.py diff --git a/dill/_dill.py b/dill/_dill.py index e341357c..0925c248 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -319,517 +319,6 @@ def loads(str, ignore=None, **kwds): ### End: Shorthands ### -### Pickle the Interpreter Session -import pathlib -import tempfile - -SESSION_IMPORTED_AS_TYPES = (ModuleType, ClassType, TypeType, Exception, - FunctionType, MethodType, BuiltinMethodType) -TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) - -def _module_map(): - """get map of imported modules""" - from collections import defaultdict, namedtuple - modmap = namedtuple('Modmap', ['by_name', 'by_id', 'top_level']) - modmap = modmap(defaultdict(list), defaultdict(list), {}) - for modname, module in sys.modules.items(): - if not isinstance(module, ModuleType): - continue - if '.' 
not in modname: - modmap.top_level[id(module)] = modname - for objname, modobj in module.__dict__.items(): - modmap.by_name[objname].append((modobj, modname)) - modmap.by_id[id(modobj)].append((modobj, objname, modname)) - return modmap - -def _lookup_module(modmap, name, obj, main_module): - """lookup name or id of obj if module is imported""" - for modobj, modname in modmap.by_name[name]: - if modobj is obj and sys.modules[modname] is not main_module: - return modname, name - if isinstance(obj, SESSION_IMPORTED_AS_TYPES): - for modobj, objname, modname in modmap.by_id[id(obj)]: - if sys.modules[modname] is not main_module: - return modname, objname - return None, None - -def _stash_modules(main_module): - modmap = _module_map() - newmod = ModuleType(main_module.__name__) - - imported = [] - imported_as = [] - imported_top_level = [] # keep separeted for backwards compatibility - original = {} - for name, obj in main_module.__dict__.items(): - if obj is main_module: - original[name] = newmod # self-reference - continue - - # Avoid incorrectly matching a singleton value in another package (ex.: __doc__). 
- if any(obj is singleton for singleton in (None, False, True)) or \ - isinstance(obj, ModuleType) and _is_builtin_module(obj): # always saved by ref - original[name] = obj - continue - - source_module, objname = _lookup_module(modmap, name, obj, main_module) - if source_module: - if objname == name: - imported.append((source_module, name)) - else: - imported_as.append((source_module, objname, name)) - else: - try: - imported_top_level.append((modmap.top_level[id(obj)], name)) - except KeyError: - original[name] = obj - - if len(original) < len(main_module.__dict__): - newmod.__dict__.update(original) - newmod.__dill_imported = imported - newmod.__dill_imported_as = imported_as - newmod.__dill_imported_top_level = imported_top_level - return newmod - else: - return main_module - -def _restore_modules(unpickler, main_module): - try: - for modname, name in main_module.__dict__.pop('__dill_imported'): - main_module.__dict__[name] = unpickler.find_class(modname, name) - for modname, objname, name in main_module.__dict__.pop('__dill_imported_as'): - main_module.__dict__[name] = unpickler.find_class(modname, objname) - for modname, name in main_module.__dict__.pop('__dill_imported_top_level'): - main_module.__dict__[name] = __import__(modname) - except KeyError: - pass - -#NOTE: 06/03/15 renamed main_module to main -def dump_module( - filename = str(TEMPDIR/'session.pkl'), - main: Optional[Union[ModuleType, str]] = None, - refimported: bool = False, - **kwds -) -> None: - """Pickle the current state of :py:mod:`__main__` or another module to a file. - - Save the contents of :py:mod:`__main__` (e.g. from an interactive - interpreter session), an imported module, or a module-type object (e.g. - built with :py:class:`~types.ModuleType`), to a file. The pickled - module can then be restored with the function :py:func:`load_module`. - - Parameters: - filename: a path-like object or a writable stream. - main: a module object or the name of an importable module. 
- refimported: if `True`, all objects imported into the module's - namespace are saved by reference. *Note:* this is similar but - independent from ``dill.settings[`byref`]``, as ``refimported`` - refers to all imported objects, while ``byref`` only affects - select objects. - **kwds: extra keyword arguments passed to :py:class:`Pickler()`. - - Raises: - :py:exc:`PicklingError`: if pickling fails. - - Examples: - - Save current interpreter session state: - - >>> import dill - >>> squared = lambda x:x*x - >>> dill.dump_module() # save state of __main__ to /tmp/session.pkl - - - Save the state of an imported/importable module: - - >>> import dill - >>> import pox - >>> pox.plus_one = lambda x:x+1 - >>> dill.dump_module('pox_session.pkl', main=pox) - - - Save the state of a non-importable, module-type object: - - >>> import dill - >>> from types import ModuleType - >>> foo = ModuleType('foo') - >>> foo.values = [1,2,3] - >>> import math - >>> foo.sin = math.sin - >>> dill.dump_module('foo_session.pkl', main=foo, refimported=True) - - - Restore the state of the saved modules: - - >>> import dill - >>> dill.load_module() - >>> squared(2) - 4 - >>> pox = dill.load_module('pox_session.pkl') - >>> pox.plus_one(1) - 2 - >>> foo = dill.load_module('foo_session.pkl') - >>> [foo.sin(x) for x in foo.values] - [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] - - *Changed in version 0.3.6:* the function ``dump_session()`` was renamed to - ``dump_module()``. - - *Changed in version 0.3.6:* the parameter ``byref`` was renamed to - ``refimported``. 
- """ - if 'byref' in kwds: - warnings.warn( - "The argument 'byref' has been renamed 'refimported'" - " to distinguish it from dill.settings['byref'].", - PendingDeprecationWarning - ) - if refimported: - raise TypeError("both 'refimported' and 'byref' were used") - refimported = kwds.pop('byref') - from .settings import settings - protocol = settings['protocol'] - if main is None: main = _main_module - if hasattr(filename, 'write'): - file = filename - else: - file = open(filename, 'wb') - try: - pickler = Pickler(file, protocol, **kwds) - pickler._original_main = main - if refimported: - main = _stash_modules(main) - pickler._main = main #FIXME: dill.settings are disabled - pickler._byref = False # disable pickling by name reference - pickler._recurse = False # disable pickling recursion for globals - pickler._session = True # is best indicator of when pickling a session - pickler._first_pass = True - pickler._main_modified = main is not pickler._original_main - pickler.dump(main) - finally: - if file is not filename: # if newly opened file - file.close() - return - -# Backward compatibility. 
-def dump_session(filename=str(TEMPDIR/'session.pkl'), main=None, byref=False, **kwds): - warnings.warn("dump_session() has been renamed dump_module()", PendingDeprecationWarning) - dump_module(filename, main, refimported=byref, **kwds) -dump_session.__doc__ = dump_module.__doc__ - -class _PeekableReader: - """lightweight stream wrapper that implements peek()""" - def __init__(self, stream): - self.stream = stream - def read(self, n): - return self.stream.read(n) - def readline(self): - return self.stream.readline() - def tell(self): - return self.stream.tell() - def close(self): - return self.stream.close() - def peek(self, n): - stream = self.stream - try: - if hasattr(stream, 'flush'): stream.flush() - position = stream.tell() - stream.seek(position) # assert seek() works before reading - chunk = stream.read(n) - stream.seek(position) - return chunk - except (AttributeError, OSError): - raise NotImplementedError("stream is not peekable: %r", stream) from None - -def _make_peekable(stream): - """return stream as an object with a peek() method""" - import io - if hasattr(stream, 'peek'): - return stream - if not (hasattr(stream, 'tell') and hasattr(stream, 'seek')): - try: - return io.BufferedReader(stream) - except Exception: - pass - return _PeekableReader(stream) - -def _identify_module(file, main=None): - """identify the session file's module name""" - from pickletools import genops - UNICODE = {'UNICODE', 'BINUNICODE', 'SHORT_BINUNICODE'} - found_import = False - try: - for opcode, arg, pos in genops(file.peek(256)): - if not found_import: - if opcode.name in ('GLOBAL', 'SHORT_BINUNICODE') and \ - arg.endswith('_import_module'): - found_import = True - else: - if opcode.name in UNICODE: - return arg - else: - raise UnpicklingError("reached STOP without finding main module") - except (NotImplementedError, ValueError) as error: - # ValueError occours when the end of the chunk is reached (without a STOP). 
- if isinstance(error, NotImplementedError) and main is not None: - # file is not peekable, but we have main. - return None - raise UnpicklingError("unable to identify main module") from error - -def load_module( - filename = str(TEMPDIR/'session.pkl'), - main: Union[ModuleType, str] = None, - **kwds -) -> Optional[ModuleType]: - """Update :py:mod:`__main__` or another module with the state from the - session file. - - Restore a module to the state saved with :py:func:`dump_module`. The - saved module can be :py:mod:`__main__` (e.g. an interpreter session), - an imported module, or a module-type object (e.g. created with - :py:class:`~types.ModuleType`). - - When restoring the state of a non-importable module-type object, the - current instance of this module may be passed as the argument ``main``. - Otherwise, a new instance is created with :py:class:`~types.ModuleType` - and returned. - - Parameters: - filename: a path-like object or a readable stream. - main: a module object or the name of an importable module. - **kwds: extra keyword arguments passed to :py:class:`Unpickler()`. - - Raises: - :py:exc:`UnpicklingError`: if unpickling fails. - :py:exc:`ValueError`: if the argument ``main`` and module saved - at ``filename`` are incompatible. - - Returns: - A module object, if the saved module is not :py:mod:`__main__` or - a module instance wasn't provided with the argument ``main``. 
- - Examples: - - - Save the state of some modules: - - >>> import dill - >>> squared = lambda x:x*x - >>> dill.dump_module() # save state of __main__ to /tmp/session.pkl - >>> - >>> import pox # an imported module - >>> pox.plus_one = lambda x:x+1 - >>> dill.dump_module('pox_session.pkl', main=pox) - >>> - >>> from types import ModuleType - >>> foo = ModuleType('foo') # a module-type object - >>> foo.values = [1,2,3] - >>> import math - >>> foo.sin = math.sin - >>> dill.dump_module('foo_session.pkl', main=foo, refimported=True) - - - Restore the state of the interpreter: - - >>> import dill - >>> dill.load_module() # updates __main__ from /tmp/session.pkl - >>> squared(2) - 4 - - - Load the saved state of an importable module: - - >>> import dill - >>> pox = dill.load_module('pox_session.pkl') - >>> pox.plus_one(1) - 2 - >>> import sys - >>> pox in sys.modules.values() - True - - - Load the saved state of a non-importable module-type object: - - >>> import dill - >>> foo = dill.load_module('foo_session.pkl') - >>> [foo.sin(x) for x in foo.values] - [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] - >>> import math - >>> foo.sin is math.sin # foo.sin was saved by reference - True - >>> import sys - >>> foo in sys.modules.values() - False - - - Update the state of a non-importable module-type object: - - >>> import dill - >>> from types import ModuleType - >>> foo = ModuleType('foo') - >>> foo.values = ['a','b'] - >>> foo.sin = lambda x:x*x - >>> dill.load_module('foo_session.pkl', main=foo) - >>> [foo.sin(x) for x in foo.values] - [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] - - *Changed in version 0.3.6:* the function ``load_session()`` was renamed to - ``load_module()``. - - See also: - :py:func:`load_module_asdict` to load the contents of module saved - with :py:func:`dump_module` into a dictionary. 
- """ - main_arg = main - if hasattr(filename, 'read'): - file = filename - else: - file = open(filename, 'rb') - try: - file = _make_peekable(file) - #FIXME: dill.settings are disabled - unpickler = Unpickler(file, **kwds) - unpickler._session = True - pickle_main = _identify_module(file, main) - - # Resolve unpickler._main - if main is None and pickle_main is not None: - main = pickle_main - if isinstance(main, str): - if main.startswith('__runtime__.'): - # Create runtime module to load the session into. - main = ModuleType(main.partition('.')[-1]) - else: - main = _import_module(main) - if main is not None: - if not isinstance(main, ModuleType): - raise ValueError("%r is not a module" % main) - unpickler._main = main - else: - main = unpickler._main - - # Check against the pickle's main. - is_main_imported = _is_imported_module(main) - if pickle_main is not None: - is_runtime_mod = pickle_main.startswith('__runtime__.') - if is_runtime_mod: - pickle_main = pickle_main.partition('.')[-1] - if is_runtime_mod and is_main_imported: - raise ValueError( - "can't restore non-imported module %r into an imported one" - % pickle_main - ) - if not is_runtime_mod and not is_main_imported: - raise ValueError( - "can't restore imported module %r into a non-imported one" - % pickle_main - ) - if main.__name__ != pickle_main: - raise ValueError( - "can't restore module %r into module %r" - % (pickle_main, main.__name__) - ) - - # This is for find_class() to be able to locate it. - if not is_main_imported: - runtime_main = '__runtime__.%s' % main.__name__ - sys.modules[runtime_main] = main - - module = unpickler.load() - finally: - if not hasattr(filename, 'read'): # if newly opened file - file.close() - try: - del sys.modules[runtime_main] - except (KeyError, NameError): - pass - assert module is main - _restore_modules(unpickler, module) - if not (module is _main_module or module is main_arg): - return module - -# Backward compatibility. 
-def load_session(filename=str(TEMPDIR/'session.pkl'), main=None, **kwds): - warnings.warn("load_session() has been renamed load_module().", PendingDeprecationWarning) - load_module(filename, main, **kwds) -load_session.__doc__ = load_module.__doc__ - -def load_module_asdict( - filename = str(TEMPDIR/'session.pkl'), - update: bool = False, - **kwds -) -> dict: - """ - Load the contents of a saved module into a dictionary. - - ``load_module_asdict()`` is the near-equivalent of:: - - lambda filename: vars(dill.load_module(filename)).copy() - - however, does not alter the original module. Also, the path of - the loaded module is stored in the ``__session__`` attribute. - - Parameters: - filename: a path-like object or a readable stream - update: if `True`, initialize the dictionary with the current state - of the module prior to loading the state stored at filename. - **kwds: extra keyword arguments passed to :py:class:`Unpickler()` - - Raises: - :py:exc:`UnpicklingError`: if unpickling fails - - Returns: - A copy of the restored module's dictionary. - - Note: - If ``update`` is True, the saved module may be imported then updated. 
- - Example: - >>> import dill - >>> alist = [1, 2, 3] - >>> anum = 42 - >>> dill.dump_module() - >>> anum = 0 - >>> new_var = 'spam' - >>> main = dill.load_module_asdict() - >>> main['__name__'], main['__session__'] - ('__main__', '/tmp/session.pkl') - >>> main is globals() # loaded objects don't reference globals - False - >>> main['alist'] == alist - True - >>> main['alist'] is alist # was saved by value - False - >>> main['anum'] == anum # changed after the session was saved - False - >>> new_var in main # would be True if the option 'update' was set - False - """ - if 'main' in kwds: - raise TypeError("'main' is an invalid keyword argument for load_module_asdict()") - if hasattr(filename, 'read'): - file = filename - else: - file = open(filename, 'rb') - try: - file = _make_peekable(file) - main_name = _identify_module(file) - old_main = sys.modules.get(main_name) - main = ModuleType(main_name) - if update: - if old_main is None: - old_main = _import_module(main_name) - main.__dict__.update(old_main.__dict__) - else: - main.__builtins__ = __builtin__ - sys.modules[main_name] = main - load_module(file, **kwds) - main.__session__ = str(filename) - finally: - if not hasattr(filename, 'read'): # if newly opened file - file.close() - try: - if old_main is None: - del sys.modules[main_name] - else: - sys.modules[main_name] = old_main - except NameError: # failed before setting old_main - pass - return main.__dict__ - -### End: Pickle the Interpreter - class MetaCatchingDict(dict): def get(self, key, default=None): try: diff --git a/dill/session.py b/dill/session.py new file mode 100644 index 00000000..c42ed464 --- /dev/null +++ b/dill/session.py @@ -0,0 +1,507 @@ +import pathlib +import tempfile + +SESSION_IMPORTED_AS_TYPES = (ModuleType, ClassType, TypeType, Exception, + FunctionType, MethodType, BuiltinMethodType) +TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) + +def _module_map(): + """get map of imported modules""" + from collections import defaultdict, 
namedtuple + modmap = namedtuple('Modmap', ['by_name', 'by_id', 'top_level']) + modmap = modmap(defaultdict(list), defaultdict(list), {}) + for modname, module in sys.modules.items(): + if not isinstance(module, ModuleType): + continue + if '.' not in modname: + modmap.top_level[id(module)] = modname + for objname, modobj in module.__dict__.items(): + modmap.by_name[objname].append((modobj, modname)) + modmap.by_id[id(modobj)].append((modobj, objname, modname)) + return modmap + +def _lookup_module(modmap, name, obj, main_module): + """lookup name or id of obj if module is imported""" + for modobj, modname in modmap.by_name[name]: + if modobj is obj and sys.modules[modname] is not main_module: + return modname, name + if isinstance(obj, SESSION_IMPORTED_AS_TYPES): + for modobj, objname, modname in modmap.by_id[id(obj)]: + if sys.modules[modname] is not main_module: + return modname, objname + return None, None + +def _stash_modules(main_module): + modmap = _module_map() + newmod = ModuleType(main_module.__name__) + + imported = [] + imported_as = [] + imported_top_level = [] # keep separated for backwards compatibility + original = {} + for name, obj in main_module.__dict__.items(): + if obj is main_module: + original[name] = newmod # self-reference + continue + + # Avoid incorrectly matching a singleton value in another package (ex.: __doc__). 
+ if any(obj is singleton for singleton in (None, False, True)) or \ + isinstance(obj, ModuleType) and _is_builtin_module(obj): # always saved by ref + original[name] = obj + continue + + source_module, objname = _lookup_module(modmap, name, obj, main_module) + if source_module: + if objname == name: + imported.append((source_module, name)) + else: + imported_as.append((source_module, objname, name)) + else: + try: + imported_top_level.append((modmap.top_level[id(obj)], name)) + except KeyError: + original[name] = obj + + if len(original) < len(main_module.__dict__): + newmod.__dict__.update(original) + newmod.__dill_imported = imported + newmod.__dill_imported_as = imported_as + newmod.__dill_imported_top_level = imported_top_level + return newmod + else: + return main_module + +def _restore_modules(unpickler, main_module): + try: + for modname, name in main_module.__dict__.pop('__dill_imported'): + main_module.__dict__[name] = unpickler.find_class(modname, name) + for modname, objname, name in main_module.__dict__.pop('__dill_imported_as'): + main_module.__dict__[name] = unpickler.find_class(modname, objname) + for modname, name in main_module.__dict__.pop('__dill_imported_top_level'): + main_module.__dict__[name] = __import__(modname) + except KeyError: + pass + +#NOTE: 06/03/15 renamed main_module to main +def dump_module( + filename = str(TEMPDIR/'session.pkl'), + main: Optional[Union[ModuleType, str]] = None, + refimported: bool = False, + **kwds +) -> None: + """Pickle the current state of :py:mod:`__main__` or another module to a file. + + Save the contents of :py:mod:`__main__` (e.g. from an interactive + interpreter session), an imported module, or a module-type object (e.g. + built with :py:class:`~types.ModuleType`), to a file. The pickled + module can then be restored with the function :py:func:`load_module`. + + Parameters: + filename: a path-like object or a writable stream. + main: a module object or the name of an importable module. 
+ refimported: if `True`, all objects imported into the module's + namespace are saved by reference. *Note:* this is similar but + independent from ``dill.settings[`byref`]``, as ``refimported`` + refers to all imported objects, while ``byref`` only affects + select objects. + **kwds: extra keyword arguments passed to :py:class:`Pickler()`. + + Raises: + :py:exc:`PicklingError`: if pickling fails. + + Examples: + - Save current interpreter session state: + + >>> import dill + >>> squared = lambda x:x*x + >>> dill.dump_module() # save state of __main__ to /tmp/session.pkl + + - Save the state of an imported/importable module: + + >>> import dill + >>> import pox + >>> pox.plus_one = lambda x:x+1 + >>> dill.dump_module('pox_session.pkl', main=pox) + + - Save the state of a non-importable, module-type object: + + >>> import dill + >>> from types import ModuleType + >>> foo = ModuleType('foo') + >>> foo.values = [1,2,3] + >>> import math + >>> foo.sin = math.sin + >>> dill.dump_module('foo_session.pkl', main=foo, refimported=True) + + - Restore the state of the saved modules: + + >>> import dill + >>> dill.load_module() + >>> squared(2) + 4 + >>> pox = dill.load_module('pox_session.pkl') + >>> pox.plus_one(1) + 2 + >>> foo = dill.load_module('foo_session.pkl') + >>> [foo.sin(x) for x in foo.values] + [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] + + *Changed in version 0.3.6:* the function ``dump_session()`` was renamed to + ``dump_module()``. + + *Changed in version 0.3.6:* the parameter ``byref`` was renamed to + ``refimported``. 
+ """ + if 'byref' in kwds: + warnings.warn( + "The argument 'byref' has been renamed 'refimported'" + " to distinguish it from dill.settings['byref'].", + PendingDeprecationWarning + ) + if refimported: + raise TypeError("both 'refimported' and 'byref' were used") + refimported = kwds.pop('byref') + from .settings import settings + protocol = settings['protocol'] + if main is None: main = _main_module + if hasattr(filename, 'write'): + file = filename + else: + file = open(filename, 'wb') + try: + pickler = Pickler(file, protocol, **kwds) + pickler._original_main = main + if refimported: + main = _stash_modules(main) + pickler._main = main #FIXME: dill.settings are disabled + pickler._byref = False # disable pickling by name reference + pickler._recurse = False # disable pickling recursion for globals + pickler._session = True # is best indicator of when pickling a session + pickler._first_pass = True + pickler._main_modified = main is not pickler._original_main + pickler.dump(main) + finally: + if file is not filename: # if newly opened file + file.close() + return + +# Backward compatibility. 
+def dump_session(filename=str(TEMPDIR/'session.pkl'), main=None, byref=False, **kwds): + warnings.warn("dump_session() has been renamed dump_module()", PendingDeprecationWarning) + dump_module(filename, main, refimported=byref, **kwds) +dump_session.__doc__ = dump_module.__doc__ + +class _PeekableReader: + """lightweight stream wrapper that implements peek()""" + def __init__(self, stream): + self.stream = stream + def read(self, n): + return self.stream.read(n) + def readline(self): + return self.stream.readline() + def tell(self): + return self.stream.tell() + def close(self): + return self.stream.close() + def peek(self, n): + stream = self.stream + try: + if hasattr(stream, 'flush'): stream.flush() + position = stream.tell() + stream.seek(position) # assert seek() works before reading + chunk = stream.read(n) + stream.seek(position) + return chunk + except (AttributeError, OSError): + raise NotImplementedError("stream is not peekable: %r", stream) from None + +def _make_peekable(stream): + """return stream as an object with a peek() method""" + import io + if hasattr(stream, 'peek'): + return stream + if not (hasattr(stream, 'tell') and hasattr(stream, 'seek')): + try: + return io.BufferedReader(stream) + except Exception: + pass + return _PeekableReader(stream) + +def _identify_module(file, main=None): + """identify the session file's module name""" + from pickletools import genops + UNICODE = {'UNICODE', 'BINUNICODE', 'SHORT_BINUNICODE'} + found_import = False + try: + for opcode, arg, pos in genops(file.peek(256)): + if not found_import: + if opcode.name in ('GLOBAL', 'SHORT_BINUNICODE') and \ + arg.endswith('_import_module'): + found_import = True + else: + if opcode.name in UNICODE: + return arg + else: + raise UnpicklingError("reached STOP without finding main module") + except (NotImplementedError, ValueError) as error: + # ValueError occurs when the end of the chunk is reached (without a STOP). 
+ if isinstance(error, NotImplementedError) and main is not None: + # file is not peekable, but we have main. + return None + raise UnpicklingError("unable to identify main module") from error + +def load_module( + filename = str(TEMPDIR/'session.pkl'), + main: Union[ModuleType, str] = None, + **kwds +) -> Optional[ModuleType]: + """Update :py:mod:`__main__` or another module with the state from the + session file. + + Restore a module to the state saved with :py:func:`dump_module`. The + saved module can be :py:mod:`__main__` (e.g. an interpreter session), + an imported module, or a module-type object (e.g. created with + :py:class:`~types.ModuleType`). + + When restoring the state of a non-importable module-type object, the + current instance of this module may be passed as the argument ``main``. + Otherwise, a new instance is created with :py:class:`~types.ModuleType` + and returned. + + Parameters: + filename: a path-like object or a readable stream. + main: a module object or the name of an importable module. + **kwds: extra keyword arguments passed to :py:class:`Unpickler()`. + + Raises: + :py:exc:`UnpicklingError`: if unpickling fails. + :py:exc:`ValueError`: if the argument ``main`` and module saved + at ``filename`` are incompatible. + + Returns: + A module object, if the saved module is not :py:mod:`__main__` or + a module instance wasn't provided with the argument ``main``. 
+ + Examples: + + - Save the state of some modules: + + >>> import dill + >>> squared = lambda x:x*x + >>> dill.dump_module() # save state of __main__ to /tmp/session.pkl + >>> + >>> import pox # an imported module + >>> pox.plus_one = lambda x:x+1 + >>> dill.dump_module('pox_session.pkl', main=pox) + >>> + >>> from types import ModuleType + >>> foo = ModuleType('foo') # a module-type object + >>> foo.values = [1,2,3] + >>> import math + >>> foo.sin = math.sin + >>> dill.dump_module('foo_session.pkl', main=foo, refimported=True) + + - Restore the state of the interpreter: + + >>> import dill + >>> dill.load_module() # updates __main__ from /tmp/session.pkl + >>> squared(2) + 4 + + - Load the saved state of an importable module: + + >>> import dill + >>> pox = dill.load_module('pox_session.pkl') + >>> pox.plus_one(1) + 2 + >>> import sys + >>> pox in sys.modules.values() + True + + - Load the saved state of a non-importable module-type object: + + >>> import dill + >>> foo = dill.load_module('foo_session.pkl') + >>> [foo.sin(x) for x in foo.values] + [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] + >>> import math + >>> foo.sin is math.sin # foo.sin was saved by reference + True + >>> import sys + >>> foo in sys.modules.values() + False + + - Update the state of a non-importable module-type object: + + >>> import dill + >>> from types import ModuleType + >>> foo = ModuleType('foo') + >>> foo.values = ['a','b'] + >>> foo.sin = lambda x:x*x + >>> dill.load_module('foo_session.pkl', main=foo) + >>> [foo.sin(x) for x in foo.values] + [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] + + *Changed in version 0.3.6:* the function ``load_session()`` was renamed to + ``load_module()``. + + See also: + :py:func:`load_module_asdict` to load the contents of module saved + with :py:func:`dump_module` into a dictionary. 
+ """ + main_arg = main + if hasattr(filename, 'read'): + file = filename + else: + file = open(filename, 'rb') + try: + file = _make_peekable(file) + #FIXME: dill.settings are disabled + unpickler = Unpickler(file, **kwds) + unpickler._session = True + pickle_main = _identify_module(file, main) + + # Resolve unpickler._main + if main is None and pickle_main is not None: + main = pickle_main + if isinstance(main, str): + if main.startswith('__runtime__.'): + # Create runtime module to load the session into. + main = ModuleType(main.partition('.')[-1]) + else: + main = _import_module(main) + if main is not None: + if not isinstance(main, ModuleType): + raise ValueError("%r is not a module" % main) + unpickler._main = main + else: + main = unpickler._main + + # Check against the pickle's main. + is_main_imported = _is_imported_module(main) + if pickle_main is not None: + is_runtime_mod = pickle_main.startswith('__runtime__.') + if is_runtime_mod: + pickle_main = pickle_main.partition('.')[-1] + if is_runtime_mod and is_main_imported: + raise ValueError( + "can't restore non-imported module %r into an imported one" + % pickle_main + ) + if not is_runtime_mod and not is_main_imported: + raise ValueError( + "can't restore imported module %r into a non-imported one" + % pickle_main + ) + if main.__name__ != pickle_main: + raise ValueError( + "can't restore module %r into module %r" + % (pickle_main, main.__name__) + ) + + # This is for find_class() to be able to locate it. + if not is_main_imported: + runtime_main = '__runtime__.%s' % main.__name__ + sys.modules[runtime_main] = main + + module = unpickler.load() + finally: + if not hasattr(filename, 'read'): # if newly opened file + file.close() + try: + del sys.modules[runtime_main] + except (KeyError, NameError): + pass + assert module is main + _restore_modules(unpickler, module) + if not (module is _main_module or module is main_arg): + return module + +# Backward compatibility. 
+def load_session(filename=str(TEMPDIR/'session.pkl'), main=None, **kwds): + warnings.warn("load_session() has been renamed load_module().", PendingDeprecationWarning) + load_module(filename, main, **kwds) +load_session.__doc__ = load_module.__doc__ + +def load_module_asdict( + filename = str(TEMPDIR/'session.pkl'), + update: bool = False, + **kwds +) -> dict: + """ + Load the contents of a saved module into a dictionary. + + ``load_module_asdict()`` is the near-equivalent of:: + + lambda filename: vars(dill.load_module(filename)).copy() + + however, does not alter the original module. Also, the path of + the loaded module is stored in the ``__session__`` attribute. + + Parameters: + filename: a path-like object or a readable stream + update: if `True`, initialize the dictionary with the current state + of the module prior to loading the state stored at filename. + **kwds: extra keyword arguments passed to :py:class:`Unpickler()` + + Raises: + :py:exc:`UnpicklingError`: if unpickling fails + + Returns: + A copy of the restored module's dictionary. + + Note: + If ``update`` is True, the saved module may be imported then updated. 
+ + Example: + >>> import dill + >>> alist = [1, 2, 3] + >>> anum = 42 + >>> dill.dump_module() + >>> anum = 0 + >>> new_var = 'spam' + >>> main = dill.load_module_asdict() + >>> main['__name__'], main['__session__'] + ('__main__', '/tmp/session.pkl') + >>> main is globals() # loaded objects don't reference globals + False + >>> main['alist'] == alist + True + >>> main['alist'] is alist # was saved by value + False + >>> main['anum'] == anum # changed after the session was saved + False + >>> new_var in main # would be True if the option 'update' was set + False + """ + if 'main' in kwds: + raise TypeError("'main' is an invalid keyword argument for load_module_asdict()") + if hasattr(filename, 'read'): + file = filename + else: + file = open(filename, 'rb') + try: + file = _make_peekable(file) + main_name = _identify_module(file) + old_main = sys.modules.get(main_name) + main = ModuleType(main_name) + if update: + if old_main is None: + old_main = _import_module(main_name) + main.__dict__.update(old_main.__dict__) + else: + main.__builtins__ = __builtin__ + sys.modules[main_name] = main + load_module(file, **kwds) + main.__session__ = str(filename) + finally: + if not hasattr(filename, 'read'): # if newly opened file + file.close() + try: + if old_main is None: + del sys.modules[main_name] + else: + sys.modules[main_name] = old_main + except NameError: # failed before setting old_main + pass + return main.__dict__ From 65157a9e2c6bd22130cb547a48ad0e26b35ca183 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 12 Jul 2022 20:55:30 -0300 Subject: [PATCH 012/109] small fixes --- dill/_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dill/_utils.py b/dill/_utils.py index f6338d31..92ccb2a2 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -73,9 +73,9 @@ def __contains__(self, filter): filter, filter_set = self._match_type(filter) return filter in filter_set def __iter__(self): - return chain.from_iterable(gettatr(self, field) 
for field in self._fields) + return chain.from_iterable(getattr(self, field) for field in self._fields) def __len__(self): - return sum(len(gettatr(self, field)) for field in self._fields) + return sum(len(getattr(self, field)) for field in self._fields) def add(self, filter): filter, filter_set = self._match_type(filter) filter_set.add(filter) @@ -99,7 +99,7 @@ def __or__(self, other): def __ior__(self, filters): if isinstance(filters, FilterSet): for field in self._fields: - getattr(self, field) |= getattr(filters, field) + getattr(self, field).update(getattr(filters, field)) else: for filter in filters: self.add(filter) @@ -115,12 +115,12 @@ def get_type(cls, key): from ._dill import _reverse_typemap cls._rtypemap = {(k[:-4] if k.endswith('Type') else k).lower(): v for k, v in _reverse_typemap.items()} - if key.endswith('Type') + if key.endswith('Type'): key = key[:-4] return cls._rtypemap[key.lower()] def add_type(self, type_name): self.types.add(self.get_type(type_name)) -FilterSet._fields = tuple(field.name for field in fields(cls)) +FilterSet._fields = tuple(field.name for field in fields(FilterSet)) class _FilterSetDescriptor: """descriptor for FilterSet members of FilterRules""" From 17a3b505b6008bdb9a3c29758b4906aeb1775b1f Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 12 Jul 2022 22:03:06 -0300 Subject: [PATCH 013/109] complete merge with master --- MANIFEST.in | 1 + dill/__init__.py | 302 +++-------------------------------------- dill/_dill.py | 2 +- dill/settings.py | 6 +- dill/tests/__main__.py | 9 +- setup.py | 53 ++------ tox.ini | 5 +- version.py | 80 +++++++++++ 8 files changed, 127 insertions(+), 331 deletions(-) create mode 100644 version.py diff --git a/MANIFEST.in b/MANIFEST.in index 76ad6227..1309d768 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,6 +3,7 @@ include README* include MANIFEST.in include pyproject.toml include tox.ini +include version.py include scripts/* recursive-include docs * include .* diff --git 
a/dill/__init__.py b/dill/__init__.py index e6f080e7..1e6df963 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -7,292 +7,28 @@ # - https://github.com/uqfoundation/dill/blob/master/LICENSE # author, version, license, and long description -__version__ = '0.3.6.dev0' -__author__ = 'Mike McKerns' +try: # the package is installed + from .__info__ import __version__, __author__, __doc__, __license__ +except: # pragma: no cover + import os + import sys + parent = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) + sys.path.append(parent) + # get distribution meta info + from version import (__version__, __author__, + get_license_text, get_readme_as_rst) + __license__ = get_license_text(os.path.join(parent, 'LICENSE')) + __license__ = "\n%s" % __license__ + __doc__ = get_readme_as_rst(os.path.join(parent, 'README.md')) + del os, sys, parent, get_license_text, get_readme_as_rst -__doc__ = """ ------------------------------ -dill: serialize all of python ------------------------------ - -About Dill -========== - -``dill`` extends python's ``pickle`` module for serializing and de-serializing -python objects to the majority of the built-in python types. Serialization -is the process of converting an object to a byte stream, and the inverse -of which is converting a byte stream back to a python object hierarchy. - -``dill`` provides the user the same interface as the ``pickle`` module, and -also includes some additional features. In addition to pickling python -objects, ``dill`` provides the ability to save the state of an interpreter -session in a single command. Hence, it would be feasible to save an -interpreter session, close the interpreter, ship the pickled file to -another computer, open a new interpreter, unpickle the session and -thus continue from the 'saved' state of the original interpreter -session. - -``dill`` can be used to store python objects to a file, but the primary -usage is to send python objects across the network as a byte stream. 
-``dill`` is quite flexible, and allows arbitrary user defined classes -and functions to be serialized. Thus ``dill`` is not intended to be -secure against erroneously or maliciously constructed data. It is -left to the user to decide whether the data they unpickle is from -a trustworthy source. - -``dill`` is part of ``pathos``, a python framework for heterogeneous computing. -``dill`` is in active development, so any user feedback, bug reports, comments, -or suggestions are highly appreciated. A list of issues is located at -https://github.com/uqfoundation/dill/issues, with a legacy list maintained at -https://uqfoundation.github.io/project/pathos/query. - - -Major Features -============== - -``dill`` can pickle the following standard types: - - - none, type, bool, int, float, complex, bytes, str, - - tuple, list, dict, file, buffer, builtin, - - python classes, namedtuples, dataclasses, metaclasses, - - instances of classes, - - set, frozenset, array, functions, exceptions - -``dill`` can also pickle more 'exotic' standard types: - - - functions with yields, nested functions, lambdas, - - cell, method, unboundmethod, module, code, methodwrapper, - - methoddescriptor, getsetdescriptor, memberdescriptor, wrapperdescriptor, - - dictproxy, slice, notimplemented, ellipsis, quit - -``dill`` cannot yet pickle these standard types: - - - frame, generator, traceback - -``dill`` also provides the capability to: - - - save and load python interpreter sessions - - save and extract the source code from functions and classes - - interactively diagnose pickling errors - - -Current Release -=============== - -The latest released version of ``dill`` is available from: - - https://pypi.org/project/dill - -``dill`` is distributed under a 3-clause BSD license. 
- - -Development Version -=================== - -You can get the latest development version with all the shiny new features at: - - https://github.com/uqfoundation - -If you have a new contribution, please submit a pull request. - - -Installation -============ - -``dill`` can be installed with ``pip``:: - - $ pip install dill - -To optionally include the ``objgraph`` diagnostic tool in the install:: - - $ pip install dill[graph] - -For windows users, to optionally install session history tools:: - - $ pip install dill[readline] - - -Requirements -============ - -``dill`` requires: - - - ``python`` (or ``pypy``), **>=3.7** - - ``setuptools``, **>=42** - -Optional requirements: - - - ``objgraph``, **>=1.7.2** - - ``pyreadline``, **>=1.7.1** (on windows) - - -Basic Usage -=========== - -``dill`` is a drop-in replacement for ``pickle``. Existing code can be -updated to allow complete pickling using:: - - >>> import dill as pickle - -or:: - - >>> from dill import dumps, loads - -``dumps`` converts the object to a unique byte string, and ``loads`` performs -the inverse operation:: - - >>> squared = lambda x: x**2 - >>> loads(dumps(squared))(3) - 9 - -There are a number of options to control serialization which are provided -as keyword arguments to several ``dill`` functions: - -* with *protocol*, the pickle protocol level can be set. This uses the - same value as the ``pickle`` module, *DEFAULT_PROTOCOL*. -* with *byref=True*, ``dill`` to behave a lot more like pickle with - certain objects (like modules) pickled by reference as opposed to - attempting to pickle the object itself. -* with *recurse=True*, objects referred to in the global dictionary are - recursively traced and pickled, instead of the default behavior of - attempting to store the entire global dictionary. 
-* with *fmode*, the contents of the file can be pickled along with the file - handle, which is useful if the object is being sent over the wire to a - remote system which does not have the original file on disk. Options are - *HANDLE_FMODE* for just the handle, *CONTENTS_FMODE* for the file content - and *FILE_FMODE* for content and handle. -* with *ignore=False*, objects reconstructed with types defined in the - top-level script environment use the existing type in the environment - rather than a possibly different reconstructed type. - -The default serialization can also be set globally in *dill.settings*. -Thus, we can modify how ``dill`` handles references to the global dictionary -locally or globally:: - - >>> import dill.settings - >>> dumps(absolute) == dumps(absolute, recurse=True) - False - >>> dill.settings['recurse'] = True - >>> dumps(absolute) == dumps(absolute, recurse=True) - True - -``dill`` also includes source code inspection, as an alternate to pickling:: - - >>> import dill.source - >>> print(dill.source.getsource(squared)) - squared = lambda x:x**2 - -To aid in debugging pickling issues, use *dill.detect* which provides -tools like pickle tracing:: - - >>> import dill.detect - >>> with dill.detect.trace(): - >>> dumps(squared) - ┬ F1: at 0x7fe074f8c280> - ├┬ F2: - │└ # F2 [34 B] - ├┬ Co: at 0x7fe07501eb30, file "", line 1> - │├┬ F2: - ││└ # F2 [19 B] - │└ # Co [87 B] - ├┬ D1: - │└ # D1 [22 B] - ├┬ D2: - │└ # D2 [2 B] - ├┬ D2: - │├┬ D2: - ││└ # D2 [2 B] - │└ # D2 [23 B] - └ # F1 [180 B] - -With trace, we see how ``dill`` stored the lambda (``F1``) by first storing -``_create_function``, the underlying code object (``Co``) and ``_create_code`` -(which is used to handle code objects), then we handle the reference to -the global dict (``D2``) plus other dictionaries (``D1`` and ``D2``) that -save the lambda object's state. A ``#`` marks when the object is actually stored. 
- - -More Information -================ - -Probably the best way to get started is to look at the documentation at -http://dill.rtfd.io. Also see ``dill.tests`` for a set of scripts that -demonstrate how ``dill`` can serialize different python objects. You can -run the test suite with ``python -m dill.tests``. The contents of any -pickle file can be examined with ``undill``. As ``dill`` conforms to -the ``pickle`` interface, the examples and documentation found at -http://docs.python.org/library/pickle.html also apply to ``dill`` -if one will ``import dill as pickle``. The source code is also generally -well documented, so further questions may be resolved by inspecting the -code itself. Please feel free to submit a ticket on github, or ask a -question on stackoverflow (**@Mike McKerns**). -If you would like to share how you use ``dill`` in your work, please send -an email (to **mmckerns at uqfoundation dot org**). - - -Citation -======== - -If you use ``dill`` to do research that leads to publication, we ask that you -acknowledge use of ``dill`` by citing the following in your publication:: - - M.M. McKerns, L. Strand, T. Sullivan, A. Fang, M.A.G. Aivazis, - "Building a framework for predictive science", Proceedings of - the 10th Python in Science Conference, 2011; - http://arxiv.org/pdf/1202.1056 - - Michael McKerns and Michael Aivazis, - "pathos: a framework for heterogeneous computing", 2010- ; - https://uqfoundation.github.io/project/pathos - -Please see https://uqfoundation.github.io/project/pathos or -http://arxiv.org/pdf/1202.1056 for further information. - -""" - -__license__ = """ -Copyright (c) 2004-2016 California Institute of Technology. -Copyright (c) 2016-2022 The Uncertainty Quantification Foundation. -All rights reserved. - -This software is available subject to the conditions and terms laid -out below. By downloading and using this software you are agreeing -to the following conditions. 
- -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met:: - - - Redistribution of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - - Redistribution in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentations and/or other materials provided with the distribution. - - - Neither the names of the copyright holders nor the names of any of - the contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -""" from ._dill import ( - Pickler, Unpickler, check, copy, dump, dumps, load, loads, pickle, pickles, - register, DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, CONTENTS_FMODE, FILE_FMODE, - HANDLE_FMODE, PickleError, PickleWarning, PicklingError, PicklingWarning, - UnpicklingError, UnpicklingWarning, + Pickler, Unpickler, + check, copy, dump, dumps, load, loads, pickle, pickles, register, + DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, CONTENTS_FMODE, FILE_FMODE, HANDLE_FMODE, + PickleError, PickleWarning, PicklingError, PicklingWarning, UnpicklingError, + UnpicklingWarning, ) from .session import dump_module, load_module, load_module_asdict from .session import dump_session, load_session # backward compatibility diff --git a/dill/_dill.py b/dill/_dill.py index b449c768..50f81105 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -1780,7 +1780,7 @@ def save_function(pickler, obj): # If the globals is the __dict__ from the module being saved as a # session, substitute it by the dictionary being actually saved. - if _original_main and globs_copy is _original_main.__dict__: + if _original_main is not None and globs_copy is _original_main.__dict__: globs_copy = getattr(pickler, '_main', _original_main).__dict__ globs = globs_copy # If the globals is a module __dict__, do not save it in the pickle. diff --git a/dill/settings.py b/dill/settings.py index 042a6578..19ef8e4b 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -11,10 +11,10 @@ from __future__ import annotations -__all__ = ['settings'] +__all__ = ['settings', 'ModuleRules'] from pickle import DEFAULT_PROTOCOL -from ._utils import FilterRules, FilterSet +from ._utils import FilterRules settings = { #'main' : None, @@ -52,7 +52,7 @@ def __setattr__(self, name, value): if not any(hasattr(self, x) for x in FilterRules.__slots__): # Initialize other. This is not a placeholder anymore. 
other = '_include' if name == 'exclude' else '_exclude' - super().__setattr__(other, FilterSet()) + super().__setattr__(other, ()) super().__setattr__(name, value) else: # Create a child node for submodule 'name'. diff --git a/dill/tests/__main__.py b/dill/tests/__main__.py index 1570b399..b68e8677 100644 --- a/dill/tests/__main__.py +++ b/dill/tests/__main__.py @@ -23,8 +23,13 @@ if __name__ == '__main__': + failed = 0 for test in tests: p = sp.Popen([python, test], shell=shell).wait() - if not p: + if p: + print('F', end='', flush=True) + failed = 1 + else: print('.', end='', flush=True) - print() + print('') + exit(failed) diff --git a/setup.py b/setup.py index c0716c2a..bd182e23 100644 --- a/setup.py +++ b/setup.py @@ -15,41 +15,16 @@ # get distribution meta info here = os.path.abspath(os.path.dirname(__file__)) -meta_fh = open(os.path.join(here, 'dill/__init__.py')) -try: - meta = {} - for line in meta_fh: - if line.startswith('__version__'): - VERSION = line.split()[-1].strip("'").strip('"') - break - meta['VERSION'] = VERSION - for line in meta_fh: - if line.startswith('__author__'): - AUTHOR = line.split(' = ')[-1].strip().strip("'").strip('"') - break - meta['AUTHOR'] = AUTHOR - LONG_DOC = "" - DOC_STOP = "FAKE_STOP_12345" - for line in meta_fh: - if LONG_DOC: - if line.startswith(DOC_STOP): - LONG_DOC = LONG_DOC.strip().strip("'").strip('"').lstrip() - break - else: - LONG_DOC += line - elif line.startswith('__doc__'): - DOC_STOP = line.split(' = ')[-1] - LONG_DOC = "\n" - meta['LONG_DOC'] = LONG_DOC -finally: - meta_fh.close() - -# get version numbers, long_description, etc -AUTHOR = meta['AUTHOR'] -VERSION = meta['VERSION'] -LONG_DOC = meta['LONG_DOC'] #FIXME: near-duplicate of README.md -#LICENSE = meta['LICENSE'] #FIXME: duplicate of LICENSE -AUTHOR_EMAIL = 'mmckerns@uqfoundation.org' +sys.path.append(here) +from version import (__version__, __author__, __contact__ as AUTHOR_EMAIL, + get_license_text, get_readme_as_rst, write_info_file) +LICENSE = 
get_license_text(os.path.join(here, 'LICENSE')) +README = get_readme_as_rst(os.path.join(here, 'README.md')) + +# write meta info file +write_info_file(here, 'dill', doc=README, license=LICENSE, + version=__version__, author=__author__) +del here, get_license_text, get_readme_as_rst, write_info_file # check if setuptools is available try: @@ -64,12 +39,12 @@ # build the 'setup' call setup_kwds = dict( name='dill', - version=VERSION, + version=__version__, description='serialize all of python', - long_description = LONG_DOC, - author = AUTHOR, + long_description = README.strip(), + author = __author__, author_email = AUTHOR_EMAIL, - maintainer = AUTHOR, + maintainer = __author__, maintainer_email = AUTHOR_EMAIL, license = '3-clause BSD', platforms = ['Linux', 'Windows', 'Mac'], diff --git a/tox.ini b/tox.ini index a63d8067..12c896c0 100644 --- a/tox.ini +++ b/tox.ini @@ -15,8 +15,7 @@ envlist = deps = # numpy whitelist_externals = - bash +# bash commands = {envpython} -m pip install . - bash -c "failed=0; for test in dill/tests/__main__.py; do echo $test; \ - {envpython} $test || failed=1; done; exit $failed" + {envpython} dill/tests/__main__.py diff --git a/version.py b/version.py new file mode 100644 index 00000000..465f3ce4 --- /dev/null +++ b/version.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python +# +# Author: Mike McKerns (mmckerns @caltech and @uqfoundation) +# Copyright (c) 2022 The Uncertainty Quantification Foundation. +# License: 3-clause BSD. 
The full license text is available at: +# - https://github.com/uqfoundation/dill/blob/master/LICENSE + +__version__ = '0.3.6.dev0' +__author__ = 'Mike McKerns' +__contact__ = 'mmckerns@uqfoundation.org' + + +def get_license_text(filepath): + "open the LICENSE file and read the contents" + try: + LICENSE = open(filepath).read() + except: + LICENSE = '' + return LICENSE + + +def get_readme_as_rst(filepath): + "open the README file and read the markdown as rst" + try: + fh = open(filepath) + name, null = fh.readline().rstrip(), fh.readline() + tag, null = fh.readline(), fh.readline() + tag = "%s: %s" % (name, tag) + split = '-'*(len(tag)-1)+'\n' + README = ''.join((null,split,tag,split,'\n')) + skip = False + for line in fh: + if line.startswith('['): + continue + elif skip and line.startswith(' http'): + README += '\n' + line + elif line.startswith('* with'): #XXX: don't indent + README += line + elif line.startswith('* '): + README += line.replace('* ',' - ',1) + elif line.startswith('-'): + README += line.replace('-','=') + '\n' + else: + README += line + skip = line.endswith(':\n') + fh.close() + except: + README = '' + return README + + +def write_info_file(dirpath, modulename, **info): + """write the given info to 'modulename/__info__.py' + + info expects: + doc: the module's long_description + version: the module's version string + author: the module's author string + license: the module's license contents + """ + import os + infofile = os.path.join(dirpath, '%s/__info__.py' % modulename) + header = '''#!/usr/bin/env python +# +# Author: Mike McKerns (mmckerns @caltech and @uqfoundation) +# Copyright (c) 2022 The Uncertainty Quantification Foundation. +# License: 3-clause BSD. 
The full license text is available at: +# - https://github.com/uqfoundation/%s/blob/master/LICENSE +''' % modulename #XXX: author and email are hardwired in the header + doc = info.get('doc', None) + version = info.get('version', None) + author = info.get('author', None) + license = info.get('license', None) + with open(infofile, 'w') as fh: + fh.write(header) + if doc is not None: fh.write("'''%s'''\n\n" % doc) + if version is not None: fh.write("__version__ = %r\n" % version) + if author is not None: fh.write("__author__ = %r\n\n" % author) + if license is not None: fh.write("__license__ = '''\n%s'''\n" % license) + return From 51c23bca130ee0f65338a9a3187b29af3d288510 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Wed, 13 Jul 2022 01:02:13 -0300 Subject: [PATCH 014/109] fixes --- dill/_utils.py | 25 ++++--- dill/session.py | 137 +++++++++++++++++-------------------- dill/settings.py | 17 +++-- dill/tests/test_session.py | 23 +++---- 4 files changed, 101 insertions(+), 101 deletions(-) diff --git a/dill/_utils.py b/dill/_utils.py index 92ccb2a2..7da18859 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -8,11 +8,12 @@ from __future__ import annotations -__all__ = ['FilterRules', 'Filter', 'RuleType'] +__all__ = ['FilterRules', 'Filter', 'RuleType', '_open'] import logging logger = logging.getLogger('dill._utils') +import contextlib import re from dataclasses import dataclass, field, fields from collections import namedtuple @@ -23,6 +24,16 @@ from types import ModuleType from typing import Any, Callable, Dict, Iterable, Pattern, Set, Tuple, Union +def _open(filename, mode): + """return a context manager with an opened file""" + attr = 'write' if 'w' in mode else 'read' + if hasattr(filename, attr): + return contextlib.nullcontext(filename) + else: + return open(filename, mode) + +# Namespace filtering. 
+ Filter = Union[str, Pattern[str], int, type, Callable] RuleType = Enum('RuleType', 'EXCLUDE INCLUDE', module=__name__) Rule = Tuple[RuleType, Union[Filter, Iterable[Filter]]] @@ -163,12 +174,10 @@ def __repr__(self): return sep.join(desc).replace("set()", "{}") + ">" # Proxy add(), discard(), remove() and clear() to FilterSets. def __proxy__(self, method, filter, *, rule_type=RuleType.EXCLUDE): - if rule_type is RuleType.EXCLUDE: - getattr(self.exclude, method)(filter) - elif rule_type is RuleType.INCLUDE: - getattr(self.include, method)(filter) - else: + if not isinstance(rule_type, RuleType): raise ValueError("invalid rule type: %r (must be one of %r)" % (rule_type, list(RuleType))) + filter_set = getattr(self, rule_type.name.lower()) + getattr(filter_set, method)(filter) add = partialmethod(__proxy__, 'add') discard = partialmethod(__proxy__, 'discard') remove = partialmethod(__proxy__, 'remove') @@ -194,13 +203,13 @@ def update(self, rules: Union[Iterable[Rule], FilterRules]): else: self.add(filter, rule_type=rule_type) - def filter_vars(self, namespace: Dict[str, Any], obj: ModuleType = None): + def filter_vars(self, namespace: Dict[str, Any]): """Apply filters to dictionary with names as keys.""" if not self.exclude and not self.include: return namespace # Protect agains dict changes during the call. - namespace_copy = namespace.copy() if obj is None or namespace is obj.__dict__ else namespace + namespace_copy = namespace.copy() objects = all_objects = [NamedObj._make(item) for item in namespace_copy.items()] for filters in (self.exclude, self.include): diff --git a/dill/session.py b/dill/session.py index d544aeb4..ed53f516 100644 --- a/dill/session.py +++ b/dill/session.py @@ -22,6 +22,8 @@ import re import sys import tempfile +import warnings +from contextlib import suppress from statistics import mean from types import SimpleNamespace @@ -35,7 +37,7 @@ # Type hints. 
from typing import Iterable, Optional, Union -from ._utils import Filter +from ._utils import Filter, _open EXCLUDE, INCLUDE = RuleType.EXCLUDE, RuleType.INCLUDE @@ -73,42 +75,44 @@ def _lookup_module(modmap, name, obj, main_module): return modname, objname return None, None -def _stash_modules(main_module): +def _stash_modules(main_module, original_main): modmap = _module_map() newmod = ModuleType(main_module.__name__) imported = [] imported_as = [] - imported_top_level = [] # keep separeted for backwards compatibility + imported_top_level = [] # keep separeted for backward compatibility original = {} for name, obj in main_module.__dict__.items(): if obj is main_module: original[name] = newmod # self-reference - continue - + elif obj is main_module.__dict__: + original[name] = newmod.__dict__ # Avoid incorrectly matching a singleton value in another package (ex.: __doc__). - if any(obj is singleton for singleton in (None, False, True)) or \ - isinstance(obj, ModuleType) and _is_builtin_module(obj): # always saved by ref + elif any(obj is singleton for singleton in (None, False, True)) \ + or isinstance(obj, ModuleType) and _is_builtin_module(obj): # always saved by ref original[name] = obj - continue - - source_module, objname = _lookup_module(modmap, name, obj, main_module) - if source_module: - if objname == name: - imported.append((source_module, name)) - else: - imported_as.append((source_module, objname, name)) else: - try: - imported_top_level.append((modmap.top_level[id(obj)], name)) - except KeyError: - original[name] = obj + source_module, objname = _lookup_module(modmap, name, obj, main_module=original_main) + if source_module: + if objname == name: + imported.append((source_module, name)) + else: + imported_as.append((source_module, objname, name)) + else: + try: + imported_top_level.append((modmap.top_level[id(obj)], name)) + except KeyError: + original[name] = obj if len(original) < len(main_module.__dict__): newmod.__dict__.update(original) 
newmod.__dill_imported = imported newmod.__dill_imported_as = imported_as newmod.__dill_imported_top_level = imported_top_level + if getattr(newmod, '__loader__', None) is None and _is_imported_module(main_module): + # Trick _is_imported_module() to force saving as an imported module. + newmod.__loader__ = True # will be discarded by save_module() return newmod else: return main_module @@ -124,24 +128,32 @@ def _restore_modules(unpickler, main_module): except KeyError: pass -def _filter_vars(main, exclude, include, obj=None): - rules = FilterRules(getattr(settings, 'dump_module', None)) +def _filter_vars(main, default_rules, exclude, include): + rules = FilterRules() + mod_rules = default_rules.get(main.__name__, default_rules) + rules.exclude |= mod_rules.get_filters(EXCLUDE) + rules.include |= mod_rules.get_filters(INCLUDE) if exclude is not None: rules.update([(EXCLUDE, exclude)]) if include is not None: rules.update([(INCLUDE, include)]) - namespace = rules.filter_vars(main.__dict__, obj=obj) + namespace = rules.filter_vars(main.__dict__) if namespace is main.__dict__: return main - main = ModuleType(main.__name__) - main.__dict__.update(namespace) - return main + newmod = ModuleType(main.__name__) + newmod.__dict__.update(namespace) + for name, obj in namespace.items(): + if obj is main: + setattr(newmod, name, newmod) + elif obj is main.__dict__: + setattr(newmod, name, newmod.__dict__) + return newmod def dump_module( filename = str(TEMPDIR/'session.pkl'), - module: Union[str, ModuleType] = '__main__', + module: Union[str, ModuleType] = None, refimported: bool = False, exclude: Union[Filter, Iterable[Filter]] = None, include: Union[Filter, Iterable[Filter]] = None, @@ -210,8 +222,6 @@ def dump_module( ``dump_module()``. Parameters ``main`` and ``byref`` were renamed to ``module`` and ``refimported``, respectively. 
""" - from .settings import settings - protocol = settings['protocol'] for old_par, par in [('main', 'module'), ('byref', 'refimported')]: if old_par in kwds: message = "The argument %r has been renamed %r" % (old_par, par) @@ -222,35 +232,29 @@ def dump_module( raise TypeError("both %r and %r arguments were used" % (par, old_par)) refimported = kwds.pop('byref', refimported) module = kwds.pop('main', module) + + from .settings import settings + protocol = settings['protocol'] + default_rules = settings['dump_module'] main = module if main is None: main = _main_module elif isinstance(main, str): main = _import_module(main) original_main = main + main = _filter_vars(main, default_rules, exclude, include) if refimported: - main = _stash_modules(main) - main = _filter_vars(main, exclude, include, obj=original_main) - if hasattr(filename, 'write'): - file = filename - else: - file = open(filename, 'wb') - try: + main = _stash_modules(main, original_main) + with _open(filename, 'wb') as file: pickler = Pickler(file, protocol, **kwds) - pickler._original_main = main - if refimported: - main = _stash_modules(main) + if main is not original_main: + pickler._original_main = original_main pickler._main = main #FIXME: dill.settings are disabled pickler._byref = False # disable pickling by name reference pickler._recurse = False # disable pickling recursion for globals pickler._session = True # is best indicator of when pickling a session pickler._first_pass = True - if main is not original_main: - pickler._original_main = original_main pickler.dump(main) - finally: - if file is not filename: # if newly opened file - file.close() return # Backward compatibility. 
@@ -426,19 +430,15 @@ def load_module( raise TypeError("both 'module' and 'main' arguments were used") module = kwds.pop('main') main = module - if hasattr(filename, 'read'): - file = filename - else: - file = open(filename, 'rb') - try: + with _open(filename, 'rb') as file: file = _make_peekable(file) #FIXME: dill.settings are disabled unpickler = Unpickler(file, **kwds) unpickler._main = main unpickler._session = True - pickle_main = _identify_module(file, main) # Resolve unpickler._main + pickle_main = _identify_module(file, main) if main is None and pickle_main is not None: main = pickle_main if isinstance(main, str): @@ -476,19 +476,16 @@ def load_module( % (pickle_main, main.__name__) ) - # This is for find_class() to be able to locate it. - if not is_main_imported: - runtime_main = '__runtime__.%s' % main.__name__ - sys.modules[runtime_main] = main - - loaded = unpickler.load() - finally: - if not hasattr(filename, 'read'): # if newly opened file - file.close() try: - del sys.modules[runtime_main] - except (KeyError, NameError): - pass + # This is for find_class() to be able to locate it. 
+ if not is_main_imported: + runtime_main = '__runtime__.%s' % main.__name__ + sys.modules[runtime_main] = main + loaded = unpickler.load() + finally: + with suppress(KeyError, NameError): + del sys.modules[runtime_main] + assert loaded is main _restore_modules(unpickler, main) if main is _main_module or main is module: @@ -556,11 +553,7 @@ def load_module_asdict( """ if 'module' in kwds: raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") - if hasattr(filename, 'read'): - file = filename - else: - file = open(filename, 'rb') - try: + with _open(filename, 'rb') as file: file = _make_peekable(file) main_name = _identify_module(file) old_main = sys.modules.get(main_name) @@ -571,19 +564,15 @@ def load_module_asdict( main.__dict__.update(old_main.__dict__) else: main.__builtins__ = builtins - sys.modules[main_name] = main - load_module(file, **kwds) - main.__session__ = str(filename) - finally: - if not hasattr(filename, 'read'): # if newly opened file - file.close() try: + sys.modules[main_name] = main + load_module(file, **kwds) + finally: if old_main is None: del sys.modules[main_name] else: sys.modules[main_name] = old_main - except NameError: # failed before setting old_main - pass + main.__session__ = str(filename) return main.__dict__ diff --git a/dill/settings.py b/dill/settings.py index 19ef8e4b..50f6862e 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -14,7 +14,7 @@ __all__ = ['settings', 'ModuleRules'] from pickle import DEFAULT_PROTOCOL -from ._utils import FilterRules +from ._utils import FilterRules, RuleType settings = { #'main' : None, @@ -74,15 +74,20 @@ def __getitem__(self, name): return mod_rules else: return mod_rules[submodules] - def get_filters(self, name): - if name not in self._fields: - raise ValueError("invalid name %r (must be one of %r)" % (name, self._fields)) + def get(self, name: str, default: ModuleRules = None): try: - return getattr(self, name) + return self[name] + except AttributeError: + 
return default + def get_filters(self, rule_type: RuleType): + if not isinstance(rule_type, RuleType): + raise ValueError("invalid rule type: %r (must be one of %r)" % (rule_type, list(RuleType))) + try: + return getattr(self, rule_type.name.lower()) except AttributeError: # 'self' is a placeholder, 'exclude' and 'include' are unset. if self._parent is None: raise - return self._parent.get_filters(name) + return self._parent.get_filters(rule_type) settings['dump_module'] = ModuleRules('DEFAULT', rules=()) diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index 2d7729bb..5480dfde 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -9,6 +9,7 @@ import os import sys import __main__ +from contextlib import suppress from io import BytesIO import dill @@ -27,7 +28,7 @@ def _error_line(error, obj, refimported): if __name__ == '__main__' and len(sys.argv) >= 3 and sys.argv[1] == '--child': # Test session loading in a fresh interpreter session. refimported = (sys.argv[2] == 'True') - dill.load_module(session_file % refimported) + dill.load_module(session_file % refimported, module='__main__') def test_modules(refimported): # FIXME: In this test setting with CPython 3.7, 'calendar' is not included @@ -111,10 +112,8 @@ def _clean_up_cache(module): cached = module.__cached__ if hasattr(module, '__cached__') else cached pycache = os.path.join(os.path.dirname(module.__file__), '__pycache__') for remove, file in [(os.remove, cached), (os.removedirs, pycache)]: - try: + with suppress(OSError): remove(file) - except OSError: - pass atexit.register(_clean_up_cache, local_mod) @@ -163,16 +162,14 @@ def test_session_main(refimported): error = sp.call([python, __file__, '--child', str(refimported)], shell=shell) if error: sys.exit(error) finally: - try: + with suppress(OSError): os.remove(session_file % refimported) - except OSError: - pass # Test session loading in the same session. 
session_buffer = BytesIO() dill.dump_module(session_buffer, refimported=refimported) session_buffer.seek(0) - dill.load_module(session_buffer) + dill.load_module(session_buffer, module='__main__') ns.backup['_test_objects'](__main__, ns.backup, refimported) def test_session_other(): @@ -183,13 +180,13 @@ def test_session_other(): dict_objects = [obj for obj in module.__dict__.keys() if not obj.startswith('__')] session_buffer = BytesIO() - dill.dump_module(session_buffer, main=module) + dill.dump_module(session_buffer, module) for obj in dict_objects: del module.__dict__[obj] session_buffer.seek(0) - dill.load_module(session_buffer) #, main=module) + dill.load_module(session_buffer, module) assert all(obj in module.__dict__ for obj in dict_objects) assert module.selfref is module @@ -200,7 +197,7 @@ def test_runtime_module(): runtime = ModuleType(modname) runtime.x = 42 - mod = dill.session._stash_modules(runtime) + mod = dill.session._stash_modules(runtime, runtime) if mod is not runtime: print("There are objects to save by referenece that shouldn't be:", mod.__dill_imported, mod.__dill_imported_as, mod.__dill_imported_top_level, @@ -210,12 +207,12 @@ def test_runtime_module(): # without imported objects in the namespace. It's a contrived example because # even dill can't be in it. This should work after fixing #462. session_buffer = BytesIO() - dill.dump_module(session_buffer, main=runtime, refimported=True) + dill.dump_module(session_buffer, module=runtime, refimported=True) session_dump = session_buffer.getvalue() # Pass a new runtime created module with the same name. 
runtime = ModuleType(modname) # empty - return_val = dill.load_module(BytesIO(session_dump), main=runtime) + return_val = dill.load_module(BytesIO(session_dump), module=runtime) assert return_val is None assert runtime.__name__ == modname assert runtime.x == 42 From e6adf30ce349333617788434d6a7503f7a02953b Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 14 Jul 2022 17:40:08 -0300 Subject: [PATCH 015/109] sync with master --- .gitignore | 2 +- dill/session.py | 65 +++++++++++++++++++++++--------------- dill/tests/test_session.py | 24 ++++++++++++++ 3 files changed, 64 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index 9e136965..477f7cec 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,4 @@ /docs/build /build /README -/dill/info.py \ No newline at end of file +/dill/__info__.py diff --git a/dill/session.py b/dill/session.py index ed53f516..fb4f4834 100644 --- a/dill/session.py +++ b/dill/session.py @@ -30,7 +30,8 @@ from dill import _dill, Pickler, Unpickler from ._dill import ( BuiltinMethodType, FunctionType, MethodType, ModuleType, TypeType, - _import_module, _is_builtin_module, _is_imported_module, _main_module + _import_module, _is_builtin_module, _is_imported_module, _main_module, + _reverse_typemap, ) from ._utils import FilterRules, RuleType from .settings import settings @@ -41,9 +42,6 @@ EXCLUDE, INCLUDE = RuleType.EXCLUDE, RuleType.INCLUDE -SESSION_IMPORTED_AS_TYPES = (BuiltinMethodType, FunctionType, MethodType, - ModuleType, TypeType) - TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) def _module_map(): @@ -55,7 +53,7 @@ def _module_map(): top_level={}, ) for modname, module in sys.modules.items(): - if not isinstance(module, ModuleType): + if modname in ('__main__', '__mp_main__') or not isinstance(module, ModuleType): continue if '.' 
not in modname: modmap.top_level[id(module)] = modname @@ -64,12 +62,23 @@ def _module_map(): modmap.by_id[id(modobj)].append((modobj, objname, modname)) return modmap +IMPORTED_AS_TYPES = (ModuleType, TypeType, FunctionType, MethodType, BuiltinMethodType) +PyCapsuleType = _reverse_typemap.get('PyCapsuleType') +if PyCapsuleType is not None: IMPORTED_AS_TYPES += (PyCapsuleType,) + +IMPORTED_AS_MODULES = [re.compile(x) for x in ( + 'ctypes', 'typing', 'subprocess', 'threading', + r'concurrent\.futures(\.\w+)?', r'multiprocessing(\.\w+)?' +)] + def _lookup_module(modmap, name, obj, main_module): """lookup name or id of obj if module is imported""" for modobj, modname in modmap.by_name[name]: if modobj is obj and sys.modules[modname] is not main_module: return modname, name - if isinstance(obj, SESSION_IMPORTED_AS_TYPES): + __module__ = getattr(obj, '__module__', None) + if isinstance(obj, IMPORTED_AS_TYPES) or (__module__ is not None + and any(regex.fullmatch(__module__) for regex in IMPORTED_AS_MODULES)): for modobj, objname, modname in modmap.by_id[id(obj)]: if sys.modules[modname] is not main_module: return modname, objname @@ -94,7 +103,7 @@ def _stash_modules(main_module, original_main): original[name] = obj else: source_module, objname = _lookup_module(modmap, name, obj, main_module=original_main) - if source_module: + if source_module is not None: if objname == name: imported.append((source_module, name)) else: @@ -153,7 +162,7 @@ def _filter_vars(main, default_rules, exclude, include): def dump_module( filename = str(TEMPDIR/'session.pkl'), - module: Union[str, ModuleType] = None, + module: Union[ModuleType, str] = None, refimported: bool = False, exclude: Union[Filter, Iterable[Filter]] = None, include: Union[Filter, Iterable[Filter]] = None, @@ -168,13 +177,13 @@ def dump_module( Parameters: filename: a path-like object or a writable stream. - module: a module object or the name of an importable module. 
If `None`, - (the default) :py:mod:`__main__` will be saved. - refimported: if `True`, all objects imported into the module's - namespace are saved by reference. *Note:* this is similar but - independent from ``dill.settings[`byref`]``, as ``refimported`` - refers to all imported objects, while ``byref`` only affects - select objects. + module: a module object or the name of an importable module. If `None` + (the default), :py:mod:`__main__` is saved. + refimported: if `True`, all objects identified as having been imported + into the module's namespace are saved by reference. *Note:* this is + similar but independent from ``dill.settings[`byref`]``, as + ``refimported`` refers to virtually all imported objects, while + ``byref`` only affects select objects. **kwds: extra keyword arguments passed to :py:class:`Pickler()`. Raises: @@ -221,6 +230,10 @@ def dump_module( *Changed in version 0.3.6:* Function ``dump_session()`` was renamed to ``dump_module()``. Parameters ``main`` and ``byref`` were renamed to ``module`` and ``refimported``, respectively. + + Note: + Currently, ``dill.settings['byref']`` and ``dill.settings['recurse']`` + don't apply to this function.` """ for old_par, par in [('main', 'module'), ('byref', 'refimported')]: if old_par in kwds: @@ -342,7 +355,9 @@ def load_module( Parameters: filename: a path-like object or a readable stream. - module: a module object or the name of an importable module. + module: a module object or the name of an importable module, either of + which must match the name and kind (importable or module-type + object) of the session file's module. **kwds: extra keyword arguments passed to :py:class:`Unpickler()`. 
Raises: @@ -449,7 +464,7 @@ def load_module( main = _import_module(main) if main is not None: if not isinstance(main, ModuleType): - raise ValueError("%r is not a module" % main) + raise TypeError("%r is not a module" % main) unpickler._main = main else: main = unpickler._main @@ -460,20 +475,18 @@ def load_module( is_runtime_mod = pickle_main.startswith('__runtime__.') if is_runtime_mod: pickle_main = pickle_main.partition('.')[-1] + error_msg = "can't update{} module{} %r with the saved state of{} module{} %r" + if main.__name__ != pickle_main: + raise ValueError(error_msg.format("", "", "", "") % (main.__name__, pickle_main)) if is_runtime_mod and is_main_imported: raise ValueError( - "can't restore non-imported module %r into an imported one" - % pickle_main + error_msg.format(" imported", "", "", "-type object") + % (main.__name__, main.__name__) ) if not is_runtime_mod and not is_main_imported: raise ValueError( - "can't restore imported module %r into a non-imported one" - % pickle_main - ) - if main.__name__ != pickle_main: - raise ValueError( - "can't restore module %r into module %r" - % (pickle_main, main.__name__) + error_msg.format("", "-type object", " imported", "") + % (main.__name__, main.__name__) ) try: diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index 5480dfde..3cd8a4bb 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -225,6 +225,29 @@ def test_runtime_module(): assert runtime.x == 42 assert runtime not in sys.modules.values() +def test_refimported_imported_as(): + import collections + import concurrent.futures + import types + import typing + mod = sys.modules['__test__'] = types.ModuleType('__test__') + dill.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) + mod.Dict = collections.UserDict # select by type + mod.AsyncCM = typing.AsyncContextManager # select by __module__ + mod.thread_exec = dill.executor # select by __module__ with regex + + session_buffer = BytesIO() + 
dill.dump_module(session_buffer, mod, refimported=True) + session_buffer.seek(0) + mod = dill.load(session_buffer) + del sys.modules['__test__'] + + assert set(mod.__dill_imported_as) == { + ('collections', 'UserDict', 'Dict'), + ('typing', 'AsyncContextManager', 'AsyncCM'), + ('dill', 'executor', 'thread_exec'), + } + def test_load_module_asdict(): with TestNamespace(): session_buffer = BytesIO() @@ -253,4 +276,5 @@ def test_load_module_asdict(): test_session_main(refimported=True) test_session_other() test_runtime_module() + test_refimported_imported_as() test_load_module_asdict() From 06f9f54c0af423fa74dd2403af36bd3d440b7e14 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 14 Jul 2022 18:48:22 -0300 Subject: [PATCH 016/109] unroll the filtering loop --- dill/_utils.py | 75 +++++++++++++++++++++---------------------------- dill/session.py | 9 ++++++ 2 files changed, 41 insertions(+), 43 deletions(-) diff --git a/dill/_utils.py b/dill/_utils.py index 7da18859..96c0e068 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -10,9 +10,6 @@ __all__ = ['FilterRules', 'Filter', 'RuleType', '_open'] -import logging -logger = logging.getLogger('dill._utils') - import contextlib import re from dataclasses import dataclass, field, fields @@ -203,56 +200,48 @@ def update(self, rules: Union[Iterable[Rule], FilterRules]): else: self.add(filter, rule_type=rule_type) + def _apply_filters(filter_set, objects): + filters = [] + types_list = tuple(filter_set.types) + # Apply broader/cheaper filters first. 
+ if types_list: + filters.append(lambda obj: isinstance(obj.value, types_list)) + if filter_set.ids: + filters.append(lambda obj: id(obj.value) in filter_set.ids) + if filter_set.names: + filters.append(lambda obj: obj.name in filter_set.names) + if filter_set.regexes: + filters.append(lambda obj: any(regex.fullmatch(obj.name) for regex in filter_set.regexes)) + filters.extend(filter_set.funcs) + for filter in filters: + objects = filterfalse(filter, objects) + return objects + def filter_vars(self, namespace: Dict[str, Any]): """Apply filters to dictionary with names as keys.""" - if not self.exclude and not self.include: + if not namespace or not (self.exclude or self.include): return namespace - # Protect agains dict changes during the call. namespace_copy = namespace.copy() - objects = all_objects = [NamedObj._make(item) for item in namespace_copy.items()] - - for filters in (self.exclude, self.include): - if filters is self.exclude and not filters: - # Treat the rule set as an allowlist. - exclude_objs = objects - continue - elif filters is self.include: - if not filters or not exclude_objs: - break - objects = exclude_objs - - flist = [] - types_list = tuple(filters.types) - # Apply cheaper/broader filters first. - if types_list: - flist.append(lambda obj: isinstance(obj.value, types_list)) - if filters.ids: - flist.append(lambda obj: id(obj.value) in filters.ids) - if filters.names: - flist.append(lambda obj: obj.name in filters.names) - if filters.regexes: - flist.append(lambda obj: any(regex.fullmatch(obj.name) for regex in filters.regexes)) - flist.extend(filters.funcs) - for f in flist: - objects = filterfalse(f, objects) - - if filters is self.exclude: - include_names = {obj.name for obj in objects} - exclude_objs = [obj for obj in all_objects if obj.name not in include_names] - else: - exclude_objs = list(objects) + all_objs = [NamedObj._make(item) for item in namespace_copy.items()] + if not self.exclude: + # Treat this rule set as an allowlist. 
+ exclude_objs = all_objs + else: + include_names = {obj.name for obj in self._apply_filters(self.exclude, all_objs)} + exclude_objs = [obj for obj in all_objs if obj.name not in include_names] + if self.include and exclude_objs: + exclude_objs = list(self._apply_filters(self.include, exclude_objs)) if not exclude_objs: return namespace + if len(exclude_objs) == len(namespace): - warnings.warn("filtering operation left the namespace empty!", PicklingWarning) + warnings.warn( + "the exclude/include rules applied have excluded all the %d items" % len(all_objects), + PicklingWarning + ) return {} - if logger.isEnabledFor(logging.INFO): - exclude_listing = {obj.name: type(obj.value).__name__ for obj in sorted(exclude_objs)} - exclude_listing = str(exclude_listing).translate({ord(","): "\n", ord("'"): None}) - logger.info("Objects excluded from dump_session():\n%s\n", exclude_listing) - for obj in exclude_objs: del namespace_copy[obj.name] return namespace_copy diff --git a/dill/session.py b/dill/session.py index fb4f4834..d5189c5d 100644 --- a/dill/session.py +++ b/dill/session.py @@ -16,6 +16,9 @@ 'dump_session', 'load_session' # backward compatibility ] +import logging +logger = logging.getLogger('dill.session') + import builtins import pathlib import random @@ -151,6 +154,12 @@ def _filter_vars(main, default_rules, exclude, include): if namespace is main.__dict__: return main + if logger.isEnabledFor(logging.INFO): + excluded = {name: type(value).__name__ + for name, value in sorted(main.__dict__.items()) if name not in namespace} + excluded = str(excluded).translate({ord(","): "\n", ord("'"): None}) + logger.info("Objects excluded from dump_session():\n%s\n", excluded) + newmod = ModuleType(main.__name__) newmod.__dict__.update(namespace) for name, obj in namespace.items(): From ab13325d1c42a186d987c2d424a2412fbd2b151c Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 16 Jul 2022 11:43:23 -0300 Subject: [PATCH 017/109] code formatting changes --- 
dill/_dill.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 1c8af8dc..12daabe7 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -346,9 +346,11 @@ def _module_map(): return modmap SESSION_IMPORTED_AS_TYPES = (ModuleType, TypeType, FunctionType, MethodType, BuiltinMethodType) -SESSION_IMPORTED_AS_MODULES = ('ctypes', 'typing', 'subprocess', 'threading', - r'concurrent\.futures(\.\w+)?', r'multiprocessing(\.\w+)?') -SESSION_IMPORTED_AS_MODULES = tuple(re.compile(x) for x in SESSION_IMPORTED_AS_MODULES) + +SESSION_IMPORTED_AS_MODULES = [re.compile(x) for x in ( + 'ctypes', 'typing', 'subprocess', 'threading', + r'concurrent\.futures(\.\w+)?', r'multiprocessing(\.\w+)?' +)] def _lookup_module(modmap, name, obj, main_module): """lookup name or id of obj if module is imported""" @@ -740,18 +742,18 @@ def load_module( if is_runtime_mod: pickle_main = pickle_main.partition('.')[-1] error_msg = "can't update{} module{} %r with the saved state of{} module{} %r" + if main.__name__ != pickle_main: + raise ValueError(error_msg.format("", "", "", "") % (main.__name__, pickle_main)) if is_runtime_mod and is_main_imported: raise ValueError( error_msg.format(" imported", "", "", "-type object") - % (main.__name__, pickle_main) + % (main.__name__, main.__name__) ) if not is_runtime_mod and not is_main_imported: raise ValueError( error_msg.format("", "-type object", " imported", "") - % (pickle_main, main.__name__) + % (main.__name__, main.__name__) ) - if main.__name__ != pickle_main: - raise ValueError(error_msg.format("", "", "", "") % (main.__name__, pickle_main)) # This is for find_class() to be able to locate it. 
if not is_main_imported: From a829849371ba73c534cc6838fd4ca8251d63533c Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 16 Jul 2022 11:27:19 -0300 Subject: [PATCH 018/109] sync with master --- dill/session.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/dill/session.py b/dill/session.py index d5189c5d..92bea292 100644 --- a/dill/session.py +++ b/dill/session.py @@ -93,7 +93,7 @@ def _stash_modules(main_module, original_main): imported = [] imported_as = [] - imported_top_level = [] # keep separeted for backward compatibility + imported_top_level = [] # keep separated for backward compatibility original = {} for name, obj in main_module.__dict__.items(): if obj is main_module: @@ -211,7 +211,7 @@ def dump_module( >>> import dill >>> import pox >>> pox.plus_one = lambda x: x+1 - >>> dill.dump_module('pox_session.pkl', main=pox) + >>> dill.dump_module('pox_session.pkl', module=pox) - Save the state of a non-importable, module-type object: @@ -221,7 +221,7 @@ def dump_module( >>> foo.values = [1,2,3] >>> import math >>> foo.sin = math.sin - >>> dill.dump_module('foo_session.pkl', main=foo, refimported=True) + >>> dill.dump_module('foo_session.pkl', module=foo, refimported=True) - Restore the state of the saved modules: @@ -263,6 +263,8 @@ def dump_module( main = _main_module elif isinstance(main, str): main = _import_module(main) + if not isinstance(main, ModuleType): + raise TypeError("%r is not a module" % main) original_main = main main = _filter_vars(main, default_rules, exclude, include) if refimported: @@ -322,7 +324,7 @@ def _make_peekable(stream): return _PeekableReader(stream) def _identify_module(file, main=None): - """identify the session file's module name""" + """identify the name of the module stored in the given file-type object""" from pickletools import genops UNICODE = {'UNICODE', 'BINUNICODE', 'SHORT_BINUNICODE'} found_import = False @@ -349,8 +351,8 @@ def load_module( module: 
Union[ModuleType, str] = None, **kwds ) -> Optional[ModuleType]: - """Update :py:mod:`__main__` or another module with the state from the - session file. + """Update the selected module (default is :py:mod:`__main__`) with + the state saved at ``filename``. Restore a module to the state saved with :py:func:`dump_module`. The saved module can be :py:mod:`__main__` (e.g. an interpreter session), @@ -364,9 +366,9 @@ def load_module( Parameters: filename: a path-like object or a readable stream. - module: a module object or the name of an importable module, either of - which must match the name and kind (importable or module-type - object) of the session file's module. + module: a module object or the name of an importable module; + the module name and kind (i.e. imported or non-imported) must + match the name and kind of the module stored at ``filename``. **kwds: extra keyword arguments passed to :py:class:`Unpickler()`. Raises: @@ -388,14 +390,14 @@ def load_module( >>> >>> import pox # an imported module >>> pox.plus_one = lambda x: x+1 - >>> dill.dump_module('pox_session.pkl', main=pox) + >>> dill.dump_module('pox_session.pkl', module=pox) >>> >>> from types import ModuleType >>> foo = ModuleType('foo') # a module-type object >>> foo.values = [1,2,3] >>> import math >>> foo.sin = math.sin - >>> dill.dump_module('foo_session.pkl', main=foo, refimported=True) + >>> dill.dump_module('foo_session.pkl', module=foo, refimported=True) - Restore the state of the interpreter: @@ -434,7 +436,7 @@ def load_module( >>> foo = ModuleType('foo') >>> foo.values = ['a','b'] >>> foo.sin = lambda x: x*x - >>> dill.load_module('foo_session.pkl', main=foo) + >>> dill.load_module('foo_session.pkl', module=foo) >>> [foo.sin(x) for x in foo.values] [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] @@ -549,8 +551,11 @@ def load_module_asdict( A copy of the restored module's dictionary. Note: - If ``update`` is True, the saved module may be imported then updated. 
- If imported, the loaded module remains unchanged as in the general case. + If ``update`` is True, the corresponding module may first be imported + into the current namespace before the saved state is loaded from + filename to the dictionary. Note that any module that is imported into + the current namespace as a side-effect of using ``update`` will not be + modified by loading the saved module in filename to a dictionary. Example: >>> import dill From 33ca2ed80907a0fe6ead78855a12e36263ef8362 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 16 Jul 2022 12:31:29 -0300 Subject: [PATCH 019/109] remove unnecessary '_main_modified' attribute from pickler --- dill/_dill.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 12daabe7..da64ac25 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -512,21 +512,22 @@ def dump_module( main = _import_module(main) if not isinstance(main, ModuleType): raise TypeError("%r is not a module" % main) + original_main = main + if refimported: + main = _stash_modules(main) if hasattr(filename, 'write'): file = filename else: file = open(filename, 'wb') try: pickler = Pickler(file, protocol, **kwds) - pickler._original_main = main - if refimported: - main = _stash_modules(main) + if main is not original_main: + pickler._original_main = original_main pickler._main = main #FIXME: dill.settings are disabled pickler._byref = False # disable pickling by name reference pickler._recurse = False # disable pickling recursion for globals pickler._session = True # is best indicator of when pickling a session pickler._first_pass = True - pickler._main_modified = main is not pickler._original_main pickler.dump(main) finally: if file is not filename: # if newly opened file @@ -2317,8 +2318,7 @@ def save_function(pickler, obj): logger.trace(pickler, "F1: %s", obj) _recurse = getattr(pickler, '_recurse', None) _postproc = getattr(pickler, '_postproc', None) - _main_modified = 
getattr(pickler, '_main_modified', None) - _original_main = getattr(pickler, '_original_main', __builtin__)#'None' + _original_main = getattr(pickler, '_original_main', None) postproc_list = [] if _recurse: # recurse to get all globals referred to by obj @@ -2335,7 +2335,7 @@ def save_function(pickler, obj): # If the globals is the __dict__ from the module being saved as a # session, substitute it by the dictionary being actually saved. - if _main_modified and globs_copy is _original_main.__dict__: + if _original_main is not None and globs_copy is _original_main.__dict__: globs_copy = getattr(pickler, '_main', _original_main).__dict__ globs = globs_copy # If the globals is a module __dict__, do not save it in the pickle. From ad8db21fd674d06815237559e37bae3b91505694 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 16 Jul 2022 13:03:32 -0300 Subject: [PATCH 020/109] new _open() function to handle file names and file-like objects --- dill/_dill.py | 64 +++++++++++++++++++++------------------------------ 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index da64ac25..6799125d 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -320,13 +320,23 @@ def loads(str, ignore=None, **kwds): ### End: Shorthands ### ### Pickle the Interpreter Session +import contextlib import pathlib import re import tempfile +from contextlib import suppress from types import SimpleNamespace TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) +def _open(file, mode): + """return a context manager with an opened file-like object""" + attr = 'write' if 'w' in mode else 'read' + if not hasattr(file, attr): + return open(file, mode) + else: + return contextlib.nullcontext(file) + def _module_map(): """get map of imported modules""" from collections import defaultdict @@ -515,11 +525,7 @@ def dump_module( original_main = main if refimported: main = _stash_modules(main) - if hasattr(filename, 'write'): - file = filename - else: - file = open(filename, 
'wb') - try: + with _open(filename, 'wb') as file: pickler = Pickler(file, protocol, **kwds) if main is not original_main: pickler._original_main = original_main @@ -529,9 +535,6 @@ def dump_module( pickler._session = True # is best indicator of when pickling a session pickler._first_pass = True pickler.dump(main) - finally: - if file is not filename: # if newly opened file - file.close() return # Backward compatibility. @@ -709,11 +712,7 @@ def load_module( raise TypeError("both 'module' and 'main' arguments were used") module = kwds.pop('main') main = module - if hasattr(filename, 'read'): - file = filename - else: - file = open(filename, 'rb') - try: + with _open(filename, 'rb') as file: file = _make_peekable(file) #FIXME: dill.settings are disabled unpickler = Unpickler(file, **kwds) @@ -756,19 +755,16 @@ def load_module( % (main.__name__, main.__name__) ) - # This is for find_class() to be able to locate it. - if not is_main_imported: - runtime_main = '__runtime__.%s' % main.__name__ - sys.modules[runtime_main] = main - - loaded = unpickler.load() - finally: - if not hasattr(filename, 'read'): # if newly opened file - file.close() try: - del sys.modules[runtime_main] - except (KeyError, NameError): - pass + if not is_main_imported: + # This is for find_class() to be able to locate it. 
+ runtime_main = '__runtime__.%s' % main.__name__ + sys.modules[runtime_main] = main + loaded = unpickler.load() + finally: + if not is_main_imported: + del sys.modules[runtime_main] + assert loaded is main _restore_modules(unpickler, main) if main is _main_module or main is module: @@ -839,11 +835,7 @@ def load_module_asdict( """ if 'module' in kwds: raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") - if hasattr(filename, 'read'): - file = filename - else: - file = open(filename, 'rb') - try: + with _open(filename, 'rb') as file: file = _make_peekable(file) main_name = _identify_module(file) old_main = sys.modules.get(main_name) @@ -854,18 +846,14 @@ def load_module_asdict( main.__dict__.update(old_main.__dict__) else: main.__builtins__ = __builtin__ - sys.modules[main_name] = main - load_module(file, **kwds) - finally: - if not hasattr(filename, 'read'): # if newly opened file - file.close() try: + sys.modules[main_name] = main + load_module(file, **kwds) + finally: if old_main is None: del sys.modules[main_name] else: sys.modules[main_name] = old_main - except NameError: # failed before setting old_main - pass main.__session__ = str(filename) return main.__dict__ @@ -914,7 +902,7 @@ def __init__(self, file, *args, **kwds): self._fmode = settings['fmode'] if _fmode is None else _fmode self._recurse = settings['recurse'] if _recurse is None else _recurse self._postproc = OrderedDict() - self._file = file + self._file = file # for the logger def dump(self, obj): #NOTE: if settings change, need to update attributes # register if the object is a numpy ufunc From da4cc072c0ef5d644acee636f8de4f7eb7ebf482 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 16 Jul 2022 13:42:01 -0300 Subject: [PATCH 021/109] merge function _make_peekable() with _open() --- dill/_dill.py | 90 ++++++++++++++++++++++++++------------------------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 
6799125d..dd59e6f9 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -329,13 +329,53 @@ def loads(str, ignore=None, **kwds): TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) -def _open(file, mode): +class _PeekableReader: + """lightweight readable stream wrapper that implements peek()""" + def __init__(self, stream): + self.stream = stream + def read(self, n): + return self.stream.read(n) + def readline(self): + return self.stream.readline() + def tell(self): + return self.stream.tell() + def close(self): + return self.stream.close() + def peek(self, n): + stream = self.stream + try: + if hasattr(stream, 'flush'): stream.flush() + position = stream.tell() + stream.seek(position) # assert seek() works before reading + chunk = stream.read(n) + stream.seek(position) + return chunk + except (AttributeError, OSError): + raise NotImplementedError("stream is not peekable: %r", stream) from None + +def _open(file, mode, *, peekable=False): """return a context manager with an opened file-like object""" + import io attr = 'write' if 'w' in mode else 'read' - if not hasattr(file, attr): - return open(file, mode) - else: + was_open = hasattr(file, attr) + if not was_open: + file = open(file, mode) + if attr == 'read' and peekable and not hasattr(file, 'peek'): + # Try our best to return the stream as an object with a peek() method. + if hasattr(file, 'tell') and hasattr(file, 'seek'): + file = _PeekableReader(file) + else: + try: + file = io.BufferedReader(file) + except Exception: + # Stream won't be peekable, but will fail gracefully in _identify_module(). 
+ file = _PeekableReader(file) + if was_open: # should not close at exit return contextlib.nullcontext(file) + elif type(file) == _PeekableReader: + return contextlib.closing(file) + else: + return file def _module_map(): """get map of imported modules""" @@ -543,42 +583,6 @@ def dump_session(filename=str(TEMPDIR/'session.pkl'), main=None, byref=False, ** dump_module(filename, module=main, refimported=byref, **kwds) dump_session.__doc__ = dump_module.__doc__ -class _PeekableReader: - """lightweight stream wrapper that implements peek()""" - def __init__(self, stream): - self.stream = stream - def read(self, n): - return self.stream.read(n) - def readline(self): - return self.stream.readline() - def tell(self): - return self.stream.tell() - def close(self): - return self.stream.close() - def peek(self, n): - stream = self.stream - try: - if hasattr(stream, 'flush'): stream.flush() - position = stream.tell() - stream.seek(position) # assert seek() works before reading - chunk = stream.read(n) - stream.seek(position) - return chunk - except (AttributeError, OSError): - raise NotImplementedError("stream is not peekable: %r", stream) from None - -def _make_peekable(stream): - """return stream as an object with a peek() method""" - import io - if hasattr(stream, 'peek'): - return stream - if not (hasattr(stream, 'tell') and hasattr(stream, 'seek')): - try: - return io.BufferedReader(stream) - except Exception: - pass - return _PeekableReader(stream) - def _identify_module(file, main=None): """identify the name of the module stored in the given file-type object""" from pickletools import genops @@ -712,8 +716,7 @@ def load_module( raise TypeError("both 'module' and 'main' arguments were used") module = kwds.pop('main') main = module - with _open(filename, 'rb') as file: - file = _make_peekable(file) + with _open(filename, 'rb', peekable=True) as file: #FIXME: dill.settings are disabled unpickler = Unpickler(file, **kwds) unpickler._session = True @@ -835,8 +838,7 @@ def 
load_module_asdict( """ if 'module' in kwds: raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") - with _open(filename, 'rb') as file: - file = _make_peekable(file) + with _open(filename, 'rb', peekable=True) as file: main_name = _identify_module(file) old_main = sys.modules.get(main_name) main = ModuleType(main_name) From 1732e3d217ebd40f4b7dd9a0577e34a24e38f7af Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 16 Jul 2022 14:09:37 -0300 Subject: [PATCH 022/109] new function is_module_pickle() --- dill/__init__.py | 8 ++++---- dill/_dill.py | 32 +++++++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/dill/__init__.py b/dill/__init__.py index 028112dc..e45d5146 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -25,10 +25,10 @@ from ._dill import ( dump, dumps, load, loads, dump_module, load_module, load_module_asdict, - dump_session, load_session, Pickler, Unpickler, register, copy, pickle, - pickles, check, HIGHEST_PROTOCOL, DEFAULT_PROTOCOL, PicklingError, - UnpicklingError, HANDLE_FMODE, CONTENTS_FMODE, FILE_FMODE, PickleError, - PickleWarning, PicklingWarning, UnpicklingWarning, + dump_session, load_session, is_module_pickle, Pickler, Unpickler, register, + copy, pickle, pickles, check, HIGHEST_PROTOCOL, DEFAULT_PROTOCOL, + PicklingError, UnpicklingError, HANDLE_FMODE, CONTENTS_FMODE, FILE_FMODE, + PickleError, PickleWarning, PicklingWarning, UnpicklingWarning, ) from . 
import source, temp, detect diff --git a/dill/_dill.py b/dill/_dill.py index dd59e6f9..017ca347 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -17,11 +17,11 @@ """ __all__ = [ 'dump', 'dumps', 'load', 'loads', 'dump_module', 'load_module', - 'load_module_asdict', 'dump_session', 'load_session', 'Pickler', 'Unpickler', - 'register', 'copy', 'pickle', 'pickles', 'check', 'HIGHEST_PROTOCOL', - 'DEFAULT_PROTOCOL', 'PicklingError', 'UnpicklingError', 'HANDLE_FMODE', - 'CONTENTS_FMODE', 'FILE_FMODE', 'PickleError', 'PickleWarning', - 'PicklingWarning', 'UnpicklingWarning' + 'load_module_asdict', 'dump_session', 'load_session', 'is_module_pickle', + 'Pickler', 'Unpickler', 'register', 'copy', 'pickle', 'pickles', 'check', + 'HIGHEST_PROTOCOL', 'DEFAULT_PROTOCOL', 'PicklingError', 'UnpicklingError', + 'HANDLE_FMODE', 'CONTENTS_FMODE', 'FILE_FMODE', 'PickleError', + 'PickleWarning', 'PicklingWarning', 'UnpicklingWarning', ] __module__ = 'dill' @@ -606,6 +606,28 @@ def _identify_module(file, main=None): return None raise UnpicklingError("unable to identify main module") from error +def is_module_pickle(filename, importable: bool = True) -> bool: + """Check if a file is a module state pickle file. + + Parameters: + filename: a path-like object or a readable stream. + importable: expected kind of the file's saved module. Use `True` for + importable modules (the default) or `False` for module-type objects. + + Returns: + `True` if the pickle file at ``filename`` was generated with + :py:func:`dump_module` **AND** the module whose state is saved in it is + of the kind specified by the ``importable`` argument. `False` otherwise. 
+ """ + with _open(filename, 'rb', peekable=True) as file: + try: + pickle_main = _identify_module(file) + except UnpicklingError: + return False + else: + is_runtime_mod = pickle_main.startswith('__runtime__.') + return importable ^ is_runtime_mod + def load_module( filename = str(TEMPDIR/'session.pkl'), module: Union[ModuleType, str] = None, From 2fdd31d6d1bd855f54877cd5721a8659561ea8cb Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 16 Jul 2022 18:51:23 -0300 Subject: [PATCH 023/109] move session-related code to session.py submodule --- dill/__init__.py | 14 +- dill/_dill.py | 567 -------------------------------------------- dill/session.py | 593 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 601 insertions(+), 573 deletions(-) create mode 100644 dill/session.py diff --git a/dill/__init__.py b/dill/__init__.py index e45d5146..8f8429bd 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -24,13 +24,15 @@ from ._dill import ( - dump, dumps, load, loads, dump_module, load_module, load_module_asdict, - dump_session, load_session, is_module_pickle, Pickler, Unpickler, register, - copy, pickle, pickles, check, HIGHEST_PROTOCOL, DEFAULT_PROTOCOL, - PicklingError, UnpicklingError, HANDLE_FMODE, CONTENTS_FMODE, FILE_FMODE, - PickleError, PickleWarning, PicklingWarning, UnpicklingWarning, + Pickler, Unpickler, + check, copy, dump, dumps, load, loads, pickle, pickles, register, + DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, CONTENTS_FMODE, FILE_FMODE, HANDLE_FMODE, + PickleError, PickleWarning, PicklingError, PicklingWarning, UnpicklingError, + UnpicklingWarning, ) -from . import source, temp, detect +from .session import dump_module, is_module_pickle, load_module, load_module_asdict +from .session import dump_session, load_session # backward compatibility +from . 
import detect, session, source, temp # get global settings from .settings import settings diff --git a/dill/_dill.py b/dill/_dill.py index 017ca347..aaa14101 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -30,8 +30,6 @@ from .logger import adapter as logger from .logger import trace as _trace -from typing import Optional, Union - import os import sys diff = None @@ -319,570 +317,6 @@ def loads(str, ignore=None, **kwds): ### End: Shorthands ### -### Pickle the Interpreter Session -import contextlib -import pathlib -import re -import tempfile -from contextlib import suppress -from types import SimpleNamespace - -TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) - -class _PeekableReader: - """lightweight readable stream wrapper that implements peek()""" - def __init__(self, stream): - self.stream = stream - def read(self, n): - return self.stream.read(n) - def readline(self): - return self.stream.readline() - def tell(self): - return self.stream.tell() - def close(self): - return self.stream.close() - def peek(self, n): - stream = self.stream - try: - if hasattr(stream, 'flush'): stream.flush() - position = stream.tell() - stream.seek(position) # assert seek() works before reading - chunk = stream.read(n) - stream.seek(position) - return chunk - except (AttributeError, OSError): - raise NotImplementedError("stream is not peekable: %r", stream) from None - -def _open(file, mode, *, peekable=False): - """return a context manager with an opened file-like object""" - import io - attr = 'write' if 'w' in mode else 'read' - was_open = hasattr(file, attr) - if not was_open: - file = open(file, mode) - if attr == 'read' and peekable and not hasattr(file, 'peek'): - # Try our best to return the stream as an object with a peek() method. - if hasattr(file, 'tell') and hasattr(file, 'seek'): - file = _PeekableReader(file) - else: - try: - file = io.BufferedReader(file) - except Exception: - # Stream won't be peekable, but will fail gracefully in _identify_module(). 
- file = _PeekableReader(file) - if was_open: # should not close at exit - return contextlib.nullcontext(file) - elif type(file) == _PeekableReader: - return contextlib.closing(file) - else: - return file - -def _module_map(): - """get map of imported modules""" - from collections import defaultdict - modmap = SimpleNamespace( - by_name=defaultdict(list), - by_id=defaultdict(list), - top_level={}, - ) - for modname, module in sys.modules.items(): - if modname in ('__main__', '__mp_main__') or not isinstance(module, ModuleType): - continue - if '.' not in modname: - modmap.top_level[id(module)] = modname - for objname, modobj in module.__dict__.items(): - modmap.by_name[objname].append((modobj, modname)) - modmap.by_id[id(modobj)].append((modobj, objname, modname)) - return modmap - -SESSION_IMPORTED_AS_TYPES = (ModuleType, TypeType, FunctionType, MethodType, BuiltinMethodType) - -SESSION_IMPORTED_AS_MODULES = [re.compile(x) for x in ( - 'ctypes', 'typing', 'subprocess', 'threading', - r'concurrent\.futures(\.\w+)?', r'multiprocessing(\.\w+)?' 
-)] - -def _lookup_module(modmap, name, obj, main_module): - """lookup name or id of obj if module is imported""" - for modobj, modname in modmap.by_name[name]: - if modobj is obj and sys.modules[modname] is not main_module: - return modname, name - __module__ = getattr(obj, '__module__', None) - if isinstance(obj, SESSION_IMPORTED_AS_TYPES) or (__module__ is not None - and any(regex.fullmatch(__module__) for regex in SESSION_IMPORTED_AS_MODULES)): - for modobj, objname, modname in modmap.by_id[id(obj)]: - if sys.modules[modname] is not main_module: - return modname, objname - return None, None - -def _stash_modules(main_module): - modmap = _module_map() - newmod = ModuleType(main_module.__name__) - - imported = [] - imported_as = [] - imported_top_level = [] # keep separated for backward compatibility - original = {} - for name, obj in main_module.__dict__.items(): - if obj is main_module: - original[name] = newmod # self-reference - elif obj is main_module.__dict__: - original[name] = newmod.__dict__ - # Avoid incorrectly matching a singleton value in another package (ex.: __doc__). 
- elif any(obj is singleton for singleton in (None, False, True)) \ - or isinstance(obj, ModuleType) and _is_builtin_module(obj): # always saved by ref - original[name] = obj - else: - source_module, objname = _lookup_module(modmap, name, obj, main_module) - if source_module is not None: - if objname == name: - imported.append((source_module, name)) - else: - imported_as.append((source_module, objname, name)) - else: - try: - imported_top_level.append((modmap.top_level[id(obj)], name)) - except KeyError: - original[name] = obj - - if len(original) < len(main_module.__dict__): - newmod.__dict__.update(original) - newmod.__dill_imported = imported - newmod.__dill_imported_as = imported_as - newmod.__dill_imported_top_level = imported_top_level - if getattr(newmod, '__loader__', None) is None and _is_imported_module(main_module): - # Trick _is_imported_module() to force saving as an imported module. - newmod.__loader__ = True # will be discarded by save_module() - return newmod - else: - return main_module - -def _restore_modules(unpickler, main_module): - try: - for modname, name in main_module.__dict__.pop('__dill_imported'): - main_module.__dict__[name] = unpickler.find_class(modname, name) - for modname, objname, name in main_module.__dict__.pop('__dill_imported_as'): - main_module.__dict__[name] = unpickler.find_class(modname, objname) - for modname, name in main_module.__dict__.pop('__dill_imported_top_level'): - main_module.__dict__[name] = __import__(modname) - except KeyError: - pass - -#NOTE: 06/03/15 renamed main_module to main -def dump_module( - filename = str(TEMPDIR/'session.pkl'), - module: Union[ModuleType, str] = None, - refimported: bool = False, - **kwds -) -> None: - """Pickle the current state of :py:mod:`__main__` or another module to a file. - - Save the contents of :py:mod:`__main__` (e.g. from an interactive - interpreter session), an imported module, or a module-type object (e.g. - built with :py:class:`~types.ModuleType`), to a file. 
The pickled - module can then be restored with the function :py:func:`load_module`. - - Parameters: - filename: a path-like object or a writable stream. - module: a module object or the name of an importable module. If `None` - (the default), :py:mod:`__main__` is saved. - refimported: if `True`, all objects identified as having been imported - into the module's namespace are saved by reference. *Note:* this is - similar but independent from ``dill.settings[`byref`]``, as - ``refimported`` refers to virtually all imported objects, while - ``byref`` only affects select objects. - **kwds: extra keyword arguments passed to :py:class:`Pickler()`. - - Raises: - :py:exc:`PicklingError`: if pickling fails. - - Examples: - - - Save current interpreter session state: - - >>> import dill - >>> squared = lambda x: x*x - >>> dill.dump_module() # save state of __main__ to /tmp/session.pkl - - - Save the state of an imported/importable module: - - >>> import dill - >>> import pox - >>> pox.plus_one = lambda x: x+1 - >>> dill.dump_module('pox_session.pkl', module=pox) - - - Save the state of a non-importable, module-type object: - - >>> import dill - >>> from types import ModuleType - >>> foo = ModuleType('foo') - >>> foo.values = [1,2,3] - >>> import math - >>> foo.sin = math.sin - >>> dill.dump_module('foo_session.pkl', module=foo, refimported=True) - - - Restore the state of the saved modules: - - >>> import dill - >>> dill.load_module() - >>> squared(2) - 4 - >>> pox = dill.load_module('pox_session.pkl') - >>> pox.plus_one(1) - 2 - >>> foo = dill.load_module('foo_session.pkl') - >>> [foo.sin(x) for x in foo.values] - [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] - - *Changed in version 0.3.6:* Function ``dump_session()`` was renamed to - ``dump_module()``. Parameters ``main`` and ``byref`` were renamed to - ``module`` and ``refimported``, respectively. 
- - Note: - Currently, ``dill.settings['byref']`` and ``dill.settings['recurse']`` - don't apply to this function.` - """ - for old_par, par in [('main', 'module'), ('byref', 'refimported')]: - if old_par in kwds: - message = "The argument %r has been renamed %r" % (old_par, par) - if old_par == 'byref': - message += " to distinguish it from dill.settings['byref']" - warnings.warn(message + ".", PendingDeprecationWarning) - if locals()[par]: # the defaults are None and False - raise TypeError("both %r and %r arguments were used" % (par, old_par)) - refimported = kwds.pop('byref', refimported) - module = kwds.pop('main', module) - - from .settings import settings - protocol = settings['protocol'] - main = module - if main is None: - main = _main_module - elif isinstance(main, str): - main = _import_module(main) - if not isinstance(main, ModuleType): - raise TypeError("%r is not a module" % main) - original_main = main - if refimported: - main = _stash_modules(main) - with _open(filename, 'wb') as file: - pickler = Pickler(file, protocol, **kwds) - if main is not original_main: - pickler._original_main = original_main - pickler._main = main #FIXME: dill.settings are disabled - pickler._byref = False # disable pickling by name reference - pickler._recurse = False # disable pickling recursion for globals - pickler._session = True # is best indicator of when pickling a session - pickler._first_pass = True - pickler.dump(main) - return - -# Backward compatibility. 
-def dump_session(filename=str(TEMPDIR/'session.pkl'), main=None, byref=False, **kwds): - warnings.warn("dump_session() has been renamed dump_module()", PendingDeprecationWarning) - dump_module(filename, module=main, refimported=byref, **kwds) -dump_session.__doc__ = dump_module.__doc__ - -def _identify_module(file, main=None): - """identify the name of the module stored in the given file-type object""" - from pickletools import genops - UNICODE = {'UNICODE', 'BINUNICODE', 'SHORT_BINUNICODE'} - found_import = False - try: - for opcode, arg, pos in genops(file.peek(256)): - if not found_import: - if opcode.name in ('GLOBAL', 'SHORT_BINUNICODE') and \ - arg.endswith('_import_module'): - found_import = True - else: - if opcode.name in UNICODE: - return arg - else: - raise UnpicklingError("reached STOP without finding main module") - except (NotImplementedError, ValueError) as error: - # ValueError occours when the end of the chunk is reached (without a STOP). - if isinstance(error, NotImplementedError) and main is not None: - # file is not peekable, but we have main. - return None - raise UnpicklingError("unable to identify main module") from error - -def is_module_pickle(filename, importable: bool = True) -> bool: - """Check if a file is a module state pickle file. - - Parameters: - filename: a path-like object or a readable stream. - importable: expected kind of the file's saved module. Use `True` for - importable modules (the default) or `False` for module-type objects. - - Returns: - `True` if the pickle file at ``filename`` was generated with - :py:func:`dump_module` **AND** the module whose state is saved in it is - of the kind specified by the ``importable`` argument. `False` otherwise. 
- """ - with _open(filename, 'rb', peekable=True) as file: - try: - pickle_main = _identify_module(file) - except UnpicklingError: - return False - else: - is_runtime_mod = pickle_main.startswith('__runtime__.') - return importable ^ is_runtime_mod - -def load_module( - filename = str(TEMPDIR/'session.pkl'), - module: Union[ModuleType, str] = None, - **kwds -) -> Optional[ModuleType]: - """Update the selected module (default is :py:mod:`__main__`) with - the state saved at ``filename``. - - Restore a module to the state saved with :py:func:`dump_module`. The - saved module can be :py:mod:`__main__` (e.g. an interpreter session), - an imported module, or a module-type object (e.g. created with - :py:class:`~types.ModuleType`). - - When restoring the state of a non-importable module-type object, the - current instance of this module may be passed as the argument ``main``. - Otherwise, a new instance is created with :py:class:`~types.ModuleType` - and returned. - - Parameters: - filename: a path-like object or a readable stream. - module: a module object or the name of an importable module; - the module name and kind (i.e. imported or non-imported) must - match the name and kind of the module stored at ``filename``. - **kwds: extra keyword arguments passed to :py:class:`Unpickler()`. - - Raises: - :py:exc:`UnpicklingError`: if unpickling fails. - :py:exc:`ValueError`: if the argument ``main`` and module saved - at ``filename`` are incompatible. - - Returns: - A module object, if the saved module is not :py:mod:`__main__` or - a module instance wasn't provided with the argument ``main``. 
- - Examples: - - - Save the state of some modules: - - >>> import dill - >>> squared = lambda x: x*x - >>> dill.dump_module() # save state of __main__ to /tmp/session.pkl - >>> - >>> import pox # an imported module - >>> pox.plus_one = lambda x: x+1 - >>> dill.dump_module('pox_session.pkl', module=pox) - >>> - >>> from types import ModuleType - >>> foo = ModuleType('foo') # a module-type object - >>> foo.values = [1,2,3] - >>> import math - >>> foo.sin = math.sin - >>> dill.dump_module('foo_session.pkl', module=foo, refimported=True) - - - Restore the state of the interpreter: - - >>> import dill - >>> dill.load_module() # updates __main__ from /tmp/session.pkl - >>> squared(2) - 4 - - - Load the saved state of an importable module: - - >>> import dill - >>> pox = dill.load_module('pox_session.pkl') - >>> pox.plus_one(1) - 2 - >>> import sys - >>> pox in sys.modules.values() - True - - - Load the saved state of a non-importable module-type object: - - >>> import dill - >>> foo = dill.load_module('foo_session.pkl') - >>> [foo.sin(x) for x in foo.values] - [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] - >>> import math - >>> foo.sin is math.sin # foo.sin was saved by reference - True - >>> import sys - >>> foo in sys.modules.values() - False - - - Update the state of a non-importable module-type object: - - >>> import dill - >>> from types import ModuleType - >>> foo = ModuleType('foo') - >>> foo.values = ['a','b'] - >>> foo.sin = lambda x: x*x - >>> dill.load_module('foo_session.pkl', module=foo) - >>> [foo.sin(x) for x in foo.values] - [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] - - *Changed in version 0.3.6:* Function ``load_session()`` was renamed to - ``load_module()``. Parameter ``main`` was renamed to ``module``. - - See also: - :py:func:`load_module_asdict` to load the contents of module saved - with :py:func:`dump_module` into a dictionary. 
- """ - if 'main' in kwds: - warnings.warn( - "The argument 'main' has been renamed 'module'.", - PendingDeprecationWarning - ) - if module is not None: - raise TypeError("both 'module' and 'main' arguments were used") - module = kwds.pop('main') - main = module - with _open(filename, 'rb', peekable=True) as file: - #FIXME: dill.settings are disabled - unpickler = Unpickler(file, **kwds) - unpickler._session = True - - # Resolve unpickler._main - pickle_main = _identify_module(file, main) - if main is None and pickle_main is not None: - main = pickle_main - if isinstance(main, str): - if main.startswith('__runtime__.'): - # Create runtime module to load the session into. - main = ModuleType(main.partition('.')[-1]) - else: - main = _import_module(main) - if main is not None: - if not isinstance(main, ModuleType): - raise TypeError("%r is not a module" % main) - unpickler._main = main - else: - main = unpickler._main - - # Check against the pickle's main. - is_main_imported = _is_imported_module(main) - if pickle_main is not None: - is_runtime_mod = pickle_main.startswith('__runtime__.') - if is_runtime_mod: - pickle_main = pickle_main.partition('.')[-1] - error_msg = "can't update{} module{} %r with the saved state of{} module{} %r" - if main.__name__ != pickle_main: - raise ValueError(error_msg.format("", "", "", "") % (main.__name__, pickle_main)) - if is_runtime_mod and is_main_imported: - raise ValueError( - error_msg.format(" imported", "", "", "-type object") - % (main.__name__, main.__name__) - ) - if not is_runtime_mod and not is_main_imported: - raise ValueError( - error_msg.format("", "-type object", " imported", "") - % (main.__name__, main.__name__) - ) - - try: - if not is_main_imported: - # This is for find_class() to be able to locate it. 
- runtime_main = '__runtime__.%s' % main.__name__ - sys.modules[runtime_main] = main - loaded = unpickler.load() - finally: - if not is_main_imported: - del sys.modules[runtime_main] - - assert loaded is main - _restore_modules(unpickler, main) - if main is _main_module or main is module: - return None - else: - return main - -# Backward compatibility. -def load_session(filename=str(TEMPDIR/'session.pkl'), main=None, **kwds): - warnings.warn("load_session() has been renamed load_module().", PendingDeprecationWarning) - load_module(filename, module=main, **kwds) -load_session.__doc__ = load_module.__doc__ - -def load_module_asdict( - filename = str(TEMPDIR/'session.pkl'), - update: bool = False, - **kwds -) -> dict: - """ - Load the contents of a saved module into a dictionary. - - ``load_module_asdict()`` is the near-equivalent of:: - - lambda filename: vars(dill.load_module(filename)).copy() - - however, does not alter the original module. Also, the path of - the loaded module is stored in the ``__session__`` attribute. - - Parameters: - filename: a path-like object or a readable stream - update: if `True`, initialize the dictionary with the current state - of the module prior to loading the state stored at filename. - **kwds: extra keyword arguments passed to :py:class:`Unpickler()` - - Raises: - :py:exc:`UnpicklingError`: if unpickling fails - - Returns: - A copy of the restored module's dictionary. - - Note: - If ``update`` is True, the corresponding module may first be imported - into the current namespace before the saved state is loaded from - filename to the dictionary. Note that any module that is imported into - the current namespace as a side-effect of using ``update`` will not be - modified by loading the saved module in filename to a dictionary. 
- - Example: - >>> import dill - >>> alist = [1, 2, 3] - >>> anum = 42 - >>> dill.dump_module() - >>> anum = 0 - >>> new_var = 'spam' - >>> main = dill.load_module_asdict() - >>> main['__name__'], main['__session__'] - ('__main__', '/tmp/session.pkl') - >>> main is globals() # loaded objects don't reference globals - False - >>> main['alist'] == alist - True - >>> main['alist'] is alist # was saved by value - False - >>> main['anum'] == anum # changed after the session was saved - False - >>> new_var in main # would be True if the option 'update' was set - False - """ - if 'module' in kwds: - raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") - with _open(filename, 'rb', peekable=True) as file: - main_name = _identify_module(file) - old_main = sys.modules.get(main_name) - main = ModuleType(main_name) - if update: - if old_main is None: - old_main = _import_module(main_name) - main.__dict__.update(old_main.__dict__) - else: - main.__builtins__ = __builtin__ - try: - sys.modules[main_name] = main - load_module(file, **kwds) - finally: - if old_main is None: - del sys.modules[main_name] - else: - sys.modules[main_name] = old_main - main.__session__ = str(filename) - return main.__dict__ - -### End: Pickle the Interpreter - class MetaCatchingDict(dict): def get(self, key, default=None): try: @@ -2462,7 +1896,6 @@ def save_capsule(pickler, obj): _incedental_reverse_typemap['PyCapsuleType'] = PyCapsuleType _reverse_typemap['PyCapsuleType'] = PyCapsuleType _incedental_types.add(PyCapsuleType) - SESSION_IMPORTED_AS_TYPES += (PyCapsuleType,) else: _testcapsule = None diff --git a/dill/session.py b/dill/session.py new file mode 100644 index 00000000..dc26ae99 --- /dev/null +++ b/dill/session.py @@ -0,0 +1,593 @@ +#!/usr/bin/env python +# +# Author: Mike McKerns (mmckerns @caltech and @uqfoundation) +# Author: Leonardo Gama (@leogama) +# Copyright (c) 2008-2015 California Institute of Technology. 
+# Copyright (c) 2016-2022 The Uncertainty Quantification Foundation. +# License: 3-clause BSD. The full license text is available at: +# - https://github.com/uqfoundation/dill/blob/master/LICENSE +""" +Pickle and restore the intepreter session. +""" + +__all__ = [ + 'dump_module', 'is_module_pickle', 'load_module', 'load_module_asdict', + 'dump_session', 'load_session' # backward compatibility +] + +import logging +logger = logging.getLogger('dill.session') + +import builtins +import contextlib +import pathlib +import re +import sys +import tempfile +import warnings +from types import SimpleNamespace + +from dill import _dill, Pickler, Unpickler +from ._dill import ( + BuiltinMethodType, FunctionType, MethodType, ModuleType, TypeType, + _import_module, _is_builtin_module, _is_imported_module, _main_module, + _reverse_typemap, +) + +# Type hints. +from typing import Optional, Union + +TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) + +class _PeekableReader: + """lightweight readable stream wrapper that implements peek()""" + def __init__(self, stream): + self.stream = stream + def read(self, n): + return self.stream.read(n) + def readline(self): + return self.stream.readline() + def tell(self): + return self.stream.tell() + def close(self): + return self.stream.close() + def peek(self, n): + stream = self.stream + try: + if hasattr(stream, 'flush'): stream.flush() + position = stream.tell() + stream.seek(position) # assert seek() works before reading + chunk = stream.read(n) + stream.seek(position) + return chunk + except (AttributeError, OSError): + raise NotImplementedError("stream is not peekable: %r", stream) from None + +def _open(file, mode, *, peekable=False): + """return a context manager with an opened file-like object""" + import io + attr = 'write' if 'w' in mode else 'read' + was_open = hasattr(file, attr) + if not was_open: + file = open(file, mode) + if attr == 'read' and peekable and not hasattr(file, 'peek'): + # Try our best to return the stream 
as an object with a peek() method. + if hasattr(file, 'tell') and hasattr(file, 'seek'): + file = _PeekableReader(file) + else: + try: + file = io.BufferedReader(file) + except Exception: + # Stream won't be peekable, but will fail gracefully in _identify_module(). + file = _PeekableReader(file) + if was_open: # should not close at exit + return contextlib.nullcontext(file) + elif type(file) == _PeekableReader: + return contextlib.closing(file) + else: + return file + +def _module_map(): + """get map of imported modules""" + from collections import defaultdict + modmap = SimpleNamespace( + by_name=defaultdict(list), + by_id=defaultdict(list), + top_level={}, + ) + for modname, module in sys.modules.items(): + if modname in ('__main__', '__mp_main__') or not isinstance(module, ModuleType): + continue + if '.' not in modname: + modmap.top_level[id(module)] = modname + for objname, modobj in module.__dict__.items(): + modmap.by_name[objname].append((modobj, modname)) + modmap.by_id[id(modobj)].append((modobj, objname, modname)) + return modmap + +IMPORTED_AS_TYPES = (ModuleType, TypeType, FunctionType, MethodType, BuiltinMethodType) +PyCapsuleType = _reverse_typemap.get('PyCapsuleType') +if PyCapsuleType is not None: IMPORTED_AS_TYPES += (PyCapsuleType,) + +IMPORTED_AS_MODULES = [re.compile(x) for x in ( + 'ctypes', 'typing', 'subprocess', 'threading', + r'concurrent\.futures(\.\w+)?', r'multiprocessing(\.\w+)?' 
+)] + +def _lookup_module(modmap, name, obj, main_module): + """lookup name or id of obj if module is imported""" + for modobj, modname in modmap.by_name[name]: + if modobj is obj and sys.modules[modname] is not main_module: + return modname, name + __module__ = getattr(obj, '__module__', None) + if isinstance(obj, IMPORTED_AS_TYPES) or (__module__ is not None + and any(regex.fullmatch(__module__) for regex in IMPORTED_AS_MODULES)): + for modobj, objname, modname in modmap.by_id[id(obj)]: + if sys.modules[modname] is not main_module: + return modname, objname + return None, None + +def _stash_modules(main_module): + modmap = _module_map() + newmod = ModuleType(main_module.__name__) + + imported = [] + imported_as = [] + imported_top_level = [] # keep separated for backward compatibility + original = {} + for name, obj in main_module.__dict__.items(): + if obj is main_module: + original[name] = newmod # self-reference + elif obj is main_module.__dict__: + original[name] = newmod.__dict__ + # Avoid incorrectly matching a singleton value in another package (ex.: __doc__). + elif any(obj is singleton for singleton in (None, False, True)) \ + or isinstance(obj, ModuleType) and _is_builtin_module(obj): # always saved by ref + original[name] = obj + else: + source_module, objname = _lookup_module(modmap, name, obj, main_module) + if source_module is not None: + if objname == name: + imported.append((source_module, name)) + else: + imported_as.append((source_module, objname, name)) + else: + try: + imported_top_level.append((modmap.top_level[id(obj)], name)) + except KeyError: + original[name] = obj + + if len(original) < len(main_module.__dict__): + newmod.__dict__.update(original) + newmod.__dill_imported = imported + newmod.__dill_imported_as = imported_as + newmod.__dill_imported_top_level = imported_top_level + if getattr(newmod, '__loader__', None) is None and _is_imported_module(main_module): + # Trick _is_imported_module() to force saving as an imported module. 
+ newmod.__loader__ = True # will be discarded by save_module() + return newmod + else: + return main_module + +def _restore_modules(unpickler, main_module): + try: + for modname, name in main_module.__dict__.pop('__dill_imported'): + main_module.__dict__[name] = unpickler.find_class(modname, name) + for modname, objname, name in main_module.__dict__.pop('__dill_imported_as'): + main_module.__dict__[name] = unpickler.find_class(modname, objname) + for modname, name in main_module.__dict__.pop('__dill_imported_top_level'): + main_module.__dict__[name] = __import__(modname) + except KeyError: + pass + +def dump_module( + filename = str(TEMPDIR/'session.pkl'), + module: Union[ModuleType, str] = None, + refimported: bool = False, + **kwds +) -> None: + """Pickle the current state of :py:mod:`__main__` or another module to a file. + + Save the contents of :py:mod:`__main__` (e.g. from an interactive + interpreter session), an imported module, or a module-type object (e.g. + built with :py:class:`~types.ModuleType`), to a file. The pickled + module can then be restored with the function :py:func:`load_module`. + + Parameters: + filename: a path-like object or a writable stream. + module: a module object or the name of an importable module. If `None` + (the default), :py:mod:`__main__` is saved. + refimported: if `True`, all objects identified as having been imported + into the module's namespace are saved by reference. *Note:* this is + similar but independent from ``dill.settings[`byref`]``, as + ``refimported`` refers to virtually all imported objects, while + ``byref`` only affects select objects. + **kwds: extra keyword arguments passed to :py:class:`Pickler()`. + + Raises: + :py:exc:`PicklingError`: if pickling fails. 
+ + Examples: + + - Save current interpreter session state: + + >>> import dill + >>> squared = lambda x: x*x + >>> dill.dump_module() # save state of __main__ to /tmp/session.pkl + + - Save the state of an imported/importable module: + + >>> import dill + >>> import pox + >>> pox.plus_one = lambda x: x+1 + >>> dill.dump_module('pox_session.pkl', module=pox) + + - Save the state of a non-importable, module-type object: + + >>> import dill + >>> from types import ModuleType + >>> foo = ModuleType('foo') + >>> foo.values = [1,2,3] + >>> import math + >>> foo.sin = math.sin + >>> dill.dump_module('foo_session.pkl', module=foo, refimported=True) + + - Restore the state of the saved modules: + + >>> import dill + >>> dill.load_module() + >>> squared(2) + 4 + >>> pox = dill.load_module('pox_session.pkl') + >>> pox.plus_one(1) + 2 + >>> foo = dill.load_module('foo_session.pkl') + >>> [foo.sin(x) for x in foo.values] + [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] + + *Changed in version 0.3.6:* Function ``dump_session()`` was renamed to + ``dump_module()``. Parameters ``main`` and ``byref`` were renamed to + ``module`` and ``refimported``, respectively. 
+ + Note: + Currently, ``dill.settings['byref']`` and ``dill.settings['recurse']`` + don't apply to this function.` + """ + for old_par, par in [('main', 'module'), ('byref', 'refimported')]: + if old_par in kwds: + message = "The argument %r has been renamed %r" % (old_par, par) + if old_par == 'byref': + message += " to distinguish it from dill.settings['byref']" + warnings.warn(message + ".", PendingDeprecationWarning) + if locals()[par]: # the defaults are None and False + raise TypeError("both %r and %r arguments were used" % (par, old_par)) + refimported = kwds.pop('byref', refimported) + module = kwds.pop('main', module) + + from .settings import settings + protocol = settings['protocol'] + main = module + if main is None: + main = _main_module + elif isinstance(main, str): + main = _import_module(main) + if not isinstance(main, ModuleType): + raise TypeError("%r is not a module" % main) + original_main = main + if refimported: + main = _stash_modules(main) + with _open(filename, 'wb') as file: + pickler = Pickler(file, protocol, **kwds) + if main is not original_main: + pickler._original_main = original_main + pickler._main = main #FIXME: dill.settings are disabled + pickler._byref = False # disable pickling by name reference + pickler._recurse = False # disable pickling recursion for globals + pickler._session = True # is best indicator of when pickling a session + pickler._first_pass = True + pickler.dump(main) + return + +# Backward compatibility. 
+def dump_session(filename=str(TEMPDIR/'session.pkl'), main=None, byref=False, **kwds): + warnings.warn("dump_session() has been renamed dump_module()", PendingDeprecationWarning) + dump_module(filename, module=main, refimported=byref, **kwds) +dump_session.__doc__ = dump_module.__doc__ + +def _identify_module(file, main=None): + """identify the name of the module stored in the given file-type object""" + from pickletools import genops + UNICODE = {'UNICODE', 'BINUNICODE', 'SHORT_BINUNICODE'} + found_import = False + try: + for opcode, arg, pos in genops(file.peek(256)): + if not found_import: + if opcode.name in ('GLOBAL', 'SHORT_BINUNICODE') and \ + arg.endswith('_import_module'): + found_import = True + else: + if opcode.name in UNICODE: + return arg + else: + raise UnpicklingError("reached STOP without finding main module") + except (NotImplementedError, ValueError) as error: + # ValueError occours when the end of the chunk is reached (without a STOP). + if isinstance(error, NotImplementedError) and main is not None: + # file is not peekable, but we have main. + return None + raise UnpicklingError("unable to identify main module") from error + +def is_module_pickle(filename, importable: bool = True) -> bool: + """Check if a file is a module state pickle file. + + Parameters: + filename: a path-like object or a readable stream. + importable: expected kind of the file's saved module. Use `True` for + importable modules (the default) or `False` for module-type objects. + + Returns: + `True` if the pickle file at ``filename`` was generated with + :py:func:`dump_module` **AND** the module whose state is saved in it is + of the kind specified by the ``importable`` argument. `False` otherwise. 
+ """ + with _open(filename, 'rb', peekable=True) as file: + try: + pickle_main = _identify_module(file) + except UnpicklingError: + return False + else: + is_runtime_mod = pickle_main.startswith('__runtime__.') + return importable ^ is_runtime_mod + +def load_module( + filename = str(TEMPDIR/'session.pkl'), + module: Union[ModuleType, str] = None, + **kwds +) -> Optional[ModuleType]: + """Update the selected module (default is :py:mod:`__main__`) with + the state saved at ``filename``. + + Restore a module to the state saved with :py:func:`dump_module`. The + saved module can be :py:mod:`__main__` (e.g. an interpreter session), + an imported module, or a module-type object (e.g. created with + :py:class:`~types.ModuleType`). + + When restoring the state of a non-importable module-type object, the + current instance of this module may be passed as the argument ``main``. + Otherwise, a new instance is created with :py:class:`~types.ModuleType` + and returned. + + Parameters: + filename: a path-like object or a readable stream. + module: a module object or the name of an importable module; + the module name and kind (i.e. imported or non-imported) must + match the name and kind of the module stored at ``filename``. + **kwds: extra keyword arguments passed to :py:class:`Unpickler()`. + + Raises: + :py:exc:`UnpicklingError`: if unpickling fails. + :py:exc:`ValueError`: if the argument ``main`` and module saved + at ``filename`` are incompatible. + + Returns: + A module object, if the saved module is not :py:mod:`__main__` or + a module instance wasn't provided with the argument ``main``. 
+ + Examples: + + - Save the state of some modules: + + >>> import dill + >>> squared = lambda x: x*x + >>> dill.dump_module() # save state of __main__ to /tmp/session.pkl + >>> + >>> import pox # an imported module + >>> pox.plus_one = lambda x: x+1 + >>> dill.dump_module('pox_session.pkl', module=pox) + >>> + >>> from types import ModuleType + >>> foo = ModuleType('foo') # a module-type object + >>> foo.values = [1,2,3] + >>> import math + >>> foo.sin = math.sin + >>> dill.dump_module('foo_session.pkl', module=foo, refimported=True) + + - Restore the state of the interpreter: + + >>> import dill + >>> dill.load_module() # updates __main__ from /tmp/session.pkl + >>> squared(2) + 4 + + - Load the saved state of an importable module: + + >>> import dill + >>> pox = dill.load_module('pox_session.pkl') + >>> pox.plus_one(1) + 2 + >>> import sys + >>> pox in sys.modules.values() + True + + - Load the saved state of a non-importable module-type object: + + >>> import dill + >>> foo = dill.load_module('foo_session.pkl') + >>> [foo.sin(x) for x in foo.values] + [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] + >>> import math + >>> foo.sin is math.sin # foo.sin was saved by reference + True + >>> import sys + >>> foo in sys.modules.values() + False + + - Update the state of a non-importable module-type object: + + >>> import dill + >>> from types import ModuleType + >>> foo = ModuleType('foo') + >>> foo.values = ['a','b'] + >>> foo.sin = lambda x: x*x + >>> dill.load_module('foo_session.pkl', module=foo) + >>> [foo.sin(x) for x in foo.values] + [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] + + *Changed in version 0.3.6:* Function ``load_session()`` was renamed to + ``load_module()``. Parameter ``main`` was renamed to ``module``. + + See also: + :py:func:`load_module_asdict` to load the contents of module saved + with :py:func:`dump_module` into a dictionary. 
+ """ + if 'main' in kwds: + warnings.warn( + "The argument 'main' has been renamed 'module'.", + PendingDeprecationWarning + ) + if module is not None: + raise TypeError("both 'module' and 'main' arguments were used") + module = kwds.pop('main') + main = module + with _open(filename, 'rb', peekable=True) as file: + #FIXME: dill.settings are disabled + unpickler = Unpickler(file, **kwds) + unpickler._session = True + + # Resolve unpickler._main + pickle_main = _identify_module(file, main) + if main is None and pickle_main is not None: + main = pickle_main + if isinstance(main, str): + if main.startswith('__runtime__.'): + # Create runtime module to load the session into. + main = ModuleType(main.partition('.')[-1]) + else: + main = _import_module(main) + if main is not None: + if not isinstance(main, ModuleType): + raise TypeError("%r is not a module" % main) + unpickler._main = main + else: + main = unpickler._main + + # Check against the pickle's main. + is_main_imported = _is_imported_module(main) + if pickle_main is not None: + is_runtime_mod = pickle_main.startswith('__runtime__.') + if is_runtime_mod: + pickle_main = pickle_main.partition('.')[-1] + error_msg = "can't update{} module{} %r with the saved state of{} module{} %r" + if main.__name__ != pickle_main: + raise ValueError(error_msg.format("", "", "", "") % (main.__name__, pickle_main)) + if is_runtime_mod and is_main_imported: + raise ValueError( + error_msg.format(" imported", "", "", "-type object") + % (main.__name__, main.__name__) + ) + if not is_runtime_mod and not is_main_imported: + raise ValueError( + error_msg.format("", "-type object", " imported", "") + % (main.__name__, main.__name__) + ) + + try: + if not is_main_imported: + # This is for find_class() to be able to locate it. 
+ runtime_main = '__runtime__.%s' % main.__name__ + sys.modules[runtime_main] = main + loaded = unpickler.load() + finally: + if not is_main_imported: + del sys.modules[runtime_main] + + assert loaded is main + _restore_modules(unpickler, main) + if main is _main_module or main is module: + return None + else: + return main + +# Backward compatibility. +def load_session(filename=str(TEMPDIR/'session.pkl'), main=None, **kwds): + warnings.warn("load_session() has been renamed load_module().", PendingDeprecationWarning) + load_module(filename, module=main, **kwds) +load_session.__doc__ = load_module.__doc__ + +def load_module_asdict( + filename = str(TEMPDIR/'session.pkl'), + update: bool = False, + **kwds +) -> dict: + """ + Load the contents of a saved module into a dictionary. + + ``load_module_asdict()`` is the near-equivalent of:: + + lambda filename: vars(dill.load_module(filename)).copy() + + however, does not alter the original module. Also, the path of + the loaded module is stored in the ``__session__`` attribute. + + Parameters: + filename: a path-like object or a readable stream + update: if `True`, initialize the dictionary with the current state + of the module prior to loading the state stored at filename. + **kwds: extra keyword arguments passed to :py:class:`Unpickler()` + + Raises: + :py:exc:`UnpicklingError`: if unpickling fails + + Returns: + A copy of the restored module's dictionary. + + Note: + If ``update`` is True, the corresponding module may first be imported + into the current namespace before the saved state is loaded from + filename to the dictionary. Note that any module that is imported into + the current namespace as a side-effect of using ``update`` will not be + modified by loading the saved module in filename to a dictionary. 
+ + Example: + >>> import dill + >>> alist = [1, 2, 3] + >>> anum = 42 + >>> dill.dump_module() + >>> anum = 0 + >>> new_var = 'spam' + >>> main = dill.load_module_asdict() + >>> main['__name__'], main['__session__'] + ('__main__', '/tmp/session.pkl') + >>> main is globals() # loaded objects don't reference globals + False + >>> main['alist'] == alist + True + >>> main['alist'] is alist # was saved by value + False + >>> main['anum'] == anum # changed after the session was saved + False + >>> new_var in main # would be True if the option 'update' was set + False + """ + if 'module' in kwds: + raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") + with _open(filename, 'rb', peekable=True) as file: + main_name = _identify_module(file) + old_main = sys.modules.get(main_name) + main = ModuleType(main_name) + if update: + if old_main is None: + old_main = _import_module(main_name) + main.__dict__.update(old_main.__dict__) + else: + main.__builtins__ = builtins + try: + sys.modules[main_name] = main + load_module(file, **kwds) + finally: + if old_main is None: + del sys.modules[main_name] + else: + sys.modules[main_name] = old_main + main.__session__ = str(filename) + return main.__dict__ From 6b557550e68d944c3318e34c309aa9d1da974cf1 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 18 Jul 2022 23:49:35 -0300 Subject: [PATCH 024/109] session: deal with modules with unpickleable objects --- dill/_dill.py | 104 +++++++++++++++++++++++++++++++------ dill/logger.py | 22 ++++---- dill/session.py | 22 +++++++- dill/settings.py | 4 ++ dill/tests/test_session.py | 2 +- 5 files changed, 125 insertions(+), 29 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index aaa14101..668534db 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -40,6 +40,8 @@ #XXX: get types from .objtypes ? 
import builtins as __builtin__ from pickle import _Pickler as StockPickler, Unpickler as StockUnpickler +from pickle import BINPUT, DICT, EMPTY_DICT, LONG_BINPUT, MARK, PUT, SETITEM +from struct import pack from _thread import LockType from _thread import RLock as RLockType #from io import IOBase @@ -234,6 +236,9 @@ def __reduce_ex__(self, protocol): #: Pickles the entire file (handle and contents), preserving mode and position. FILE_FMODE = 2 +# Exceptions commonly raised by unpicklable objects. +UNPICKLEABLE_ERRORS = (PicklingError, TypeError, NotImplementedError) + ### Shorthands (modified from python2.5/lib/pickle.py) def copy(obj, *args, **kwds): """ @@ -349,16 +354,18 @@ class Pickler(StockPickler): def __init__(self, file, *args, **kwds): settings = Pickler.settings _byref = kwds.pop('byref', None) - #_strictio = kwds.pop('strictio', None) _fmode = kwds.pop('fmode', None) _recurse = kwds.pop('recurse', None) + #_refonfail = kwds.pop('refonfail', None) + #_strictio = kwds.pop('strictio', None) StockPickler.__init__(self, file, *args, **kwds) self._main = _main_module self._diff_cache = {} self._byref = settings['byref'] if _byref is None else _byref - self._strictio = False #_strictio self._fmode = settings['fmode'] if _fmode is None else _fmode self._recurse = settings['recurse'] if _recurse is None else _recurse + self._refonfail = False #settings['dump_module']['refonfail'] if _refonfail is None else _refonfail + self._strictio = False #_strictio self._postproc = OrderedDict() self._file = file # for the logger @@ -395,7 +402,7 @@ def save_numpy_dtype(pickler, obj): if NumpyArrayType and ndarraysubclassinstance(obj): @register(type(obj)) def save_numpy_array(pickler, obj): - logger.trace(pickler, "Nu: (%s, %s)", obj.shape, obj.dtype) + logger.trace(pickler, "Nu: (%s, %s)", obj.shape, obj.dtype, obj=obj) npdict = getattr(obj, '__dict__', None) f, args, state = obj.__reduce__() pickler.save_reduce(_create_array, (f,args,state,npdict), obj=obj) @@ -407,9 
+414,68 @@ def save_numpy_array(pickler, obj): raise PicklingError(msg) logger.trace_setup(self) StockPickler.dump(self, obj) - dump.__doc__ = StockPickler.dump.__doc__ + def save(self, obj, save_persistent_id=True, *, name=None): + """If self._refonfail is True, try to save object by reference if pickling fails.""" + if not self._refonfail: + super().save(obj, save_persistent_id) + return + if self.framer.current_frame: + # protocol >= 4 + self.framer.commit_frame() + stream = self.framer.current_frame + else: + stream = self._file + position = stream.tell() + memo_size = len(self.memo) + try: + super().save(obj, save_persistent_id) + except UNPICKLEABLE_ERRORS + (AttributeError,) as error_stack: + # AttributeError may happen in save_global() call for child object. + if (type(error_stack) == AttributeError + and "no attribute '__name__'" not in error_stack.args[0]): + raise + # roll back the stream + stream.seek(position) + stream.truncate() + # roll back memo + for _ in range(len(self.memo) - memo_size): + self.memo.popitem() # LIFO order is guaranteed for since 3.7 + try: + self.save_global(obj, name) + except (AttributeError, PicklingError) as error: + if getattr(self, '_trace_stack', None) and id(obj) == self._trace_stack[-1]: + # roll back trace state + self._trace_stack.pop() + self._size_stack.pop() + raise error from error_stack + logger.trace(self, "# X: fallback to save_global: <%s object at %#012x>", + type(obj).__name__, id(obj), obj=obj) + + def _save_module_dict(self, obj): + """ + Use object name in the module namespace as a last resource to try to + save it by reference when pickling fails. + + Modified from Pickler.save_dict() and Pickler._batch_setitems(). 
+ """ + if not self._refonfail: + super().save_dict(obj) + return + if self.bin: + self.write(EMPTY_DICT) + else: # proto 0 -- can't use EMPTY_DICT + self.write(MARK + DICT) + self.memoize(obj) + for k, v in obj.items(): + self.save(k) + if hasattr(v, '__name__') or hasattr(v, '__qualname__'): + self.save(v) + else: + self.save(v, name=k) + self.write(SETITEM) + class Unpickler(StockUnpickler): """python's Unpickler extended to interpreter sessions and more types""" from .settings import settings @@ -1173,26 +1239,30 @@ def _repr_dict(obj): @register(dict) def save_module_dict(pickler, obj): - if is_dill(pickler, child=False) and obj == pickler._main.__dict__ and \ + pickler_is_dill = is_dill(pickler, child=False) + if pickler_is_dill and obj == pickler._main.__dict__ and \ not (pickler._session and pickler._first_pass): - logger.trace(pickler, "D1: %s", _repr_dict(obj)) # obj + logger.trace(pickler, "D1: %s", _repr_dict(obj), obj=obj) pickler.write(bytes('c__builtin__\n__main__\n', 'UTF-8')) logger.trace(pickler, "# D1") - elif (not is_dill(pickler, child=False)) and (obj == _main_module.__dict__): - logger.trace(pickler, "D3: %s", _repr_dict(obj)) # obj + elif (not pickler_is_dill) and (obj == _main_module.__dict__): + logger.trace(pickler, "D3: %s", _repr_dict(obj), obj=obj) pickler.write(bytes('c__main__\n__dict__\n', 'UTF-8')) #XXX: works in general? 
logger.trace(pickler, "# D3") elif '__name__' in obj and obj != _main_module.__dict__ \ and type(obj['__name__']) is str \ and obj is getattr(_import_module(obj['__name__'],True), '__dict__', None): - logger.trace(pickler, "D4: %s", _repr_dict(obj)) # obj + logger.trace(pickler, "D4: %s", _repr_dict(obj), obj=obj) pickler.write(bytes('c%s\n__dict__\n' % obj['__name__'], 'UTF-8')) logger.trace(pickler, "# D4") + elif pickler_is_dill and pickler._session and pickler._first_pass: + # we only care about session the first pass thru + pickler._first_pass = False + logger.trace(pickler, "D5: %s", _repr_dict(obj), obj=obj) + pickler._save_module_dict(obj) + logger.trace(pickler, "# D5") else: - logger.trace(pickler, "D2: %s", _repr_dict(obj)) # obj - if is_dill(pickler, child=False) and pickler._session: - # we only care about session the first pass thru - pickler._first_pass = False + logger.trace(pickler, "D2: %s", _repr_dict(obj), obj=obj) StockPickler.save_dict(pickler, obj) logger.trace(pickler, "# D2") return @@ -1491,7 +1561,7 @@ def save_cell(pickler, obj): if MAPPING_PROXY_TRICK: @register(DictProxyType) def save_dictproxy(pickler, obj): - logger.trace(pickler, "Mp: %s", _repr_dict(obj)) # obj + logger.trace(pickler, "Mp: %s", _repr_dict(obj), obj=obj) mapping = obj | _dictproxy_helper_instance pickler.save_reduce(DictProxyType, (mapping,), obj=obj) logger.trace(pickler, "# Mp") @@ -1499,7 +1569,7 @@ def save_dictproxy(pickler, obj): else: @register(DictProxyType) def save_dictproxy(pickler, obj): - logger.trace(pickler, "Mp: %s", _repr_dict(obj)) # obj + logger.trace(pickler, "Mp: %s", _repr_dict(obj), obj=obj) pickler.save_reduce(DictProxyType, (obj.copy(),), obj=obj) logger.trace(pickler, "# Mp") return @@ -1575,7 +1645,7 @@ def save_weakproxy(pickler, obj): logger.trace(pickler, "%s: %s", _t, obj) except ReferenceError: _t = "R3" - logger.trace(pickler, "%s: %s", _t, sys.exc_info()[1]) + logger.trace(pickler, "%s: %s", _t, sys.exc_info()[1], obj=obj) #callable 
= bool(getattr(refobj, '__call__', None)) if type(obj) is CallableProxyType: callable = True else: callable = False @@ -1914,7 +1984,7 @@ def pickles(obj,exact=False,safe=False,**kwds): """ if safe: exceptions = (Exception,) # RuntimeError, ValueError else: - exceptions = (TypeError, AssertionError, NotImplementedError, PicklingError, UnpicklingError) + exceptions = UNPICKLEABLE_ERRORS + (AssertionError, UnpicklingError) try: pik = copy(obj, **kwds) #FIXME: should check types match first, then check content if "exact" diff --git a/dill/logger.py b/dill/logger.py index fedff6bf..7b6afcdd 100644 --- a/dill/logger.py +++ b/dill/logger.py @@ -129,18 +129,22 @@ def trace_setup(self, pickler): if not dill._dill.is_dill(pickler, child=False): return if self.isEnabledFor(logging.INFO): - pickler._trace_depth = 1 + pickler._trace_stack = [] pickler._size_stack = [] else: - pickler._trace_depth = None - def trace(self, pickler, msg, *args, **kwargs): - if not hasattr(pickler, '_trace_depth'): + pickler._trace_stack = None + def trace(self, pickler, msg, *args, obj=None, **kwargs): + if not hasattr(pickler, '_trace_stack'): logger.info(msg, *args, **kwargs) return - if pickler._trace_depth is None: + if pickler._trace_stack is None: return extra = kwargs.get('extra', {}) pushed_obj = msg.startswith('#') + if not pushed_obj: + if obj is None: + obj = args[-1] + pickler._trace_stack.append(id(obj)) size = None try: # Streams are not required to be tellable. 
@@ -159,13 +163,11 @@ def trace(self, pickler, msg, *args, **kwargs): else: size -= pickler._size_stack.pop() extra['size'] = size - if pushed_obj: - pickler._trace_depth -= 1 - extra['depth'] = pickler._trace_depth + extra['depth'] = len(pickler._trace_stack) kwargs['extra'] = extra self.info(msg, *args, **kwargs) - if not pushed_obj: - pickler._trace_depth += 1 + if pushed_obj: + pickler._trace_stack.pop() class TraceFormatter(logging.Formatter): """ diff --git a/dill/session.py b/dill/session.py index dc26ae99..9e545f1c 100644 --- a/dill/session.py +++ b/dill/session.py @@ -184,9 +184,10 @@ def dump_module( filename = str(TEMPDIR/'session.pkl'), module: Union[ModuleType, str] = None, refimported: bool = False, + refonfail: bool = False, **kwds ) -> None: - """Pickle the current state of :py:mod:`__main__` or another module to a file. + R"""Pickle the current state of :py:mod:`__main__` or another module to a file. Save the contents of :py:mod:`__main__` (e.g. from an interactive interpreter session), an imported module, or a module-type object (e.g. @@ -202,6 +203,10 @@ def dump_module( similar but independent from ``dill.settings[`byref`]``, as ``refimported`` refers to virtually all imported objects, while ``byref`` only affects select objects. + refonfail: if `True`, objects that fail to be saved by value will try to + be saved by reference. If it also fails, saving their parent + objects by reference will be attempted recursively. In the worst + case scenario, the module itself may be saved by reference. **kwds: extra keyword arguments passed to :py:class:`Pickler()`. Raises: @@ -232,6 +237,15 @@ def dump_module( >>> foo.sin = math.sin >>> dill.dump_module('foo_session.pkl', module=foo, refimported=True) + - Save the state of a module with unpickleable objects: + + >>> import dill + >>> import os + >>> os.altsep = '\\' + >>> dill.dump_module('os_session.pkl', module=os) + PicklingError: ... 
+ >>> dill.dump_module('os_session.pkl', module=os, refonfail=True) + - Restore the state of the saved modules: >>> import dill @@ -244,6 +258,9 @@ def dump_module( >>> foo = dill.load_module('foo_session.pkl') >>> [foo.sin(x) for x in foo.values] [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] + >>> os = dill.load_module('os_session.pkl') + >>> print(os.altsep.join('path')) + p\a\t\h *Changed in version 0.3.6:* Function ``dump_session()`` was renamed to ``dump_module()``. Parameters ``main`` and ``byref`` were renamed to @@ -266,6 +283,8 @@ def dump_module( from .settings import settings protocol = settings['protocol'] + if refimported is None: refimported = settings['dump_module']['refimported'] + if refonfail is None: refonfail = settings['dump_module']['refonfail'] main = module if main is None: main = _main_module @@ -283,6 +302,7 @@ def dump_module( pickler._main = main #FIXME: dill.settings are disabled pickler._byref = False # disable pickling by name reference pickler._recurse = False # disable pickling recursion for globals + pickler._refonfail = refonfail pickler._session = True # is best indicator of when pickling a session pickler._first_pass = True pickler.dump(main) diff --git a/dill/settings.py b/dill/settings.py index b105d2e8..22c55458 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -19,6 +19,10 @@ 'fmode' : 0, #HANDLE_FMODE 'recurse' : False, 'ignore' : False, + 'dump_module' : { + 'refimported': False, + 'refonfail' : False, + }, } del DEFAULT_PROTOCOL diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index 9124802c..51128916 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -197,7 +197,7 @@ def test_runtime_module(): runtime = ModuleType(modname) runtime.x = 42 - mod = dill._dill._stash_modules(runtime) + mod = dill.session._stash_modules(runtime) if mod is not runtime: print("There are objects to save by referenece that shouldn't be:", mod.__dill_imported, 
mod.__dill_imported_as, mod.__dill_imported_top_level, From aac47b5fded6e69535a006dce79381b0764e9d17 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 19 Jul 2022 11:01:51 -0300 Subject: [PATCH 025/109] disable framing when using the 'refonfail' option --- dill/_dill.py | 27 ++++++++++++--------------- dill/logger.py | 2 +- dill/session.py | 18 ++++++++++++++---- 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 668534db..146ab9b6 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -367,7 +367,7 @@ def __init__(self, file, *args, **kwds): self._refonfail = False #settings['dump_module']['refonfail'] if _refonfail is None else _refonfail self._strictio = False #_strictio self._postproc = OrderedDict() - self._file = file # for the logger + self._file_tell = getattr(file, 'tell', None) # for logger and refonfail def dump(self, obj): #NOTE: if settings change, need to update attributes # register if the object is a numpy ufunc @@ -421,32 +421,29 @@ def save(self, obj, save_persistent_id=True, *, name=None): if not self._refonfail: super().save(obj, save_persistent_id) return - if self.framer.current_frame: - # protocol >= 4 - self.framer.commit_frame() - stream = self.framer.current_frame - else: - stream = self._file - position = stream.tell() + # Disable framing (right after the framer.init_framing() call at dump()). + self.framer.current_frame = None + # Store initial state. + position = self._file_tell() memo_size = len(self.memo) try: super().save(obj, save_persistent_id) except UNPICKLEABLE_ERRORS + (AttributeError,) as error_stack: - # AttributeError may happen in save_global() call for child object. + # AttributeError may happen in the save_global() call from a child object. if (type(error_stack) == AttributeError and "no attribute '__name__'" not in error_stack.args[0]): raise - # roll back the stream - stream.seek(position) - stream.truncate() - # roll back memo + # Roll back the stream. 
+ self._file_seek(position) + self._file_truncate() + # Roll back memo. for _ in range(len(self.memo) - memo_size): - self.memo.popitem() # LIFO order is guaranteed for since 3.7 + self.memo.popitem() # LIFO order is guaranteed since 3.7 try: self.save_global(obj, name) except (AttributeError, PicklingError) as error: if getattr(self, '_trace_stack', None) and id(obj) == self._trace_stack[-1]: - # roll back trace state + # Roll back trace state. self._trace_stack.pop() self._size_stack.pop() raise error from error_stack diff --git a/dill/logger.py b/dill/logger.py index 7b6afcdd..385c862d 100644 --- a/dill/logger.py +++ b/dill/logger.py @@ -148,7 +148,7 @@ def trace(self, pickler, msg, *args, obj=None, **kwargs): size = None try: # Streams are not required to be tellable. - size = pickler._file.tell() + size = pickler._file_tell() frame = pickler.framer.current_frame try: size += frame.tell() diff --git a/dill/session.py b/dill/session.py index 9e545f1c..3da31318 100644 --- a/dill/session.py +++ b/dill/session.py @@ -203,10 +203,12 @@ def dump_module( similar but independent from ``dill.settings[`byref`]``, as ``refimported`` refers to virtually all imported objects, while ``byref`` only affects select objects. - refonfail: if `True`, objects that fail to be saved by value will try to - be saved by reference. If it also fails, saving their parent + refonfail: if `True`, objects that fail to pickle by value will try to + be saved by reference. If this also fails, saving their parent objects by reference will be attempted recursively. In the worst - case scenario, the module itself may be saved by reference. + case scenario, the module itself may be saved by reference. Note: + The file-like object must be seekable and truncable with this + option set. **kwds: extra keyword arguments passed to :py:class:`Pickler()`. 
Raises: @@ -302,9 +304,17 @@ def dump_module( pickler._main = main #FIXME: dill.settings are disabled pickler._byref = False # disable pickling by name reference pickler._recurse = False # disable pickling recursion for globals - pickler._refonfail = refonfail pickler._session = True # is best indicator of when pickling a session pickler._first_pass = True + if refonfail: + pickler._refonfail = True # False by default + pickler._file_seek = getattr(file, 'seek', None) + pickler._file_truncate = getattr(file, 'truncate', None) + if hasattr(file, 'seekable') and not file.seekable(): + pickler._file_seek = None + if pickler._file_seek is None or pickler._file_truncate is None: + raise TypeError("file must have 'tell', 'seek' and 'truncate'" + " attributes if the 'refonfail' option is set.") pickler.dump(main) return From e3ad4656d33a2f7c33adbdc154c75e66d3e59d4b Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 21 Jul 2022 19:20:25 -0300 Subject: [PATCH 026/109] namespace filtering: tests --- dill/_dill.py | 3 +- dill/_utils.py | 179 ++++++++++++++++++++++++++++++++--- dill/logger.py | 6 +- dill/session.py | 99 +++---------------- dill/tests/test_filtering.py | 97 +++++++++++++++++++ dill/tests/test_session.py | 34 ++++++- 6 files changed, 310 insertions(+), 108 deletions(-) create mode 100644 dill/tests/test_filtering.py diff --git a/dill/_dill.py b/dill/_dill.py index 50f81105..d8adb485 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -186,10 +186,11 @@ def get_file_type(*args, **kwargs): #FIXME: additionally calls ForkingPickler.register several times from multiprocessing.reduction import _reduce_socket as reduce_socket try: - __IPYTHON__ is True # is ipython + IS_IPYTHON = __IPYTHON__ # is True ExitType = None # IPython.core.autocall.ExitAutocall singletontypes = ['exit', 'quit', 'get_ipython'] except NameError: + IS_IPYTHON = False try: ExitType = type(exit) # apparently 'exit' can be removed except NameError: ExitType = None singletontypes = [] diff --git 
a/dill/_utils.py b/dill/_utils.py index 96c0e068..05f32dba 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -8,19 +8,25 @@ from __future__ import annotations -__all__ = ['FilterRules', 'Filter', 'RuleType', '_open'] +__all__ = ['FilterRules', 'Filter', 'RuleType', 'size_filter', '_open'] import contextlib +import math +import random import re +import warnings from dataclasses import dataclass, field, fields from collections import namedtuple from collections.abc import MutableSet from enum import Enum from functools import partialmethod from itertools import chain, filterfalse +from statistics import mean from types import ModuleType from typing import Any, Callable, Dict, Iterable, Pattern, Set, Tuple, Union +from dill import _dill + def _open(filename, mode): """return a context manager with an opened file""" attr = 'write' if 'w' in mode else 'read' @@ -29,13 +35,36 @@ def _open(filename, mode): else: return open(filename, mode) +def _format_bytes_size(size: Union[int, float]) -> Tuple[int, str]: + """Return bytes size text representation in human-redable form.""" + unit = "B" + power_of_2 = math.trunc(size).bit_length() - 1 + magnitude = min(power_of_2 - power_of_2 % 10, 80) # 2**80 == 1 YiB + if magnitude: + size = ((size >> magnitude-1) + 1) >> 1 # rounding trick: 1535 -> 1K; 1536 -> 2K + unit = "%siB" % "KMGTPEZY"[magnitude // 10] + return size, unit + # Namespace filtering. 
Filter = Union[str, Pattern[str], int, type, Callable] RuleType = Enum('RuleType', 'EXCLUDE INCLUDE', module=__name__) Rule = Tuple[RuleType, Union[Filter, Iterable[Filter]]] -NamedObj = namedtuple('NamedObj', 'name value', module=__name__) +class NamedObject: + """Simple container class for a variable name and value.""" + __slots__ = 'name', 'value' + def __init__(self, name_value): + self.name, self.value = name_value + def __eq__(self, other): + """ + Prevent simple bugs from writing `lambda obj: obj == 'literal'` instead + of `lambda obj: obj.value == 'literal' in a filter definition.` + """ + if type(other) != NamedObject: + raise TypeError("'==' not supported between instances of 'NamedObject' and %r" % + type(other).__name__) + return super().__eq__(other) def _iter(filters): if isinstance(filters, str): @@ -54,6 +83,7 @@ class FilterSet(MutableSet): funcs: Set[Callable] = field(default_factory=set) _fields = None _rtypemap = None + _typename_regex = re.compile(r'\w+(?=Type$)|\w+$', re.IGNORECASE) def _match_type(self, filter): if isinstance(filter, str): if filter.isidentifier(): @@ -118,16 +148,15 @@ def update(self, filters): def copy(self): return FilterSet(*(getattr(self, field).copy() for field in self._fields)) @classmethod + def _get_typename(cls, key): + return cls._typename_regex.match(key).group().lower() + @classmethod def get_type(cls, key): if cls._rtypemap is None: - from ._dill import _reverse_typemap - cls._rtypemap = {(k[:-4] if k.endswith('Type') else k).lower(): v - for k, v in _reverse_typemap.items()} - if key.endswith('Type'): - key = key[:-4] - return cls._rtypemap[key.lower()] - def add_type(self, type_name): - self.types.add(self.get_type(type_name)) + cls._rtypemap = {cls._get_typename(k): v for k, v in _dill._reverse_typemap.items()} + return cls._rtypemap[cls._get_typename(key)] + def add_type(self, typename): + self.types.add(self.get_type(typename)) FilterSet._fields = tuple(field.name for field in fields(FilterSet)) class 
_FilterSetDescriptor: @@ -200,7 +229,7 @@ def update(self, rules: Union[Iterable[Rule], FilterRules]): else: self.add(filter, rule_type=rule_type) - def _apply_filters(filter_set, objects): + def _apply_filters(self, filter_set, objects): filters = [] types_list = tuple(filter_set.types) # Apply broader/cheaper filters first. @@ -217,13 +246,13 @@ def _apply_filters(filter_set, objects): objects = filterfalse(filter, objects) return objects - def filter_vars(self, namespace: Dict[str, Any]): + def filter_vars(self, namespace: Dict[str, Any]) -> Dict[str, Any]: """Apply filters to dictionary with names as keys.""" if not namespace or not (self.exclude or self.include): return namespace # Protect agains dict changes during the call. namespace_copy = namespace.copy() - all_objs = [NamedObj._make(item) for item in namespace_copy.items()] + all_objs = [NamedObject(item) for item in namespace_copy.items()] if not self.exclude: # Treat this rule set as an allowlist. @@ -238,10 +267,130 @@ def filter_vars(self, namespace: Dict[str, Any]): if len(exclude_objs) == len(namespace): warnings.warn( - "the exclude/include rules applied have excluded all the %d items" % len(all_objects), - PicklingWarning + "the exclude/include rules applied have excluded all %d items" % len(all_objs), + _dill.PicklingWarning, + stacklevel=2 ) return {} for obj in exclude_objs: del namespace_copy[obj.name] return namespace_copy + + +###################### +# Filter factories # +###################### + +import collections +import collections.abc +from sys import getsizeof + +class size_filter: + """Create a filter function with a limit for estimated object size. + + Note: Doesn't work on PyPy. See ``help('``py:func:`sys.getsizeof```)'`` + """ + __slots__ = 'limit', 'recursive' + # Cover "true" collections from 'builtins', 'collections' and 'collections.abc'. 
+ COLLECTION_TYPES = ( + list, + tuple, + collections.deque, + collections.UserList, + collections.abc.Mapping, + collections.abc.Set, + ) + MINIMUM_SIZE = getsizeof(None, 16) + MISSING_SLOT = object() + + def __init__(self, limit: str, recursive: bool = True): + if _dill.IS_PYPY: + raise NotImplementedError("size_filter() is not implemented for PyPy") + self.limit = limit + if type(limit) != int: + try: + self.limit = float(limit) + except (TypeError, ValueError): + limit_match = re.fullmatch(r'(\d+)\s*(B|[KMGT]i?B?)', limit, re.IGNORECASE) + if limit_match: + coeff, unit = limit_match.groups() + coeff, unit = int(coeff), unit.lower() + if unit == 'b': + self.limit = coeff + else: + base = 1024 if unit[1:2] == 'i' else 1000 + exponent = 'kmgt'.index(unit[0]) + 1 + self.limit = coeff * base**exponent + else: + # Will raise error for Inf and NaN. + self.limit = math.truc(self.limit) + if type(self.limit) != int: + # Everything failed. + raise ValueError("invalid 'limit' value: %r" % limit) + elif self.limit < 0: + raise ValueError("'limit' can't be negative %r" % limit) + self.recursive = recursive + + def __call__(self, obj: NamedObject) -> bool: + if self.recursive: + size = self.estimate_size(obj.value) + else: + try: + size = getsizeof(obj.value) + except ReferenceError: + size = self.MINIMUM_SIZE + return size > self.limit + + def __repr__(self): + return "size_filter(limit=%r, recursive=%r)" % ( + "%d %s" % _format_bytes_size(self.limit), + self.recursive, + ) + + @classmethod + def estimate_size(cls, obj: Any, memo: set = None) -> int: + if memo is None: + memo = set() + obj_id = id(obj) + if obj_id in memo: + # Object size already counted. + return 0 + memo.add(obj_id) + size = cls.MINIMUM_SIZE + try: + if isinstance(obj, ModuleType) and _dill._is_builtin_module(obj): + # Always saved by reference. 
+ return cls.MINIMUM_SIZE + size = getsizeof(obj) + if hasattr(obj, '__dict__'): + size += cls.estimate_size(obj.__dict__, memo) + if hasattr(obj, '__slots__'): + slots = (getattr(obj, x, cls.MISSING_SLOT) for x in obj.__slots__ if x != '__dict__') + size += sum(cls.estimate_size(x, memo) for x in slots if x is not cls.MISSING_SLOT) + if ( + isinstance(obj, str) # common case shortcut + or not isinstance(obj, collections.abc.Collection) # general, single test + or not isinstance(obj, cls.COLLECTION_TYPES) # specific, multiple tests + ): + return size + if isinstance(obj, collections.ChainMap): # collections.Mapping subtype + size += sum(cls.estimate_size(mapping, memo) for mapping in obj.maps) + elif len(obj) < 1000: + if isinstance(obj, collections.abc.Mapping): + size += sum(cls.estimate_size(k, memo) + cls.estimate_size(v, memo) + for k, v in obj.items()) + else: + size += sum(cls.estimate_size(item, memo) for item in obj) + else: + # Use random sample for large collections. + sample = set(random.sample(range(len(obj)), k=100)) + if isinstance(obj, collections.abc.Mapping): + samples_sizes = (cls.estimate_size(k, memo) + cls.estimate_size(v, memo) + for i, (k, v) in enumerate(obj.items()) if i in sample) + else: + samples_sizes = (cls.estimate_size(item, memo) + for i, item in enumerate(obj) if i in sample) + size += len(obj) * mean(samples_sizes) + except Exception: + pass + return size diff --git a/dill/logger.py b/dill/logger.py index fedff6bf..286ede32 100644 --- a/dill/logger.py +++ b/dill/logger.py @@ -53,6 +53,7 @@ from typing import NoReturn, TextIO, Union import dill +from ._utils import _format_bytes_size # Tree drawing characters: Unicode to ASCII map. ASCII_MAP = str.maketrans({"│": "|", "├": "|", "┬": "+", "└": "`"}) @@ -201,10 +202,7 @@ def format(self, record): prefix = prefix.translate(ASCII_MAP) + "-" fields['prefix'] = prefix + " " if hasattr(record, 'size'): - # Show object size in human-redable form. 
- power = int(math.log(record.size, 2)) // 10 - size = record.size >> power*10 - fields['suffix'] = " [%d %sB]" % (size, "KMGTP"[power] + "i" if power else "") + fields['suffix'] = " [%d %s]" % _format_bytes_size(record.size) vars(record).update(fields) return super().format(record) diff --git a/dill/session.py b/dill/session.py index 92bea292..c202f377 100644 --- a/dill/session.py +++ b/dill/session.py @@ -21,13 +21,11 @@ import builtins import pathlib -import random import re import sys import tempfile import warnings from contextlib import suppress -from statistics import mean from types import SimpleNamespace from dill import _dill, Pickler, Unpickler @@ -36,7 +34,7 @@ _import_module, _is_builtin_module, _is_imported_module, _main_module, _reverse_typemap, ) -from ._utils import FilterRules, RuleType +from ._utils import FilterRules, RuleType, size_filter from .settings import settings # Type hints. @@ -603,93 +601,24 @@ def load_module_asdict( return main.__dict__ -###################### -# Filter factories # -###################### +############################## +# Session filter factories # +############################## -import collections -import collections.abc -from sys import getsizeof - -# Cover "true" collections from 'builtins', 'collections' and 'collections.abc'. 
-COLLECTION_TYPES = ( - list, - tuple, - collections.deque, - collections.UserList, - collections.abc.Mapping, - collections.abc.Set, -) - -def _estimate_size(obj, recursive=True): - if recursive: - return _estimate_size_recursively(obj, memo=set()) - try: - return getsizeof(obj) - except Exception: - return 0 - -def _estimate_size_recursively(obj, memo): - obj_id = id(obj) - if obj_id in memo: - return 0 - memo.add(obj_id) - size = 0 - try: - if isinstance(obj, ModuleType) and _is_builtin_module(obj): - return 0 - size += getsizeof(obj) - if hasattr(obj, '__dict__'): - size += sum(_estimate_size(k, memo) + _estimate_size(v, memo) for k, v in obj.__dict__.items()) - if (isinstance(obj, str) # common case shortcut - or not isinstance(obj, collections.abc.Collection) # general, single test - or not isinstance(obj, COLLECTION_TYPES) # specific, multiple tests - ): - return size - if isinstance(obj, collections.ChainMap): # collections.Mapping subtype - size += sum(_estimate_size(mapping, memo) for mapping in obj.maps) - elif len(obj) < 1000: - if isinstance(obj, collections.abc.Mapping): - size += sum(_estimate_size(k, memo) + _estimate_size(v, memo) for k, v in obj.items()) - else: - size += sum(_estimate_size(item, memo) for item in obj) - else: - # Use random sample for large collections. 
- sample = set(random.sample(range(len(obj)), k=100)) - if isinstance(obj, collections.abc.Mapping): - samples_size = (_estimate_size(k, memo) + _estimate_size(v, memo) - for i, (k, v) in enumerate(obj.items()) if i in sample) - else: - samples_size = (_estimate_size(item, memo) for i, item in enumerate(obj) if i in sample) - size += len(obj) * mean(filter(None, samples_size)) - except Exception: - pass - return size - -def size_filter(limit, recursive=True): - match = re.fullmatch(r'(\d+)\s*(B|[KMGT]i?B?)', limit, re.IGNORECASE) - if not match: - raise ValueError("invalid 'limit' value: %r" % limit) - coeff, unit = match.groups() - coeff, unit = int(coeff), unit.lower() - if unit == 'b': - limit = coeff - else: - base = 1024 if unit[1:2] == 'i' else 1000 - exponent = 'kmgt'.index(unit[0]) + 1 - limit = coeff * base**exponent - def exclude_large(obj): - return _estimate_size(obj.value, recursive) < limit - return exclude_large - -def ipython_filter(*, keep_input=True, keep_output=False): +def ipython_filter(*, keep_history: str = 'input'): """filter factory for IPython sessions (can't be added to settings currently) Usage: >>> from dill.session import * >>> dump_session(exclude=[ipython_filter()]) """ - if not __builtins__.get('__IPYTHON__'): + HISTORY_OPTIONS = {'input', 'output', 'both', 'none'} + if keep_history not in HISTORY_OPTIONS: + raise ValueError( + "invalid 'keep_history' argument: %r (must be one of %r)" % + (keep_history, HISTORY_OPTIONS) + ) + if not _dill.IS_IPYTHON: # Return no-op filter if not in IPython. return (lambda x: False) @@ -704,10 +633,10 @@ def ipython_filter(*, keep_input=True, keep_output=False): # Input and output history. 
history_regex = [] - if keep_input: + if keep_history in {'input', 'both'}: interactive_vars |= {'_ih', 'In', '_i', '_ii', '_iii'} history_regex.append(re.compile(r'_i\d+')) - if keep_output: + if keep_history in {'output', 'both'}: interactive_vars |= {'_oh', 'Out', '_', '__', '___'} history_regex.append(re.compile(r'_\d+')) diff --git a/dill/tests/test_filtering.py b/dill/tests/test_filtering.py new file mode 100644 index 00000000..3a3444d5 --- /dev/null +++ b/dill/tests/test_filtering.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python + +# Author: Leonardo Gama (@leogama) +# Copyright (c) 2022 The Uncertainty Quantification Foundation. +# License: 3-clause BSD. The full license text is available at: +# - https://github.com/uqfoundation/dill/blob/master/LICENSE + +import sys + +from dill import _dill +from dill._utils import FilterRules, RuleType, size_filter + +EXCLUDE = RuleType.EXCLUDE +INCLUDE = RuleType.INCLUDE + +NS = { + 'a': 1, + 'aa': 2, + 'aaa': 3, + 'b': 42, + 'bazaar': 'cathedral', + 'has_spam': True, + 'Integer': int, +} + +def did_exclude(namespace, rules, excluded_subset): + rules = FilterRules(rules) + filtered = rules.filter_vars(namespace) + return set(namespace).difference(filtered) == excluded_subset + +def test_basic_filtering(): + filter_names = [(EXCLUDE, ['a', 'c'])] # not 'aa', etc. + assert did_exclude(NS, filter_names, excluded_subset={'a'}) + + filter_regexes = [(EXCLUDE, [r'aa+', r'bb+'])] # not 'a', 'b', 'bazaar' + assert did_exclude(NS, filter_regexes, excluded_subset={'aa', 'aaa'}) + + # Should exclude 'b' and 'd', and not 'b_id'. + NS_copy = NS.copy() + NS_copy['d'] = NS['b'] + NS_copy['b_id'] = id(NS['b']) + filter_ids = [(EXCLUDE, id(NS['b']))] + assert did_exclude(NS_copy, filter_ids, excluded_subset={'b', 'd'}) + + # Should also exclude bool 'has_spam' (int subclass). 
+ filter_types = [(EXCLUDE, [int, frozenset])] + assert did_exclude(NS, filter_types, excluded_subset={'a', 'aa', 'aaa', 'b', 'has_spam'}) + + # Match substring (regexes use fullmatch()). + filter_funcs_name = [(EXCLUDE, lambda obj: 'aa' in obj.name)] + assert did_exclude(NS, filter_funcs_name, excluded_subset={'aa', 'aaa', 'bazaar'}) + + # Don't exclude subclasses. + filter_funcs_value = [(EXCLUDE, lambda obj: type(obj.value) == int)] + assert did_exclude(NS, filter_funcs_value, excluded_subset={'a', 'aa', 'aaa', 'b'}) + +def test_exclude_include(): + # Include rules must apply after exclude rules. + filter_include = [(EXCLUDE, r'a+'), (INCLUDE, 'aa')] # not 'aa' + assert did_exclude(NS, filter_include, excluded_subset={'a', 'aaa'}) + + # If no exclude rules, behave as an allowlist. + filter_allowlist = [(INCLUDE, lambda obj: 'a' in obj.name)] + assert did_exclude(NS, filter_allowlist, excluded_subset={'b', 'Integer'}) + +def test_add_type(): + type_rules = FilterRules() # Formats accepted (actually case insensitive): + type_rules.exclude.add_type('function') # 1. typename + type_rules.exclude.add_type('Type') # 2. Typename + type_rules.exclude.add_type('ModuleType') # 2. 
TypenameType + NS_copy = NS.copy() + NS_copy.update(F=test_basic_filtering, T=FilterRules, M=_dill) + assert did_exclude(NS_copy, type_rules, excluded_subset={'F', 'T', 'M', 'Integer'}) + +def test_size_filter(): + from sys import getsizeof + estimate = size_filter.estimate_size + + small = list(range(100)) + large = list(range(1000)) + reflarge = 10*[small] + small_size = getsizeof(small) + 100*getsizeof(0) + large_size = getsizeof(large) + 1000*getsizeof(0) + assert small_size <= estimate(small) < estimate(reflarge) < large_size <= estimate(large) + + NS_copy = NS.copy() # all base objects are small and should not be excluded + reflarge.append(reflarge) # recursive reference + NS_copy.update(small=small, large=large, reflarge=reflarge) + filter_size = [(EXCLUDE, size_filter(limit=5*small_size))] + assert did_exclude(NS_copy, filter_size, excluded_subset={'large'}) + +if __name__ == '__main__': + test_basic_filtering() + test_exclude_include() + test_add_type() + if not _dill.IS_PYPY: + test_size_filter() diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index 3cd8a4bb..a89fc8e1 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -11,8 +11,10 @@ import __main__ from contextlib import suppress from io import BytesIO +from types import ModuleType import dill +from dill.session import ipython_filter, EXCLUDE, INCLUDE session_file = os.path.join(os.path.dirname(__file__), 'session-refimported-%s.pkl') @@ -192,12 +194,12 @@ def test_session_other(): assert module.selfref is module def test_runtime_module(): - from types import ModuleType + from dill.session import _stash_modules modname = '__runtime__' runtime = ModuleType(modname) runtime.x = 42 - mod = dill.session._stash_modules(runtime, runtime) + mod = _stash_modules(runtime, runtime) if mod is not runtime: print("There are objects to save by referenece that shouldn't be:", mod.__dill_imported, mod.__dill_imported_as, mod.__dill_imported_top_level, @@ -230,7 +232,7 @@ def 
test_refimported_imported_as(): import concurrent.futures import types import typing - mod = sys.modules['__test__'] = types.ModuleType('__test__') + mod = sys.modules['__test__'] = ModuleType('__test__') dill.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) mod.Dict = collections.UserDict # select by type mod.AsyncCM = typing.AsyncContextManager # select by __module__ @@ -271,6 +273,31 @@ def test_load_module_asdict(): assert 'y' not in main_vars assert 'empty' in main_vars +def test_ipython_filter(): + from itertools import filterfalse + from types import SimpleNamespace + from dill._utils import FilterRules + dill._dill.IS_IPYTHON = True # trick ipython_filter + sys.modules['IPython'] = MockIPython = ModuleType('IPython') + + # Mimic the behavior of IPython namespaces at __main__. + user_ns_actual = {'user_var': 1, 'x': 2} + user_ns_hidden = {'x': 3, '_i1': '1 / 2', '_1': 0.5, 'hidden': 4} + user_ns = user_ns_hidden.copy() # user_ns == vars(__main__) + user_ns.update(user_ns_actual) + assert user_ns['x'] == user_ns_actual['x'] # user_ns.x masks user_ns_hidden.x + MockIPython.get_ipython = lambda: SimpleNamespace(user_ns=user_ns, user_ns_hidden=user_ns_hidden) + + # Test variations of keeping or dropping the interpreter history. 
+ user_vars = set(user_ns_actual) + def namespace_matches(keep_history, should_keep_vars): + rules = FilterRules([(EXCLUDE, ipython_filter(keep_history=keep_history))]) + return set(rules.filter_vars(user_ns)) == user_vars | should_keep_vars + assert namespace_matches(keep_history='input', should_keep_vars={'_i1'}) + assert namespace_matches(keep_history='output', should_keep_vars={'_1'}) + assert namespace_matches(keep_history='both', should_keep_vars={'_i1', '_1'}) + assert namespace_matches(keep_history='none', should_keep_vars=set()) + if __name__ == '__main__': test_session_main(refimported=False) test_session_main(refimported=True) @@ -278,3 +305,4 @@ def test_load_module_asdict(): test_runtime_module() test_refimported_imported_as() test_load_module_asdict() + test_ipython_filter() From 5e4d91233afb996783a456603bff0138e5d1eca0 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 21 Jul 2022 19:41:05 -0300 Subject: [PATCH 027/109] rename is_module_pickle() to is_pickled_module(); fix _dill's __all__ --- dill/__init__.py | 2 +- dill/_dill.py | 11 +++++------ dill/session.py | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/dill/__init__.py b/dill/__init__.py index 8f8429bd..b540ebd3 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -30,7 +30,7 @@ PickleError, PickleWarning, PicklingError, PicklingWarning, UnpicklingError, UnpicklingWarning, ) -from .session import dump_module, is_module_pickle, load_module, load_module_asdict +from .session import dump_module, is_pickled_module, load_module, load_module_asdict from .session import dump_session, load_session # backward compatibility from . import detect, session, source, temp diff --git a/dill/_dill.py b/dill/_dill.py index 146ab9b6..d734075b 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -16,12 +16,11 @@ Test against CH16+ Std. Lib. ... TBD. 
""" __all__ = [ - 'dump', 'dumps', 'load', 'loads', 'dump_module', 'load_module', - 'load_module_asdict', 'dump_session', 'load_session', 'is_module_pickle', - 'Pickler', 'Unpickler', 'register', 'copy', 'pickle', 'pickles', 'check', - 'HIGHEST_PROTOCOL', 'DEFAULT_PROTOCOL', 'PicklingError', 'UnpicklingError', - 'HANDLE_FMODE', 'CONTENTS_FMODE', 'FILE_FMODE', 'PickleError', - 'PickleWarning', 'PicklingWarning', 'UnpicklingWarning', + 'Pickler','Unpickler', + 'check','copy','dump','dumps','load','loads','pickle','pickles','register', + 'DEFAULT_PROTOCOL','HIGHEST_PROTOCOL','CONTENTS_FMODE','FILE_FMODE','HANDLE_FMODE', + 'PickleError','PickleWarning','PicklingError','PicklingWarning','UnpicklingError', + 'UnpicklingWarning', ] __module__ = 'dill' diff --git a/dill/session.py b/dill/session.py index 3da31318..021a0fd9 100644 --- a/dill/session.py +++ b/dill/session.py @@ -11,7 +11,7 @@ """ __all__ = [ - 'dump_module', 'is_module_pickle', 'load_module', 'load_module_asdict', + 'dump_module', 'is_pickled_module', 'load_module', 'load_module_asdict', 'dump_session', 'load_session' # backward compatibility ] @@ -347,7 +347,7 @@ def _identify_module(file, main=None): return None raise UnpicklingError("unable to identify main module") from error -def is_module_pickle(filename, importable: bool = True) -> bool: +def is_pickled_module(filename, importable: bool = True) -> bool: """Check if a file is a module state pickle file. 
Parameters: From 699f30ae69c40386df98904367ab49d6dbf58ed2 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 21 Jul 2022 21:58:29 -0300 Subject: [PATCH 028/109] sync with branch 'session-excludes' --- dill/session.py | 22 +++++++++++----------- dill/settings.py | 3 ++- dill/tests/test_session.py | 4 ++-- docs/source/dill.rst | 14 +++++++++++++- 4 files changed, 28 insertions(+), 15 deletions(-) diff --git a/dill/session.py b/dill/session.py index 021a0fd9..fee7899b 100644 --- a/dill/session.py +++ b/dill/session.py @@ -183,8 +183,9 @@ def _restore_modules(unpickler, main_module): def dump_module( filename = str(TEMPDIR/'session.pkl'), module: Union[ModuleType, str] = None, - refimported: bool = False, - refonfail: bool = False, + *, + refimported: bool = None, + refonfail: bool = None, **kwds ) -> None: R"""Pickle the current state of :py:mod:`__main__` or another module to a file. @@ -285,8 +286,10 @@ def dump_module( from .settings import settings protocol = settings['protocol'] - if refimported is None: refimported = settings['dump_module']['refimported'] - if refonfail is None: refonfail = settings['dump_module']['refonfail'] + if refimported is None: + refimported = settings['dump_module']['refimported'] + if refonfail is None: + refonfail = settings['dump_module']['refonfail'] main = module if main is None: main = _main_module @@ -486,7 +489,7 @@ def load_module( # Resolve unpickler._main pickle_main = _identify_module(file, main) - if main is None and pickle_main is not None: + if main is None: main = pickle_main if isinstance(main, str): if main.startswith('__runtime__.'): @@ -494,12 +497,9 @@ def load_module( main = ModuleType(main.partition('.')[-1]) else: main = _import_module(main) - if main is not None: - if not isinstance(main, ModuleType): - raise TypeError("%r is not a module" % main) - unpickler._main = main - else: - main = unpickler._main + if not isinstance(main, ModuleType): + raise TypeError("%r is not a module" % main) + unpickler._main 
= main # Check against the pickle's main. is_main_imported = _is_imported_module(main) diff --git a/dill/settings.py b/dill/settings.py index 22c55458..df1d30a4 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -9,6 +9,8 @@ global settings for Pickler """ +__all__ = ['settings'] + from pickle import DEFAULT_PROTOCOL settings = { @@ -26,4 +28,3 @@ } del DEFAULT_PROTOCOL - diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index 51128916..6a6ce22e 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -11,6 +11,7 @@ import __main__ from contextlib import suppress from io import BytesIO +from types import ModuleType import dill @@ -192,7 +193,6 @@ def test_session_other(): assert module.selfref is module def test_runtime_module(): - from types import ModuleType modname = '__runtime__' runtime = ModuleType(modname) runtime.x = 42 @@ -230,7 +230,7 @@ def test_refimported_imported_as(): import concurrent.futures import types import typing - mod = sys.modules['__test__'] = types.ModuleType('__test__') + mod = sys.modules['__test__'] = ModuleType('__test__') dill.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) mod.Dict = collections.UserDict # select by type mod.AsyncCM = typing.AsyncContextManager # select by __module__ diff --git a/docs/source/dill.rst b/docs/source/dill.rst index 31d41c91..af64599c 100644 --- a/docs/source/dill.rst +++ b/docs/source/dill.rst @@ -11,7 +11,7 @@ dill module :special-members: :show-inheritance: :imported-members: - :exclude-members: dump_session, load_session +.. :exclude-members: detect module ------------- @@ -49,6 +49,18 @@ pointers module :imported-members: .. :exclude-members: +session module +--------------- + +.. 
automodule:: dill.session + :members: + :undoc-members: + :private-members: + :special-members: + :show-inheritance: + :imported-members: + :exclude-members: dump_session, load_session + settings module --------------- From 5ab70b6cfe97f1aef9f7e2b9c9b81e7c258091f1 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 21 Jul 2022 21:46:35 -0300 Subject: [PATCH 029/109] move code and sync with branch 'document-session' --- dill/__init__.py | 2 +- dill/_dill.py | 9 +- dill/_utils.py | 11 +- dill/session.py | 229 +++++++++++++++++++++++++++---------- dill/settings.py | 76 ++---------- dill/tests/test_session.py | 3 +- 6 files changed, 182 insertions(+), 148 deletions(-) diff --git a/dill/__init__.py b/dill/__init__.py index 1e6df963..b540ebd3 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -30,7 +30,7 @@ PickleError, PickleWarning, PicklingError, PicklingWarning, UnpicklingError, UnpicklingWarning, ) -from .session import dump_module, load_module, load_module_asdict +from .session import dump_module, is_pickled_module, load_module, load_module_asdict from .session import dump_session, load_session # backward compatibility from . import detect, session, source, temp diff --git a/dill/_dill.py b/dill/_dill.py index d8adb485..d6594de3 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -241,7 +241,8 @@ def copy(obj, *args, **kwds): See :func:`dumps` and :func:`loads` for keyword arguments. 
""" - ignore = kwds.pop('ignore', Unpickler.settings['ignore']) + from .settings import settings + ignore = kwds.pop('ignore', settings['ignore']) return loads(dumps(obj, *args, **kwds), ignore=ignore) def dump(obj, file, protocol=None, byref=None, fmode=None, recurse=None, **kwds):#, strictio=None): @@ -344,10 +345,9 @@ class Pickler(StockPickler): """python's Pickler extended to interpreter sessions""" dispatch = MetaCatchingDict(StockPickler.dispatch.copy()) _session = False - from .settings import settings def __init__(self, file, *args, **kwds): - settings = Pickler.settings + from .settings import settings _byref = kwds.pop('byref', None) #_strictio = kwds.pop('strictio', None) _fmode = kwds.pop('fmode', None) @@ -412,7 +412,6 @@ def save_numpy_array(pickler, obj): class Unpickler(StockUnpickler): """python's Unpickler extended to interpreter sessions and more types""" - from .settings import settings _session = False def find_class(self, module, name): @@ -424,7 +423,7 @@ def find_class(self, module, name): return StockUnpickler.find_class(self, module, name) def __init__(self, *args, **kwds): - settings = Pickler.settings + from .settings import settings _ignore = kwds.pop('ignore', None) StockUnpickler.__init__(self, *args, **kwds) self._main = _main_module diff --git a/dill/_utils.py b/dill/_utils.py index 05f32dba..5f5e621f 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -8,9 +8,8 @@ from __future__ import annotations -__all__ = ['FilterRules', 'Filter', 'RuleType', 'size_filter', '_open'] +__all__ = ['FilterRules', 'Filter', 'RuleType', 'size_filter'] -import contextlib import math import random import re @@ -27,14 +26,6 @@ from dill import _dill -def _open(filename, mode): - """return a context manager with an opened file""" - attr = 'write' if 'w' in mode else 'read' - if hasattr(filename, attr): - return contextlib.nullcontext(filename) - else: - return open(filename, mode) - def _format_bytes_size(size: Union[int, float]) -> Tuple[int, str]: 
"""Return bytes size text representation in human-redable form.""" unit = "B" diff --git a/dill/session.py b/dill/session.py index c202f377..eb0d14d2 100644 --- a/dill/session.py +++ b/dill/session.py @@ -10,9 +10,11 @@ Pickle and restore the intepreter session. """ +from __future__ import annotations + __all__ = [ - 'FilterRules', 'dump_module', 'ipython_filter', 'load_module', - 'load_module_asdict', 'size_filter', 'EXCLUDE', 'INCLUDE', + 'FilterSet', 'ModuleFilters', 'dump_module', 'ipython_filter', + 'is_pickled_module', 'load_module', 'load_module_asdict', 'size_filter', 'dump_session', 'load_session' # backward compatibility ] @@ -20,12 +22,12 @@ logger = logging.getLogger('dill.session') import builtins +import contextlib import pathlib import re import sys import tempfile import warnings -from contextlib import suppress from types import SimpleNamespace from dill import _dill, Pickler, Unpickler @@ -34,17 +36,64 @@ _import_module, _is_builtin_module, _is_imported_module, _main_module, _reverse_typemap, ) -from ._utils import FilterRules, RuleType, size_filter -from .settings import settings +from ._utils import FilterRules, FilterSet, RuleType, size_filter # Type hints. 
from typing import Iterable, Optional, Union -from ._utils import Filter, _open +from ._utils import Filter EXCLUDE, INCLUDE = RuleType.EXCLUDE, RuleType.INCLUDE TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) +class _PeekableReader: + """lightweight readable stream wrapper that implements peek()""" + def __init__(self, stream): + self.stream = stream + def read(self, n): + return self.stream.read(n) + def readline(self): + return self.stream.readline() + def tell(self): + return self.stream.tell() + def close(self): + return self.stream.close() + def peek(self, n): + stream = self.stream + try: + if hasattr(stream, 'flush'): stream.flush() + position = stream.tell() + stream.seek(position) # assert seek() works before reading + chunk = stream.read(n) + stream.seek(position) + return chunk + except (AttributeError, OSError): + raise NotImplementedError("stream is not peekable: %r", stream) from None + +def _open(file, mode, *, peekable=False): + """return a context manager with an opened file-like object""" + import io + attr = 'write' if 'w' in mode else 'read' + was_open = hasattr(file, attr) + if not was_open: + file = open(file, mode) + if attr == 'read' and peekable and not hasattr(file, 'peek'): + # Try our best to return the stream as an object with a peek() method. + if hasattr(file, 'tell') and hasattr(file, 'seek'): + file = _PeekableReader(file) + else: + try: + file = io.BufferedReader(file) + except Exception: + # Stream won't be peekable, but will fail gracefully in _identify_module(). 
+ file = _PeekableReader(file) + if was_open: # should not close at exit + return contextlib.nullcontext(file) + elif type(file) == _PeekableReader: + return contextlib.closing(file) + else: + return file + def _module_map(): """get map of imported modules""" from collections import defaultdict @@ -138,9 +187,9 @@ def _restore_modules(unpickler, main_module): except KeyError: pass -def _filter_vars(main, default_rules, exclude, include): +def _filter_vars(main, base_rules, exclude, include): rules = FilterRules() - mod_rules = default_rules.get(main.__name__, default_rules) + mod_rules = base_rules.get(main.__name__, base_rules) rules.exclude |= mod_rules.get_filters(EXCLUDE) rules.include |= mod_rules.get_filters(INCLUDE) if exclude is not None: @@ -170,9 +219,11 @@ def _filter_vars(main, default_rules, exclude, include): def dump_module( filename = str(TEMPDIR/'session.pkl'), module: Union[ModuleType, str] = None, - refimported: bool = False, + *, + refimported: bool = None, exclude: Union[Filter, Iterable[Filter]] = None, include: Union[Filter, Iterable[Filter]] = None, + base_rules: ModuleFilters = None, **kwds ) -> None: """Pickle the current state of :py:mod:`__main__` or another module to a file. @@ -191,6 +242,9 @@ def dump_module( similar but independent from ``dill.settings[`byref`]``, as ``refimported`` refers to virtually all imported objects, while ``byref`` only affects select objects. + exclude: here be dragons + include: here be dragons + base_rules: here be dragons **kwds: extra keyword arguments passed to :py:class:`Pickler()`. 
Raises: @@ -255,7 +309,13 @@ def dump_module( from .settings import settings protocol = settings['protocol'] - default_rules = settings['dump_module'] + if refimported is None: + refimported = settings['dump_module']['refimported'] + if base_rules is None: + base_rules = settings['dump_module']['filters'] + else: + base_rules = ModuleFilters(base_rules) + main = module if main is None: main = _main_module @@ -264,7 +324,7 @@ def dump_module( if not isinstance(main, ModuleType): raise TypeError("%r is not a module" % main) original_main = main - main = _filter_vars(main, default_rules, exclude, include) + main = _filter_vars(main, base_rules, exclude, include) if refimported: main = _stash_modules(main, original_main) with _open(filename, 'wb') as file: @@ -285,42 +345,6 @@ def dump_session(filename=str(TEMPDIR/'session.pkl'), main=None, byref=False, ** dump_module(filename, module=main, refimported=byref, **kwds) dump_session.__doc__ = dump_module.__doc__ -class _PeekableReader: - """lightweight stream wrapper that implements peek()""" - def __init__(self, stream): - self.stream = stream - def read(self, n): - return self.stream.read(n) - def readline(self): - return self.stream.readline() - def tell(self): - return self.stream.tell() - def close(self): - return self.stream.close() - def peek(self, n): - stream = self.stream - try: - if hasattr(stream, 'flush'): stream.flush() - position = stream.tell() - stream.seek(position) # assert seek() works before reading - chunk = stream.read(n) - stream.seek(position) - return chunk - except (AttributeError, OSError): - raise NotImplementedError("stream is not peekable: %r", stream) from None - -def _make_peekable(stream): - """return stream as an object with a peek() method""" - import io - if hasattr(stream, 'peek'): - return stream - if not (hasattr(stream, 'tell') and hasattr(stream, 'seek')): - try: - return io.BufferedReader(stream) - except Exception: - pass - return _PeekableReader(stream) - def 
_identify_module(file, main=None): """identify the name of the module stored in the given file-type object""" from pickletools import genops @@ -344,6 +368,28 @@ def _identify_module(file, main=None): return None raise UnpicklingError("unable to identify main module") from error +def is_pickled_module(filename, importable: bool = True) -> bool: + """Check if a file is a module state pickle file. + + Parameters: + filename: a path-like object or a readable stream. + importable: expected kind of the file's saved module. Use `True` for + importable modules (the default) or `False` for module-type objects. + + Returns: + `True` if the pickle file at ``filename`` was generated with + :py:func:`dump_module` **AND** the module whose state is saved in it is + of the kind specified by the ``importable`` argument. `False` otherwise. + """ + with _open(filename, 'rb', peekable=True) as file: + try: + pickle_main = _identify_module(file) + except UnpicklingError: + return False + else: + is_runtime_mod = pickle_main.startswith('__runtime__.') + return importable ^ is_runtime_mod + def load_module( filename = str(TEMPDIR/'session.pkl'), module: Union[ModuleType, str] = None, @@ -454,16 +500,14 @@ def load_module( raise TypeError("both 'module' and 'main' arguments were used") module = kwds.pop('main') main = module - with _open(filename, 'rb') as file: - file = _make_peekable(file) + with _open(filename, 'rb', peekable=True) as file: #FIXME: dill.settings are disabled unpickler = Unpickler(file, **kwds) - unpickler._main = main unpickler._session = True # Resolve unpickler._main pickle_main = _identify_module(file, main) - if main is None and pickle_main is not None: + if main is None: main = pickle_main if isinstance(main, str): if main.startswith('__runtime__.'): @@ -471,12 +515,9 @@ def load_module( main = ModuleType(main.partition('.')[-1]) else: main = _import_module(main) - if main is not None: - if not isinstance(main, ModuleType): - raise TypeError("%r is not a module" 
% main) - unpickler._main = main - else: - main = unpickler._main + if not isinstance(main, ModuleType): + raise TypeError("%r is not a module" % main) + unpickler._main = main # Check against the pickle's main. is_main_imported = _is_imported_module(main) @@ -499,13 +540,13 @@ def load_module( ) try: - # This is for find_class() to be able to locate it. if not is_main_imported: + # This is for find_class() to be able to locate it. runtime_main = '__runtime__.%s' % main.__name__ sys.modules[runtime_main] = main loaded = unpickler.load() finally: - with suppress(KeyError, NameError): + if not is_main_imported: del sys.modules[runtime_main] assert loaded is main @@ -578,8 +619,7 @@ def load_module_asdict( """ if 'module' in kwds: raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") - with _open(filename, 'rb') as file: - file = _make_peekable(file) + with _open(filename, 'rb', peekable=True) as file: main_name = _identify_module(file) old_main = sys.modules.get(main_name) main = ModuleType(main_name) @@ -601,6 +641,73 @@ def load_module_asdict( return main.__dict__ +############################# +# Module filters settings # +############################# + +class ModuleFilters(FilterRules): + __slots__ = 'module', '_parent', '__dict__' + _fields = tuple(x.lstrip('_') for x in FilterRules.__slots__) + def __init__(self, + rules: Union[Iterable[Rule], FilterRules] = None, + module: str = 'DEFAULT', + parent: ModuleFilters = None, + ): + # Don't call super().__init__() + if rules is not None: + super().__init__(rules) + super().__setattr__('module', module) + super().__setattr__('_parent', parent) + def __repr__(self): + desc = "DEFAULT" if self.module == 'DEFAULT' else "for %r" % self.module + return "" % (desc, super().__repr__()) + def __setattr__(self, name, value): + if name in FilterRules.__slots__: + # Don't interfere with superclass attributes. 
+ super().__setattr__(name, value) + elif name in self._fields: + if not any(hasattr(self, x) for x in FilterRules.__slots__): + # Initialize other. This is not a placeholder anymore. + other = '_include' if name == 'exclude' else '_exclude' + super().__setattr__(other, ()) + super().__setattr__(name, value) + else: + # Create a child node for submodule 'name'. + super().__setattr__(name, ModuleFilters(rules=value, module=name, parent=self)) + def __setitem__(self, name, value): + if '.' not in name: + setattr(self, name, value) + else: + module, _, submodules = name.partition('.') + if module not in self.__dict__: + # Create a placeholder node, like logging.PlaceHolder. + setattr(self, module, None) + mod_rules = getattr(self, module) + mod_rules[submodules] = value + def __getitem__(self, name): + module, _, submodules = name.partition('.') + mod_rules = getattr(self, module) + if not submodules: + return mod_rules + else: + return mod_rules[submodules] + def get(self, name: str, default: ModuleFilters = None): + try: + return self[name] + except AttributeError: + return default + def get_filters(self, rule_type: RuleType): + if not isinstance(rule_type, RuleType): + raise ValueError("invalid rule type: %r (must be one of %r)" % (rule_type, list(RuleType))) + try: + return getattr(self, rule_type.name.lower()) + except AttributeError: + # 'self' is a placeholder, 'exclude' and 'include' are unset. 
+ if self._parent is None: + raise + return self._parent.get_filters(rule_type) + + ############################## # Session filter factories # ############################## diff --git a/dill/settings.py b/dill/settings.py index 50f6862e..88b886a3 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -9,12 +9,10 @@ global settings for Pickler """ -from __future__ import annotations - -__all__ = ['settings', 'ModuleRules'] +__all__ = ['settings'] from pickle import DEFAULT_PROTOCOL -from ._utils import FilterRules, RuleType +from .session import ModuleFilters settings = { #'main' : None, @@ -24,70 +22,10 @@ 'fmode' : 0, #HANDLE_FMODE 'recurse' : False, 'ignore' : False, + 'dump_module' : { + 'filters': ModuleFilters(rules=()), + 'refimported': False, + }, } -del DEFAULT_PROTOCOL - -class ModuleRules(FilterRules): - __slots__ = 'module', '_parent', '__dict__' - _fields = tuple(x.lstrip('_') for x in FilterRules.__slots__) - def __init__(self, - module: str, - parent: ModuleRules = None, - rules: Union[Iterable[Rule], FilterRules] = None - ): - super().__setattr__('module', module) - super().__setattr__('_parent', parent) - # Don't call super().__init__(). - if rules is not None: - super().__init__(rules) - def __repr__(self): - desc = "DEFAULT" if self.module == 'DEFAULT' else "for %r" % self.module - return "" % (desc, super().__repr__()) - def __setattr__(self, name, value): - if name in FilterRules.__slots__: - # Don't interfere with superclass attributes. - super().__setattr__(name, value) - elif name in self._fields: - if not any(hasattr(self, x) for x in FilterRules.__slots__): - # Initialize other. This is not a placeholder anymore. - other = '_include' if name == 'exclude' else '_exclude' - super().__setattr__(other, ()) - super().__setattr__(name, value) - else: - # Create a child node for submodule 'name'. - super().__setattr__(name, ModuleRules(parent=self, module=name, rules=value)) - def __setitem__(self, name, value): - if '.' 
not in name: - setattr(self, name, value) - else: - module, _, submodules = name.partition('.') - if module not in self.__dict__: - # Create a placeholder node, like logging.PlaceHolder. - setattr(self, module, None) - mod_rules = getattr(self, module) - mod_rules[submodules] = value - def __getitem__(self, name): - module, _, submodules = name.partition('.') - mod_rules = getattr(self, module) - if not submodules: - return mod_rules - else: - return mod_rules[submodules] - def get(self, name: str, default: ModuleRules = None): - try: - return self[name] - except AttributeError: - return default - def get_filters(self, rule_type: RuleType): - if not isinstance(rule_type, RuleType): - raise ValueError("invalid rule type: %r (must be one of %r)" % (rule_type, list(RuleType))) - try: - return getattr(self, rule_type.name.lower()) - except AttributeError: - # 'self' is a placeholder, 'exclude' and 'include' are unset. - if self._parent is None: - raise - return self._parent.get_filters(rule_type) - -settings['dump_module'] = ModuleRules('DEFAULT', rules=()) +del DEFAULT_PROTOCOL, ModuleFilters diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index a89fc8e1..0e23bf14 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -194,12 +194,11 @@ def test_session_other(): assert module.selfref is module def test_runtime_module(): - from dill.session import _stash_modules modname = '__runtime__' runtime = ModuleType(modname) runtime.x = 42 - mod = _stash_modules(runtime, runtime) + mod = dill.session._stash_modules(runtime, runtime) if mod is not runtime: print("There are objects to save by referenece that shouldn't be:", mod.__dill_imported, mod.__dill_imported_as, mod.__dill_imported_top_level, From 429e55a1e520b8c30077cf2a3ce4642c8ed7337e Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 21 Jul 2022 22:25:55 -0300 Subject: [PATCH 030/109] fix bad merge --- dill/_dill.py | 1 - dill/tests/test_session.py | 4 ++-- 2 files changed, 
2 insertions(+), 3 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index f52926e7..e3ce8add 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -1904,7 +1904,6 @@ def save_capsule(pickler, obj): _incedental_reverse_typemap['PyCapsuleType'] = PyCapsuleType _reverse_typemap['PyCapsuleType'] = PyCapsuleType _incedental_types.add(PyCapsuleType) - SESSION_IMPORTED_AS_TYPES += (PyCapsuleType,) else: _testcapsule = None diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index 3eb7816a..f2a026ec 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -231,9 +231,9 @@ def test_refimported_imported_as(): import concurrent.futures import types import typing - mod = sys.modules['__test__'] = ModuleType('__test__') -dill.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) + mod = sys.modules['__test__'] = ModuleType('__test__') + dill.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) mod.Dict = collections.UserDict # select by type mod.AsyncCM = typing.AsyncContextManager # select by __module__ mod.thread_exec = dill.executor # select by __module__ with regex From 04968f35ec7f6b984c16f71fde2caa972570e227 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Fri, 22 Jul 2022 00:10:06 -0300 Subject: [PATCH 031/109] refonfail: save modules by reference using save_reduce() --- dill/_dill.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index d734075b..415286c9 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -347,6 +347,7 @@ class UnpicklingWarning(PickleWarning, UnpicklingError): class Pickler(StockPickler): """python's Pickler extended to interpreter sessions""" dispatch = MetaCatchingDict(StockPickler.dispatch.copy()) + _refonfail = False _session = False from .settings import settings @@ -355,7 +356,6 @@ def __init__(self, file, *args, **kwds): _byref = kwds.pop('byref', None) _fmode = kwds.pop('fmode', None) _recurse = kwds.pop('recurse', None) - 
#_refonfail = kwds.pop('refonfail', None) #_strictio = kwds.pop('strictio', None) StockPickler.__init__(self, file, *args, **kwds) self._main = _main_module @@ -363,7 +363,6 @@ def __init__(self, file, *args, **kwds): self._byref = settings['byref'] if _byref is None else _byref self._fmode = settings['fmode'] if _fmode is None else _fmode self._recurse = settings['recurse'] if _recurse is None else _recurse - self._refonfail = False #settings['dump_module']['refonfail'] if _refonfail is None else _refonfail self._strictio = False #_strictio self._postproc = OrderedDict() self._file_tell = getattr(file, 'tell', None) # for logger and refonfail @@ -439,7 +438,11 @@ def save(self, obj, save_persistent_id=True, *, name=None): for _ in range(len(self.memo) - memo_size): self.memo.popitem() # LIFO order is guaranteed since 3.7 try: - self.save_global(obj, name) + if isinstance(obj, ModuleType) and \ + (_is_builtin_module(obj) or obj is sys.modules['dill']): + self.save_reduce(_import_module, (obj.__name__,), obj=obj) + else: + self.save_global(obj, name) except (AttributeError, PicklingError) as error: if getattr(self, '_trace_stack', None) and id(obj) == self._trace_stack[-1]: # Roll back trace state. From a596126cb020ce4f5bcff8c170ff494e06cddbed Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Fri, 22 Jul 2022 00:28:08 -0300 Subject: [PATCH 032/109] unpickleable ctypes objects raise ValueError... --- dill/_dill.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 415286c9..abe0c5ed 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -235,8 +235,8 @@ def __reduce_ex__(self, protocol): #: Pickles the entire file (handle and contents), preserving mode and position. FILE_FMODE = 2 -# Exceptions commonly raised by unpicklable objects. -UNPICKLEABLE_ERRORS = (PicklingError, TypeError, NotImplementedError) +# Exceptions commonly raised by unpicklable objects in the Standard Library. 
+UNPICKLEABLE_ERRORS = (PicklingError, TypeError, ValueError, NotImplementedError) ### Shorthands (modified from python2.5/lib/pickle.py) def copy(obj, *args, **kwds): From bbb7623c9d8c07a5ae2dfc50a8f28b37e71be5a5 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Fri, 22 Jul 2022 21:28:04 -0300 Subject: [PATCH 033/109] sync with branch session-submodule --- dill/__init__.py | 10 ++++++---- dill/logger.py | 2 +- dill/session.py | 28 +++++++++++++++++++--------- docs/source/dill.rst | 12 ++++++++++++ 4 files changed, 38 insertions(+), 14 deletions(-) diff --git a/dill/__init__.py b/dill/__init__.py index b540ebd3..87e4eb42 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -30,15 +30,17 @@ PickleError, PickleWarning, PicklingError, PicklingWarning, UnpicklingError, UnpicklingWarning, ) -from .session import dump_module, is_pickled_module, load_module, load_module_asdict -from .session import dump_session, load_session # backward compatibility -from . import detect, session, source, temp +from .session import ( + dump_module, is_pickled_module, load_module, load_module_asdict, + dump_session, load_session # backward compatibility +) +from . import detect, logger, session, source, temp # get global settings from .settings import settings # make sure "trace" is turned off -detect.trace(False) +logger.trace(False) from importlib import reload diff --git a/dill/logger.py b/dill/logger.py index 286ede32..9359d0e4 100644 --- a/dill/logger.py +++ b/dill/logger.py @@ -108,7 +108,7 @@ class TraceAdapter(logging.LoggerAdapter): Usage of logger with 'trace()' method: - >>> from .logger import adapter as logger # instead of 'from .logger import logger' + >>> from dill.logger import adapter as logger #NOTE: not dill.logger.logger >>> ... >>> def save_atype(pickler, obj): >>> logger.trace(pickler, "Message with %s and %r etc. 
placeholders", 'text', obj) diff --git a/dill/session.py b/dill/session.py index eb0d14d2..35789212 100644 --- a/dill/session.py +++ b/dill/session.py @@ -15,26 +15,22 @@ __all__ = [ 'FilterSet', 'ModuleFilters', 'dump_module', 'ipython_filter', 'is_pickled_module', 'load_module', 'load_module_asdict', 'size_filter', - 'dump_session', 'load_session' # backward compatibility + 'dump_session', 'load_session' # backward compatibility ] import logging logger = logging.getLogger('dill.session') -import builtins import contextlib -import pathlib import re import sys -import tempfile import warnings -from types import SimpleNamespace from dill import _dill, Pickler, Unpickler from ._dill import ( BuiltinMethodType, FunctionType, MethodType, ModuleType, TypeType, _import_module, _is_builtin_module, _is_imported_module, _main_module, - _reverse_typemap, + _reverse_typemap, __builtin__, ) from ._utils import FilterRules, FilterSet, RuleType, size_filter @@ -44,6 +40,9 @@ EXCLUDE, INCLUDE = RuleType.EXCLUDE, RuleType.INCLUDE +import pathlib +import tempfile + TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) class _PeekableReader: @@ -97,6 +96,7 @@ def _open(file, mode, *, peekable=False): def _module_map(): """get map of imported modules""" from collections import defaultdict + from types import SimpleNamespace modmap = SimpleNamespace( by_name=defaultdict(list), by_id=defaultdict(list), @@ -113,8 +113,8 @@ def _module_map(): return modmap IMPORTED_AS_TYPES = (ModuleType, TypeType, FunctionType, MethodType, BuiltinMethodType) -PyCapsuleType = _reverse_typemap.get('PyCapsuleType') -if PyCapsuleType is not None: IMPORTED_AS_TYPES += (PyCapsuleType,) +if 'PyCapsuleType' in _reverse_typemap: + IMPORTED_AS_TYPES += (_reverse_typemap['PyCapsuleType'],) IMPORTED_AS_MODULES = [re.compile(x) for x in ( 'ctypes', 'typing', 'subprocess', 'threading', @@ -628,7 +628,7 @@ def load_module_asdict( old_main = _import_module(main_name) main.__dict__.update(old_main.__dict__) else: - 
main.__builtins__ = builtins + main.__builtins__ = __builtin__ try: sys.modules[main_name] = main load_module(file, **kwds) @@ -753,3 +753,13 @@ def not_interactive_var(obj): return obj.name not in interactive_vars return not_interactive_var + + +# Internal exports for backward compatibility with dill v0.3.5.1 +# Can't be placed in dill._dill because of circular import problems. +for name in ( + '_lookup_module', '_module_map', '_restore_modules', '_stash_modules', + 'dump_session', 'load_session' # backward compatibility functions +): + setattr(_dill, name, globals()[name]) +del name diff --git a/docs/source/dill.rst b/docs/source/dill.rst index af64599c..2770af2a 100644 --- a/docs/source/dill.rst +++ b/docs/source/dill.rst @@ -25,6 +25,18 @@ detect module :imported-members: .. :exclude-members: ismethod, isfunction, istraceback, isframe, iscode, parent, reference, at, parents, children +logger module +------------- + +.. automodule:: dill.logger + :members: + :undoc-members: + :private-members: + :special-members: + :show-inheritance: + :imported-members: +.. :exclude-members: + objtypes module --------------- From 3abae56bc04fe91340fffc68f4da40ee5576a3be Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Fri, 22 Jul 2022 23:54:54 -0300 Subject: [PATCH 034/109] standardize argument/setting retrieving --- dill/_dill.py | 36 +++++++++++++++++++++--------------- dill/_utils.py | 5 ++++- dill/session.py | 14 ++++++-------- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index e3ce8add..557bb6bf 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -245,8 +245,7 @@ def copy(obj, *args, **kwds): See :func:`dumps` and :func:`loads` for keyword arguments. 
""" - from .settings import settings - ignore = kwds.pop('ignore', settings['ignore']) + ignore = kwds.pop('ignore', Unpickler.settings['ignore']) return loads(dumps(obj, *args, **kwds), ignore=ignore) def dump(obj, file, protocol=None, byref=None, fmode=None, recurse=None, **kwds):#, strictio=None): @@ -256,7 +255,7 @@ def dump(obj, file, protocol=None, byref=None, fmode=None, recurse=None, **kwds) See :func:`dumps` for keyword arguments. """ from .settings import settings - protocol = settings['protocol'] if protocol is None else int(protocol) + protocol = _getopt(settings, 'protocol', int(protocol)) _kwds = kwds.copy() _kwds.update(dict(byref=byref, fmode=fmode, recurse=recurse)) Pickler(file, protocol, **_kwds).dump(obj) @@ -344,25 +343,32 @@ class PicklingWarning(PickleWarning, PicklingError): class UnpicklingWarning(PickleWarning, UnpicklingError): pass +def _getopt(settings, key, arg=None, *, kwds=None): + if kwds is not None: + arg = kwds.pop(key, None) + if arg is not None: + return arg + while '.' 
in key: + prefix, _, key = key.partition('.') + settings = settings[prefix] + return settings[key] + ### Extend the Picklers class Pickler(StockPickler): """python's Pickler extended to interpreter sessions""" dispatch = MetaCatchingDict(StockPickler.dispatch.copy()) _session = False + from .settings import settings def __init__(self, file, *args, **kwds): - from .settings import settings - _byref = kwds.pop('byref', None) - #_strictio = kwds.pop('strictio', None) - _fmode = kwds.pop('fmode', None) - _recurse = kwds.pop('recurse', None) + settings = Pickler.settings StockPickler.__init__(self, file, *args, **kwds) self._main = _main_module self._diff_cache = {} - self._byref = settings['byref'] if _byref is None else _byref - self._strictio = False #_strictio - self._fmode = settings['fmode'] if _fmode is None else _fmode - self._recurse = settings['recurse'] if _recurse is None else _recurse + self._byref = _getopt(settings, 'byref', kwds=kwds) + self._fmode = _getopt(settings, 'fmode', kwds=kwds) + self._recurse = _getopt(settings, 'recurse', kwds=kwds) + self._strictio = False #_getopt(settings, 'strictio', kwds=kwds) self._postproc = OrderedDict() self._file = file @@ -416,6 +422,7 @@ def save_numpy_array(pickler, obj): class Unpickler(StockUnpickler): """python's Unpickler extended to interpreter sessions and more types""" + from .settings import settings _session = False def find_class(self, module, name): @@ -427,11 +434,10 @@ def find_class(self, module, name): return StockUnpickler.find_class(self, module, name) def __init__(self, *args, **kwds): - from .settings import settings - _ignore = kwds.pop('ignore', None) + settings = Pickler.settings StockUnpickler.__init__(self, *args, **kwds) self._main = _main_module - self._ignore = settings['ignore'] if _ignore is None else _ignore + self._ignore = _getopt(settings, 'ignore', kwds=kwds) def load(self): #NOTE: if settings change, need to update attributes obj = StockUnpickler.load(self) diff --git 
a/dill/_utils.py b/dill/_utils.py index 5f5e621f..f1013093 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -4,7 +4,10 @@ # Copyright (c) 2022 The Uncertainty Quantification Foundation. # License: 3-clause BSD. The full license text is available at: # - https://github.com/uqfoundation/dill/blob/master/LICENSE -"""auxiliary internal classes used in multiple submodules, set here to avoid import recursion""" +""" +Auxiliary classes and functions used in more than one module, defined here to +avoid circular import problems. +""" from __future__ import annotations diff --git a/dill/session.py b/dill/session.py index 35789212..2d8e1103 100644 --- a/dill/session.py +++ b/dill/session.py @@ -29,8 +29,8 @@ from dill import _dill, Pickler, Unpickler from ._dill import ( BuiltinMethodType, FunctionType, MethodType, ModuleType, TypeType, - _import_module, _is_builtin_module, _is_imported_module, _main_module, - _reverse_typemap, __builtin__, + _getopt, _import_module, _is_builtin_module, _is_imported_module, + _main_module, _reverse_typemap, __builtin__, ) from ._utils import FilterRules, FilterSet, RuleType, size_filter @@ -309,12 +309,9 @@ def dump_module( from .settings import settings protocol = settings['protocol'] - if refimported is None: - refimported = settings['dump_module']['refimported'] - if base_rules is None: - base_rules = settings['dump_module']['filters'] - else: - base_rules = ModuleFilters(base_rules) + refimported = _getopt(settings, 'dump_module.refimported', refimported) + base_rules = _getopt(settings, 'dump_module.filters', base_rules) + if type(base_rules) != ModuleFilters: base_rules = ModuleFilters(base_rules) main = module if main is None: @@ -499,6 +496,7 @@ def load_module( if module is not None: raise TypeError("both 'module' and 'main' arguments were used") module = kwds.pop('main') + main = module with _open(filename, 'rb', peekable=True) as file: #FIXME: dill.settings are disabled From 8fd687e287230205dc5f450b729297d1c4996f79 Mon Sep 17 
00:00:00 2001 From: Leonardo Gama Date: Sat, 23 Jul 2022 13:33:33 -0300 Subject: [PATCH 035/109] fixes --- dill/_dill.py | 12 ++++++------ dill/session.py | 9 +++++++-- dill/settings.py | 5 ++--- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 557bb6bf..9c7da9cc 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -255,10 +255,10 @@ def dump(obj, file, protocol=None, byref=None, fmode=None, recurse=None, **kwds) See :func:`dumps` for keyword arguments. """ from .settings import settings - protocol = _getopt(settings, 'protocol', int(protocol)) - _kwds = kwds.copy() - _kwds.update(dict(byref=byref, fmode=fmode, recurse=recurse)) - Pickler(file, protocol, **_kwds).dump(obj) + protocol = int(_getopt(settings, 'protocol', protocol)) + kwds = kwds.copy() + kwds.update(byref=byref, fmode=fmode, recurse=recurse) + Pickler(file, protocol, **kwds).dump(obj) return def dumps(obj, protocol=None, byref=None, fmode=None, recurse=None, **kwds):#, strictio=None): @@ -362,7 +362,6 @@ class Pickler(StockPickler): def __init__(self, file, *args, **kwds): settings = Pickler.settings - StockPickler.__init__(self, file, *args, **kwds) self._main = _main_module self._diff_cache = {} self._byref = _getopt(settings, 'byref', kwds=kwds) @@ -371,6 +370,7 @@ def __init__(self, file, *args, **kwds): self._strictio = False #_getopt(settings, 'strictio', kwds=kwds) self._postproc = OrderedDict() self._file = file + StockPickler.__init__(self, file, *args, **kwds) def dump(self, obj): #NOTE: if settings change, need to update attributes # register if the object is a numpy ufunc @@ -435,9 +435,9 @@ def find_class(self, module, name): def __init__(self, *args, **kwds): settings = Pickler.settings - StockUnpickler.__init__(self, *args, **kwds) self._main = _main_module self._ignore = _getopt(settings, 'ignore', kwds=kwds) + StockUnpickler.__init__(self, *args, **kwds) def load(self): #NOTE: if settings change, need to update attributes obj = 
StockUnpickler.load(self) diff --git a/dill/session.py b/dill/session.py index 2d8e1103..0c503dd0 100644 --- a/dill/session.py +++ b/dill/session.py @@ -753,11 +753,16 @@ def not_interactive_var(obj): return not_interactive_var +## Variables set in this module to avoid circular import problems. ## + +from .settings import settings +settings['dump_module']['filters'] = ModuleFilters(rules=()) + # Internal exports for backward compatibility with dill v0.3.5.1 -# Can't be placed in dill._dill because of circular import problems. for name in ( '_lookup_module', '_module_map', '_restore_modules', '_stash_modules', 'dump_session', 'load_session' # backward compatibility functions ): setattr(_dill, name, globals()[name]) -del name + +del name, settings diff --git a/dill/settings.py b/dill/settings.py index 88b886a3..0f1eede0 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -12,7 +12,6 @@ __all__ = ['settings'] from pickle import DEFAULT_PROTOCOL -from .session import ModuleFilters settings = { #'main' : None, @@ -23,9 +22,9 @@ 'recurse' : False, 'ignore' : False, 'dump_module' : { - 'filters': ModuleFilters(rules=()), + 'filters': None, #ModuleFilters(rules=()) # set in dill.session 'refimported': False, }, } -del DEFAULT_PROTOCOL, ModuleFilters +del DEFAULT_PROTOCOL From add61ba01be5dd56dc20f3ece7ee29f1facefb45 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sun, 24 Jul 2022 00:50:27 -0300 Subject: [PATCH 036/109] move common autodoc options to conf.py --- dill/__init__.py | 15 +++++---- dill/_dill.py | 6 ++-- dill/session.py | 6 ++-- docs/source/conf.py | 8 +++++ docs/source/dill.rst | 73 ++++++-------------------------------------- 5 files changed, 30 insertions(+), 78 deletions(-) diff --git a/dill/__init__.py b/dill/__init__.py index 87e4eb42..3571f54e 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -11,10 +11,10 @@ from .__info__ import __version__, __author__, __doc__, __license__ except: # pragma: no cover import os - import sys + import sys 
parent = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) sys.path.append(parent) - # get distribution meta info + # get distribution meta info from version import (__version__, __author__, get_license_text, get_readme_as_rst) __license__ = get_license_text(os.path.join(parent, 'LICENSE')) @@ -24,14 +24,14 @@ from ._dill import ( - Pickler, Unpickler, - check, copy, dump, dumps, load, loads, pickle, pickles, register, - DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, CONTENTS_FMODE, FILE_FMODE, HANDLE_FMODE, + dump, dumps, load, loads, copy, + Pickler, Unpickler, register, pickle, pickles, check, + DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, HANDLE_FMODE, CONTENTS_FMODE, FILE_FMODE, PickleError, PickleWarning, PicklingError, PicklingWarning, UnpicklingError, UnpicklingWarning, ) from .session import ( - dump_module, is_pickled_module, load_module, load_module_asdict, + dump_module, load_module, load_module_asdict, is_pickled_module, dump_session, load_session # backward compatibility ) from . import detect, logger, session, source, temp @@ -42,8 +42,6 @@ # make sure "trace" is turned off logger.trace(False) -from importlib import reload - objects = {} # local import of dill._objects #from . import _objects @@ -68,6 +66,7 @@ def load_types(pickleable=True, unpickleable=True): Returns: None """ + from importlib import reload # local import of dill.objects from . import _objects if pickleable: diff --git a/dill/_dill.py b/dill/_dill.py index 9c7da9cc..14d45b3d 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -16,9 +16,9 @@ Test against CH16+ Std. Lib. ... TBD. 
""" __all__ = [ - 'Pickler','Unpickler', - 'check','copy','dump','dumps','load','loads','pickle','pickles','register', - 'DEFAULT_PROTOCOL','HIGHEST_PROTOCOL','CONTENTS_FMODE','FILE_FMODE','HANDLE_FMODE', + 'dump','dumps','load','loads','copy', + 'Pickler','Unpickler','register','pickle','pickles','check', + 'DEFAULT_PROTOCOL','HIGHEST_PROTOCOL','HANDLE_FMODE','CONTENTS_FMODE','FILE_FMODE', 'PickleError','PickleWarning','PicklingError','PicklingWarning','UnpicklingError', 'UnpicklingWarning', ] diff --git a/dill/session.py b/dill/session.py index 0c503dd0..67ba86ff 100644 --- a/dill/session.py +++ b/dill/session.py @@ -7,14 +7,14 @@ # License: 3-clause BSD. The full license text is available at: # - https://github.com/uqfoundation/dill/blob/master/LICENSE """ -Pickle and restore the intepreter session. +Pickle and restore the intepreter session or a module's state. """ from __future__ import annotations __all__ = [ - 'FilterSet', 'ModuleFilters', 'dump_module', 'ipython_filter', - 'is_pickled_module', 'load_module', 'load_module_asdict', 'size_filter', + 'dump_module', 'load_module', 'load_module_asdict', 'is_pickled_module', + 'ModuleFilters', 'FilterRules', 'FilterSet', 'size_filter', 'ipython_filter', 'dump_session', 'load_session' # backward compatibility ] diff --git a/docs/source/conf.py b/docs/source/conf.py index ead9ed06..ff34cd55 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -66,6 +66,14 @@ # extension config github_project_url = "https://github.com/uqfoundation/dill" autoclass_content = 'both' +autodoc_default_options = { + 'members': True, + 'undoc-members': True, + 'private-members': True, + 'special-members': True, + 'show-inheritance': True, + 'exclude-members': '__dict__, __module__, __slots__, __weakref__', +} autodoc_typehints = 'description' napoleon_include_init_with_doc = True napoleon_include_private_with_doc = False diff --git a/docs/source/dill.rst b/docs/source/dill.rst index 2770af2a..db81dffe 100644 --- 
a/docs/source/dill.rst +++ b/docs/source/dill.rst @@ -5,107 +5,52 @@ dill module ----------- .. automodule:: dill._dill - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: +.. :exclude-members: + detect module ------------- .. automodule:: dill.detect - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: ismethod, isfunction, istraceback, isframe, iscode, parent, reference, at, parents, children +.. :exclude-members: +ismethod, isfunction, istraceback, isframe, iscode, parent, reference, at, parents, children logger module ------------- .. automodule:: dill.logger - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: +.. :exclude-members: + objtypes module --------------- .. automodule:: dill.objtypes - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: +.. :exclude-members: + pointers module --------------- .. automodule:: dill.pointers - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: +.. :exclude-members: + session module --------------- .. automodule:: dill.session - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: - :exclude-members: dump_session, load_session + :exclude-members: +dump_session, load_session settings module --------------- .. automodule:: dill.settings - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: +.. :exclude-members: + source module ------------- .. automodule:: dill.source - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. 
:exclude-members: +.. :exclude-members: + temp module ----------- .. automodule:: dill.temp - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: - +.. :exclude-members: + From f46d399b6dc65277f95d6d668f7555c628ec8c71 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sun, 24 Jul 2022 00:50:27 -0300 Subject: [PATCH 037/109] move common autodoc options to conf.py and exclude some special members Exclude special class members that are implementation details and give practically no useful information: - __dict__ (can generate really big strings) - __module__ - __slots__ - __weakref__ --- dill/__init__.py | 15 +++++---- dill/_dill.py | 6 ++-- dill/session.py | 4 +-- docs/source/conf.py | 8 +++++ docs/source/dill.rst | 73 ++++++-------------------------------------- 5 files changed, 29 insertions(+), 77 deletions(-) diff --git a/dill/__init__.py b/dill/__init__.py index 87e4eb42..3571f54e 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -11,10 +11,10 @@ from .__info__ import __version__, __author__, __doc__, __license__ except: # pragma: no cover import os - import sys + import sys parent = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) sys.path.append(parent) - # get distribution meta info + # get distribution meta info from version import (__version__, __author__, get_license_text, get_readme_as_rst) __license__ = get_license_text(os.path.join(parent, 'LICENSE')) @@ -24,14 +24,14 @@ from ._dill import ( - Pickler, Unpickler, - check, copy, dump, dumps, load, loads, pickle, pickles, register, - DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, CONTENTS_FMODE, FILE_FMODE, HANDLE_FMODE, + dump, dumps, load, loads, copy, + Pickler, Unpickler, register, pickle, pickles, check, + DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, HANDLE_FMODE, CONTENTS_FMODE, FILE_FMODE, PickleError, PickleWarning, PicklingError, PicklingWarning, UnpicklingError, UnpicklingWarning, ) from .session import ( - dump_module, 
is_pickled_module, load_module, load_module_asdict, + dump_module, load_module, load_module_asdict, is_pickled_module, dump_session, load_session # backward compatibility ) from . import detect, logger, session, source, temp @@ -42,8 +42,6 @@ # make sure "trace" is turned off logger.trace(False) -from importlib import reload - objects = {} # local import of dill._objects #from . import _objects @@ -68,6 +66,7 @@ def load_types(pickleable=True, unpickleable=True): Returns: None """ + from importlib import reload # local import of dill.objects from . import _objects if pickleable: diff --git a/dill/_dill.py b/dill/_dill.py index ca445bed..ab560a54 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -16,9 +16,9 @@ Test against CH16+ Std. Lib. ... TBD. """ __all__ = [ - 'Pickler','Unpickler', - 'check','copy','dump','dumps','load','loads','pickle','pickles','register', - 'DEFAULT_PROTOCOL','HIGHEST_PROTOCOL','CONTENTS_FMODE','FILE_FMODE','HANDLE_FMODE', + 'dump','dumps','load','loads','copy', + 'Pickler','Unpickler','register','pickle','pickles','check', + 'DEFAULT_PROTOCOL','HIGHEST_PROTOCOL','HANDLE_FMODE','CONTENTS_FMODE','FILE_FMODE', 'PickleError','PickleWarning','PicklingError','PicklingWarning','UnpicklingError', 'UnpicklingWarning', ] diff --git a/dill/session.py b/dill/session.py index 798b627a..626798e4 100644 --- a/dill/session.py +++ b/dill/session.py @@ -7,11 +7,11 @@ # License: 3-clause BSD. The full license text is available at: # - https://github.com/uqfoundation/dill/blob/master/LICENSE """ -Pickle and restore the intepreter session. +Pickle and restore the intepreter session or a module's state. 
""" __all__ = [ - 'dump_module', 'is_pickled_module', 'load_module', 'load_module_asdict', + 'dump_module', 'load_module', 'load_module_asdict', 'is_pickled_module', 'dump_session', 'load_session' # backward compatibility ] diff --git a/docs/source/conf.py b/docs/source/conf.py index ead9ed06..ff34cd55 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -66,6 +66,14 @@ # extension config github_project_url = "https://github.com/uqfoundation/dill" autoclass_content = 'both' +autodoc_default_options = { + 'members': True, + 'undoc-members': True, + 'private-members': True, + 'special-members': True, + 'show-inheritance': True, + 'exclude-members': '__dict__, __module__, __slots__, __weakref__', +} autodoc_typehints = 'description' napoleon_include_init_with_doc = True napoleon_include_private_with_doc = False diff --git a/docs/source/dill.rst b/docs/source/dill.rst index 2770af2a..db81dffe 100644 --- a/docs/source/dill.rst +++ b/docs/source/dill.rst @@ -5,107 +5,52 @@ dill module ----------- .. automodule:: dill._dill - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: +.. :exclude-members: + detect module ------------- .. automodule:: dill.detect - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: ismethod, isfunction, istraceback, isframe, iscode, parent, reference, at, parents, children +.. :exclude-members: +ismethod, isfunction, istraceback, isframe, iscode, parent, reference, at, parents, children logger module ------------- .. automodule:: dill.logger - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: +.. :exclude-members: + objtypes module --------------- .. automodule:: dill.objtypes - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. 
:exclude-members: +.. :exclude-members: + pointers module --------------- .. automodule:: dill.pointers - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: +.. :exclude-members: + session module --------------- .. automodule:: dill.session - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: - :exclude-members: dump_session, load_session + :exclude-members: +dump_session, load_session settings module --------------- .. automodule:: dill.settings - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: +.. :exclude-members: + source module ------------- .. automodule:: dill.source - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: +.. :exclude-members: + temp module ----------- .. automodule:: dill.temp - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: - +.. :exclude-members: + From bef579561ec9aeac8788ff26b6291dae2cd20d78 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sun, 24 Jul 2022 01:01:47 -0300 Subject: [PATCH 038/109] don't document trace() twice --- docs/source/dill.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/dill.rst b/docs/source/dill.rst index db81dffe..e18607db 100644 --- a/docs/source/dill.rst +++ b/docs/source/dill.rst @@ -17,7 +17,7 @@ logger module ------------- .. automodule:: dill.logger -.. 
:exclude-members: + + :exclude-members: +trace objtypes module --------------- From b26a100a4c674ea8e2d117db43eafd9045ed2d09 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sun, 24 Jul 2022 01:01:47 -0300 Subject: [PATCH 039/109] don't document trace() twice --- docs/source/dill.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/dill.rst b/docs/source/dill.rst index db81dffe..e18607db 100644 --- a/docs/source/dill.rst +++ b/docs/source/dill.rst @@ -17,7 +17,7 @@ logger module ------------- .. automodule:: dill.logger -.. :exclude-members: + + :exclude-members: +trace objtypes module --------------- From 0e25f1472ed7dc92d896ae25e3567f208379c299 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 26 Jul 2022 00:43:07 -0300 Subject: [PATCH 040/109] complete session documentation --- dill/_utils.py | 137 +++++++++++++++++++++-- dill/session.py | 263 +++++++++++++++++++++++++++++++++----------- docs/source/conf.py | 10 +- 3 files changed, 333 insertions(+), 77 deletions(-) diff --git a/dill/_utils.py b/dill/_utils.py index f1013093..04e99808 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -11,7 +11,7 @@ from __future__ import annotations -__all__ = ['FilterRules', 'Filter', 'RuleType', 'size_filter'] +__all__ = ['FilterRules', 'Filter', 'RuleType', 'size_filter', 'EXCLUDE', 'INCLUDE'] import math import random @@ -41,8 +41,10 @@ def _format_bytes_size(size: Union[int, float]) -> Tuple[int, str]: # Namespace filtering. 
-Filter = Union[str, Pattern[str], int, type, Callable] RuleType = Enum('RuleType', 'EXCLUDE INCLUDE', module=__name__) +EXCLUDE, INCLUDE = RuleType.EXCLUDE, RuleType.INCLUDE + +Filter = Union[str, Pattern[str], int, type, Callable] Rule = Tuple[RuleType, Union[Filter, Iterable[Filter]]] class NamedObject: @@ -70,9 +72,10 @@ def _iter(filters): @dataclass class FilterSet(MutableSet): - ids: Set[int] = field(default_factory=set) + """A superset of exclude/include filter sets.""" names: Set[str] = field(default_factory=set) regexes: Set[Pattern[str]] = field(default_factory=set) + ids: Set[int] = field(default_factory=set) types: Set[type] = field(default_factory=set) funcs: Set[Callable] = field(default_factory=set) _fields = None @@ -149,7 +152,25 @@ def get_type(cls, key): if cls._rtypemap is None: cls._rtypemap = {cls._get_typename(k): v for k, v in _dill._reverse_typemap.items()} return cls._rtypemap[cls._get_typename(key)] - def add_type(self, typename): + def add_type(self, typename: str) -> None: + """Add a type filter to the set by passsing the type name. + + Parameters: + typename: a type name (case insensitive). + + Example: + Add some type filters to default exclusion filters: + + >>> import dill + >>> filters = dill.settings['dump_module']['filters'] + >>> filters.exclude.add_type('type') + >>> filters.exclude.add_type('Function') + >>> filters.exclude.add_type('ModuleType') + >>> filters + , , }), + include=FilterSet()> + """ self.types.add(self.get_type(typename)) FilterSet._fields = tuple(field.name for field in fields(FilterSet)) @@ -176,6 +197,72 @@ def __delete__(self, obj): raise AttributeError(self.name) from None class FilterRules: + """Exclude and include rules for filtering a namespace. 
+ + Namespace filtering rules can be of two types, ``EXCLUDE`` and ``INCLUDE`` + rules, and of five "flavors": + + - `name`: match a variable name exactly; + - `regex`: match a variable name by regular expression; + - `id`: match a variable value by id; + - `type`: match a variable value by type (using ``isinstance``); + - `func`: callable filter, match a variable name and/or value by an + arbitrary logic. + + A `name` filter is specified by a simple string, e.g. 'some_var'. If its + value is not a valid Python identifier, it is treated as a regular + expression instead. + + A `regex` filter is specified either by a string containing a regular + expression, e.g. ``r'\w+_\d+'``, or by a :py:class:`re.Pattern` object. + + An `id` filter is specified by an ``int`` that corresponds to the id of an + object. For example, to exclude a specific object ``obj`` that may me + assigned to multiple variables, just use ``id(obj)`` as an `id` filter. + + A `type` filter is specified by a type-object, e.g. ``list`` or + ``type(some_var)``. For adding `type` filters by the type name, see + :py:func:`FilterSet.add_type`. + + A `func` filter can be any callable that accepts a single argument and + returns a boolean value, being it ``True`` if the object should be excluded + (or included) or ``False`` if it should *not* be excluded (or included). + The single argument is an object with two attributes: ``name`` is the + variable's name in the namespace and ``value`` is the object that it refers + to. Below are some examples of `func` filters. 
+ + Exclude objects that were renamed after definition: + + >>> renamed_filter = lambda obj: obj.name != getattr(obj.value, '__name__', obj.name) + + Strict type filter, exclude ``int`` but not ``bool`` (an ``int`` subclass): + + >>> int_filter = lambda obj: type(obj) == int + + Filters may be added interactively after creating an empty ``FilterRules`` + object: + + >>> from dill.session import FilterRules + >>> filters = FilterRules() + >>> filters.exclude.add('some_var') + >>> filters.exclude.add(r'__\w+') + >>> filters.include.add(r'__\w+__') # keep __dunder__ variables + + Or may be created all at once at initialization with "filter rule literals": + + >>> from dill.session import FilterRules, EXCLUDE, INCLUDE + >>> filters = FilterRules([ + ... (EXCLUDE, ['some_var', r'__\+']), + ... (INCLUDE, r'__\w+__'), + ... ]) + + The order that the exclude and include filters are added is irrelevant + because **exclude filters are always applied first**. Therefore, generally + the rules work as a blocklist, with include filters acting as exceptions to + the exclusion rules. However, **if there are only include filters, the + rules work as an allowlist** instead, and only the variables matched by the + include filters are kept. 
+ """ __slots__ = '_exclude', '_include' exclude = _FilterSetDescriptor() include = _FilterSetDescriptor() @@ -186,12 +273,23 @@ def __init__(self, rules: Union[Iterable[Rule], FilterRules] = None): self.update(rules) def __repr__(self): desc = [" 2 else " " - return sep.join(desc).replace("set()", "{}") + ">" + for attr in ('exclude', 'include'): + set_desc = getattr(self, attr, None) + if set_desc is None: + continue + set_desc = repr(set_desc) + set_desc = re.sub(r'(\w+=set\(\)(, )?)', '', set_desc).replace(', )', ')') + if len(set_desc) > 78: + set_desc = ["FilterSet("] + re.findall(r'\w+={.+?}', set_desc) + set_desc = ",\n ".join(set_desc) + "\n )" + set_desc = "%s=%s" % (attr, set_desc) + if attr == 'exclude' and hasattr(self, 'include'): + set_desc += ',' + desc.append(set_desc) + if len(desc) == 1: + desc += ["NOT SET"] + sep = "\n " if sum(len(x) for x in desc) > 78 else " " + return sep.join(desc) + ">" # Proxy add(), discard(), remove() and clear() to FilterSets. def __proxy__(self, method, filter, *, rule_type=RuleType.EXCLUDE): if not isinstance(rule_type, RuleType): @@ -282,7 +380,19 @@ def filter_vars(self, namespace: Dict[str, Any]) -> Dict[str, Any]: class size_filter: """Create a filter function with a limit for estimated object size. - Note: Doesn't work on PyPy. See ``help('``py:func:`sys.getsizeof```)'`` + Parameters: + limit: maximum size allowed in bytes. May be an absolute number of bytes + as an ``int`` or ``float``, or a string representing a size in bytes, + e.g. ``1000``, ``10e3``, ``"1000"``, ``"1k"`` and ``"1 KiB"`` are all + valid and roughly equivalent (the last one represents 1024 bytes). + recursive: if `False`, the function won't recurse into the object's + attributes and items to estimate its size. + + Returns: + A callable filter to be used with :py:func:`dump_module`. + + Note: + Doesn't work on PyPy. See ``help(sys.getsizeof)``. 
""" __slots__ = 'limit', 'recursive' # Cover "true" collections from 'builtins', 'collections' and 'collections.abc'. @@ -297,7 +407,10 @@ class size_filter: MINIMUM_SIZE = getsizeof(None, 16) MISSING_SLOT = object() - def __init__(self, limit: str, recursive: bool = True): + def __init__(self, + limit: Union[int, float, str], + recursive: bool = True, + ) -> Callable[NamedObject, bool]: if _dill.IS_PYPY: raise NotImplementedError("size_filter() is not implemented for PyPy") self.limit = limit diff --git a/dill/session.py b/dill/session.py index 67ba86ff..a375701f 100644 --- a/dill/session.py +++ b/dill/session.py @@ -32,13 +32,11 @@ _getopt, _import_module, _is_builtin_module, _is_imported_module, _main_module, _reverse_typemap, __builtin__, ) -from ._utils import FilterRules, FilterSet, RuleType, size_filter +from ._utils import FilterRules, FilterSet, size_filter, EXCLUDE, INCLUDE # Type hints. -from typing import Iterable, Optional, Union -from ._utils import Filter - -EXCLUDE, INCLUDE = RuleType.EXCLUDE, RuleType.INCLUDE +from typing import Callable, Iterable, Optional, Union +from ._utils import Filter, NamedObject, RuleType import pathlib import tempfile @@ -60,7 +58,8 @@ def close(self): def peek(self, n): stream = self.stream try: - if hasattr(stream, 'flush'): stream.flush() + if hasattr(stream, 'flush'): + stream.flush() position = stream.tell() stream.seek(position) # assert seek() works before reading chunk = stream.read(n) @@ -100,7 +99,7 @@ def _module_map(): modmap = SimpleNamespace( by_name=defaultdict(list), by_id=defaultdict(list), - top_level={}, + top_level={}, # top-level modules ) for modname, module in sys.modules.items(): if modname in ('__main__', '__mp_main__') or not isinstance(module, ModuleType): @@ -112,10 +111,12 @@ def _module_map(): modmap.by_id[id(modobj)].append((modobj, objname, modname)) return modmap +# Unique objects (with no duplicates) that may be imported with "import as". 
IMPORTED_AS_TYPES = (ModuleType, TypeType, FunctionType, MethodType, BuiltinMethodType) if 'PyCapsuleType' in _reverse_typemap: IMPORTED_AS_TYPES += (_reverse_typemap['PyCapsuleType'],) +# For unique objects of various types that have a '__module__' attribute. IMPORTED_AS_MODULES = [re.compile(x) for x in ( 'ctypes', 'typing', 'subprocess', 'threading', r'concurrent\.futures(\.\w+)?', r'multiprocessing(\.\w+)?' @@ -134,7 +135,10 @@ def _lookup_module(modmap, name, obj, main_module): return modname, objname return None, None +BUILTIN_CONSTANTS = (None, False, True, NotImplemented) + def _stash_modules(main_module, original_main): + """pop imported variables to be saved by reference in the __dill_imported* attributes""" modmap = _module_map() newmod = ModuleType(main_module.__name__) @@ -143,13 +147,14 @@ def _stash_modules(main_module, original_main): imported_top_level = [] # keep separated for backward compatibility original = {} for name, obj in main_module.__dict__.items(): + # Self-references. if obj is main_module: - original[name] = newmod # self-reference + original[name] = newmod elif obj is main_module.__dict__: original[name] = newmod.__dict__ - # Avoid incorrectly matching a singleton value in another package (ex.: __doc__). - elif any(obj is singleton for singleton in (None, False, True)) \ - or isinstance(obj, ModuleType) and _is_builtin_module(obj): # always saved by ref + # Avoid incorrectly matching a singleton value in another package (e.g. __doc__ == None). 
+ elif (any(obj is constant for constant in BUILTIN_CONSTANTS) # must compare by identity + or isinstance(obj, ModuleType) and _is_builtin_module(obj)): # always saved by ref original[name] = obj else: source_module, objname = _lookup_module(modmap, name, obj, main_module=original_main) @@ -169,27 +174,22 @@ def _stash_modules(main_module, original_main): newmod.__dill_imported = imported newmod.__dill_imported_as = imported_as newmod.__dill_imported_top_level = imported_top_level - if getattr(newmod, '__loader__', None) is None and _is_imported_module(main_module): - # Trick _is_imported_module() to force saving as an imported module. - newmod.__loader__ = True # will be discarded by save_module() return newmod else: return main_module def _restore_modules(unpickler, main_module): - try: - for modname, name in main_module.__dict__.pop('__dill_imported'): - main_module.__dict__[name] = unpickler.find_class(modname, name) - for modname, objname, name in main_module.__dict__.pop('__dill_imported_as'): - main_module.__dict__[name] = unpickler.find_class(modname, objname) - for modname, name in main_module.__dict__.pop('__dill_imported_top_level'): - main_module.__dict__[name] = __import__(modname) - except KeyError: - pass - -def _filter_vars(main, base_rules, exclude, include): + for modname, name in main_module.__dict__.pop('__dill_imported', ()): + main_module.__dict__[name] = unpickler.find_class(modname, name) + for modname, objname, name in main_module.__dict__.pop('__dill_imported_as', ()): + main_module.__dict__[name] = unpickler.find_class(modname, objname) + for modname, name in main_module.__dict__.pop('__dill_imported_top_level', ()): + main_module.__dict__[name] = _import_module(modname) + +def _filter_vars(main_module, base_rules, exclude, include): + """apply exclude/include filters from arguments *and* settings""" rules = FilterRules() - mod_rules = base_rules.get(main.__name__, base_rules) + mod_rules = base_rules.get(main_module.__name__, base_rules) 
rules.exclude |= mod_rules.get_filters(EXCLUDE) rules.include |= mod_rules.get_filters(INCLUDE) if exclude is not None: @@ -197,22 +197,22 @@ def _filter_vars(main, base_rules, exclude, include): if include is not None: rules.update([(INCLUDE, include)]) - namespace = rules.filter_vars(main.__dict__) - if namespace is main.__dict__: - return main + namespace = rules.filter_vars(main_module.__dict__) + if namespace is main_module.__dict__: + return main_module if logger.isEnabledFor(logging.INFO): excluded = {name: type(value).__name__ - for name, value in sorted(main.__dict__.items()) if name not in namespace} - excluded = str(excluded).translate({ord(","): "\n", ord("'"): None}) - logger.info("Objects excluded from dump_session():\n%s\n", excluded) + for name, value in sorted(main_module.__dict__.items()) if name not in namespace} + excluded = str(excluded).translate({ord(","): "\n ", ord("'"): None}) + logger.info("Objects excluded from dump_session():\n %s", excluded) - newmod = ModuleType(main.__name__) + newmod = ModuleType(main_module.__name__) newmod.__dict__.update(namespace) for name, obj in namespace.items(): - if obj is main: + if obj is main_module: setattr(newmod, name, newmod) - elif obj is main.__dict__: + elif obj is main_module.__dict__: setattr(newmod, name, newmod.__dict__) return newmod @@ -233,6 +233,13 @@ def dump_module( built with :py:class:`~types.ModuleType`), to a file. The pickled module can then be restored with the function :py:func:`load_module`. + Only a subset of the module's variables may be saved if exclusion/inclusion + filters are specified. Filters apply to every variable name or value and + determine if they should be saved or not. They can be set in + ``dill.settings['dump_module']['filters']`` or passed directly to the + ``exclude`` and ``include`` parameters. See :py:class:`ModuleFilters` for + details. + Parameters: filename: a path-like object or a writable stream. 
module: a module object or the name of an importable module. If `None` @@ -288,6 +295,12 @@ def dump_module( >>> [foo.sin(x) for x in foo.values] [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] + - Save current session but exclude some variables: + + >>> import dill + >>> num, text, alist = 1, 'apple', [4, 9, 16] + >>> dill.dump_module(exclude=['text', int])) # only 'alist' is saved + *Changed in version 0.3.6:* Function ``dump_session()`` was renamed to ``dump_module()``. Parameters ``main`` and ``byref`` were renamed to ``module`` and ``refimported``, respectively. @@ -324,15 +337,25 @@ def dump_module( main = _filter_vars(main, base_rules, exclude, include) if refimported: main = _stash_modules(main, original_main) + if main is not original_main: + # Some empty attributes like __doc__ may have been added by ModuleType(). + added_names = set(main.__dict__) + added_names.difference_update(original_main.__dict__) + added_names.difference_update('__dill_imported%s' % s for s in ('', '_as', '_top_level')) + for name in added_names: + delattr(main, name) + if getattr(main, '__loader__', None) is None and _is_imported_module(original_main): + # Trick _is_imported_module() to force saving this as an imported module. 
+ main.__loader__ = True # will be discarded by _dill.save_module() with _open(filename, 'wb') as file: pickler = Pickler(file, protocol, **kwds) - if main is not original_main: - pickler._original_main = original_main pickler._main = main #FIXME: dill.settings are disabled pickler._byref = False # disable pickling by name reference pickler._recurse = False # disable pickling recursion for globals pickler._session = True # is best indicator of when pickling a session pickler._first_pass = True + if main is not original_main: + pickler._original_main = original_main pickler.dump(main) return @@ -350,10 +373,12 @@ def _identify_module(file, main=None): try: for opcode, arg, pos in genops(file.peek(256)): if not found_import: + # Find the first '_import_module' constructor. if opcode.name in ('GLOBAL', 'SHORT_BINUNICODE') and \ arg.endswith('_import_module'): found_import = True else: + # Just after that, the argument is the main module name. if opcode.name in UNICODE: return arg else: @@ -361,12 +386,12 @@ def _identify_module(file, main=None): except (NotImplementedError, ValueError) as error: # ValueError occours when the end of the chunk is reached (without a STOP). if isinstance(error, NotImplementedError) and main is not None: - # file is not peekable, but we have main. + # The file is not peekable, but we have the argument main. return None raise UnpicklingError("unable to identify main module") from error def is_pickled_module(filename, importable: bool = True) -> bool: - """Check if a file is a module state pickle file. + """Check if file is a pickled module state. Parameters: filename: a path-like object or a readable stream. @@ -377,6 +402,31 @@ def is_pickled_module(filename, importable: bool = True) -> bool: `True` if the pickle file at ``filename`` was generated with :py:func:`dump_module` **AND** the module whose state is saved in it is of the kind specified by the ``importable`` argument. `False` otherwise. 
+ + Examples: + Create three types of pickle files: + + >>> import dill + >>> import types + >>> dill.dump_module('module_session.pkl') # saves __main__ + >>> dill.dump_module('module_object.pkl', module=types.ModuleType('example')) + >>> with open('common_object.pkl', 'wb') as file: + >>> dill.dump('example', file) + + Test each file's kind: + + >>> dill.is_pickled_module('module_session.pkl') # the module is importable + True + >>> dill.is_pickled_module('module_session.pkl', importable=False) + False + >>> dill.is_pickled_module('module_object.pkl') # the module is not importable + False + >>> dill.is_pickled_module('module_object.pkl', importable=False) + True + >>> dill.is_pickled_module('common_object.pkl') # always return False + False + >>> dill.is_pickled_module('common_object.pkl', importable=False) + False """ with _open(filename, 'rb', peekable=True) as file: try: @@ -497,13 +547,13 @@ def load_module( raise TypeError("both 'module' and 'main' arguments were used") module = kwds.pop('main') - main = module with _open(filename, 'rb', peekable=True) as file: #FIXME: dill.settings are disabled unpickler = Unpickler(file, **kwds) unpickler._session = True - # Resolve unpickler._main + # Resolve main. + main = module pickle_main = _identify_module(file, main) if main is None: main = pickle_main @@ -515,7 +565,6 @@ def load_module( main = _import_module(main) if not isinstance(main, ModuleType): raise TypeError("%r is not a module" % main) - unpickler._main = main # Check against the pickle's main. 
is_main_imported = _is_imported_module(main) @@ -526,17 +575,18 @@ def load_module( error_msg = "can't update{} module{} %r with the saved state of{} module{} %r" if main.__name__ != pickle_main: raise ValueError(error_msg.format("", "", "", "") % (main.__name__, pickle_main)) - if is_runtime_mod and is_main_imported: + elif is_runtime_mod and is_main_imported: raise ValueError( error_msg.format(" imported", "", "", "-type object") % (main.__name__, main.__name__) ) - if not is_runtime_mod and not is_main_imported: + elif not is_runtime_mod and not is_main_imported: raise ValueError( error_msg.format("", "-type object", " imported", "") % (main.__name__, main.__name__) ) + # Load the module's state. try: if not is_main_imported: # This is for find_class() to be able to locate it. @@ -619,31 +669,94 @@ def load_module_asdict( raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") with _open(filename, 'rb', peekable=True) as file: main_name = _identify_module(file) - old_main = sys.modules.get(main_name) + original_main = sys.modules.get(main_name) main = ModuleType(main_name) if update: - if old_main is None: - old_main = _import_module(main_name) - main.__dict__.update(old_main.__dict__) + if original_main is None: + original_main = _import_module(main_name) + main.__dict__.update(original_main.__dict__) else: main.__builtins__ = __builtin__ try: sys.modules[main_name] = main load_module(file, **kwds) finally: - if old_main is None: + if original_main is None: del sys.modules[main_name] else: - sys.modules[main_name] = old_main + sys.modules[main_name] = original_main main.__session__ = str(filename) return main.__dict__ -############################# -# Module filters settings # -############################# - class ModuleFilters(FilterRules): + """Stores default filtering rules for modules. + + :py:class:`FilterRules` subclass with a tree-like structure that may hold + exclusion/inclusion filters for specific modules and submodules. 
See the + base class documentation to learn more about how to create and use filters. + + This is the type of ``dill.settings['dump_module']['filters']``: + + >>> import dill + >>> filters = dill.settings['dump_module']['filters'] + >>> filters + + + Exclusion and inclusion filters for global variables can be added using the + ``add()`` methods of the ``exclude`` and ``include`` attributes, or of the + ``ModuleFilters`` object itself. In the latter case, the filter is added to + its ``exclude`` :py:class:`FilterSet` by default: + + >>> filters.add('some_var') # exclude a variable named 'some_var' + >>> filters.exclude.add('_.*') # exclude any variable with a name prefixed by '_' + >>> filters.include.add('_keep_this') # an exception to the rule above + + + Similarly, a filter can be discarded with the ``discard()`` method: + + >>> filters.discard('some_var') + >>> filters.exclude.discard('_.*') + >>> filters + + + Note how, after the last operation, ``filters.exclude`` was left empty but + ``filters.include`` still contains a name filter. In cases like this, i.e. + when ``len(filters.exclude) == 0 and len(filters.include) > 0.``, the + filters are treated as an "allowlist", which means that **only** the + variables that match the ``include`` filters will be pickled. In this + example, only the variable ``_keep_this``, if it existed, would be saved. 
+ + To create filters specific for a module and its submodules, use the + following syntax to add a child node to the default ``ModuleFilters``: + + >>> from dill.session import EXCLUDE, INCLUDE + >>> filters['foo'] = [] + >>> filters['foo'] # override default with empty rules for module 'foo' + + >>> filters['bar.baz'] = [(EXCLUDE, r'\w+\d+'), (INCLUDE, 'ERROR404')] + >>> filters['bar.baz'] # specific rules for the submodule 'bar.baz' + + >>> filters['bar'] # but the default rules would apply for the module 'bar' + + + Module-specific filter rules may be accessed using different syntaxes: + + >>> filters['bar.baz'] is filters['bar']['baz'] is filters.bar.baz + True + + Note, however, that using the attribute syntax to directly set rules for + a submodule will fail if its parent module doesn't have an entry yet: + + >>> filters.parent.child = [] # filters.parent doesn't exist + AttributeError: 'ModuleFilters' object has no attribute 'parent' + >>> filters['parent.child'] = [] # use this syntax instead + >>> filters.parent.child.grandchild = [(EXCLUDE, str)] # works fine + """ __slots__ = 'module', '_parent', '__dict__' _fields = tuple(x.lstrip('_') for x in FilterRules.__slots__) def __init__(self, @@ -654,11 +767,13 @@ def __init__(self, # Don't call super().__init__() if rules is not None: super().__init__(rules) + if parent is not None and parent.module != 'DEFAULT': + module = '%s.%s' % (parent.module, module) super().__setattr__('module', module) super().__setattr__('_parent', parent) def __repr__(self): desc = "DEFAULT" if self.module == 'DEFAULT' else "for %r" % self.module - return "" % (desc, super().__repr__()) + return " Callable[NamedObject, bool]: + """Filter factory to exclude IPython hidden variables. -############################## -# Session filter factories # -############################## + When saving the session with :py:func:`dump_module` in an IPython + interpreter, hidden variables, i.e. 
variables listed by ``dir()`` but + not listed by the ``%who`` magic command, are saved unless they are excluded + by filters. This function generates a filter that will exclude these hidden + variables from the list of saved variables, with the optional exception of + command history variables. -def ipython_filter(*, keep_history: str = 'input'): - """filter factory for IPython sessions (can't be added to settings currently) + Parameters: + keep_history: whether to keep (i.e. not exclude) the input and output + history of the IPython interactive session. Accepted values: - Usage: - >>> from dill.session import * - >>> dump_session(exclude=[ipython_filter()]) + - `'input'`: the input history contained in the hidden variables + ``In``, ``_ih``, ``_i``, ``_i1``, ``_i2``, etc. will be saved. + - `'output'`, the output history contained in the hidden variables + ``Out``, ``_oh``, ``_``, ``_1``, ``_2``, etc. will be saved. + - `'both'`: both the input and output history will be saved. + - `'none'`: all the hidden history variables will be excluded. + + Returns: + An exclude filter function to be used with :py:func:`dump_module`. + + Example: + + >>> import dill + >>> from dill.session import ipython_filter + >>> dill.dump_module(exclude=ipython_filter(keep_history='none')) """ HISTORY_OPTIONS = {'input', 'output', 'both', 'none'} if keep_history not in HISTORY_OPTIONS: @@ -736,7 +871,7 @@ def ipython_filter(*, keep_history: str = 'input'): nonmatching = object() # This can never be in user_ns interactive_vars = {x for x in user_ns if user_ns[x] is not user_ns_hidden.get(x, nonmatching)} - # Input and output history. + # Input and output history hidden variables. 
history_regex = [] if keep_history in {'input', 'both'}: interactive_vars |= {'_ih', 'In', '_i', '_ii', '_iii'} diff --git a/docs/source/conf.py b/docs/source/conf.py index ff34cd55..20fff7f4 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -72,7 +72,15 @@ 'private-members': True, 'special-members': True, 'show-inheritance': True, - 'exclude-members': '__dict__, __module__, __slots__, __weakref__', + 'exclude-members': ( #NOTE: this is a single string concatenation + '__dict__,' # implementation detail (may be verbose) + '__slots__,' # implementation detail + '__module__,' # implementation detail + '__weakref__,' # built-in automatic attribute, mostly meaningless + '__annotations__,' # redundant with signature documentation + '__dataclass_fields__,' # dataclass automatic attribute, redundant + '_abc_impl,' # implementation detail + ) } autodoc_typehints = 'description' napoleon_include_init_with_doc = True From a0e4a7ced59468844c0e959b1443f25b57e87c5e Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 26 Jul 2022 00:56:46 -0300 Subject: [PATCH 041/109] optimize FilterSet._match_type() --- dill/_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dill/_utils.py b/dill/_utils.py index 04e99808..0c49fe7b 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -82,14 +82,15 @@ class FilterSet(MutableSet): _rtypemap = None _typename_regex = re.compile(r'\w+(?=Type$)|\w+$', re.IGNORECASE) def _match_type(self, filter): - if isinstance(filter, str): + filter_type = type_filter + if filter_type == str: if filter.isidentifier(): field = 'names' else: filter, field = re.compile(filter), 'regexes' - elif isinstance(filter, re.Pattern): + elif filter_type == re.Pattern: field = 'regexes' - elif isinstance(filter, int): + elif filter_type == int: field = 'ids' elif isinstance(filter, type): field = 'types' From 55d53863293878f9233f60f35fa01e1e41e728fe Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 26 Jul 2022 01:09:58 -0300 
Subject: [PATCH 042/109] add _firt_pass and _original_main attributes to Pickler class --- dill/_dill.py | 2 ++ dill/_utils.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dill/_dill.py b/dill/_dill.py index 14d45b3d..c5335da4 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -358,6 +358,8 @@ class Pickler(StockPickler): """python's Pickler extended to interpreter sessions""" dispatch = MetaCatchingDict(StockPickler.dispatch.copy()) _session = False + _first_pass = False + _original_main = None from .settings import settings def __init__(self, file, *args, **kwds): diff --git a/dill/_utils.py b/dill/_utils.py index 0c49fe7b..0b43b15c 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -82,7 +82,7 @@ class FilterSet(MutableSet): _rtypemap = None _typename_regex = re.compile(r'\w+(?=Type$)|\w+$', re.IGNORECASE) def _match_type(self, filter): - filter_type = type_filter + filter_type = type(filter) if filter_type == str: if filter.isidentifier(): field = 'names' From a73801bba3c2fc63e24419e6ed43de839a7f9d8d Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 26 Jul 2022 09:37:20 -0300 Subject: [PATCH 043/109] FilterSet: substitute the add_type method by a special string syntax "type:" --- dill/_utils.py | 49 +++++++++++++----------------------- dill/session.py | 4 +-- dill/tests/test_filtering.py | 6 ++--- 3 files changed, 23 insertions(+), 36 deletions(-) diff --git a/dill/_utils.py b/dill/_utils.py index 0b43b15c..d4bdc6da 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -81,13 +81,17 @@ class FilterSet(MutableSet): _fields = None _rtypemap = None _typename_regex = re.compile(r'\w+(?=Type$)|\w+$', re.IGNORECASE) - def _match_type(self, filter): + def _match_type(self, filter: Filter) -> Tuple[filter, str]: filter_type = type(filter) if filter_type == str: if filter.isidentifier(): field = 'names' + elif filter.startswith('type:'): + filter = self.get_type(filter.partition(':')[-1].strip()) + field = 'types' else: - filter, field = 
re.compile(filter), 'regexes' + filter = re.compile(filter) + field = 'regexes' elif filter_type == re.Pattern: field = 'regexes' elif filter_type == int: @@ -146,33 +150,14 @@ def update(self, filters): def copy(self): return FilterSet(*(getattr(self, field).copy() for field in self._fields)) @classmethod - def _get_typename(cls, key): - return cls._typename_regex.match(key).group().lower() + def _get_typekey(cls, typename: str) -> str: + return cls._typename_regex.match(typename).group().lower() @classmethod - def get_type(cls, key): + def get_type(cls, typename: str) -> type: + """retrieve a type registered in ``dill``'s "reverse typemap"'""" if cls._rtypemap is None: - cls._rtypemap = {cls._get_typename(k): v for k, v in _dill._reverse_typemap.items()} - return cls._rtypemap[cls._get_typename(key)] - def add_type(self, typename: str) -> None: - """Add a type filter to the set by passsing the type name. - - Parameters: - typename: a type name (case insensitive). - - Example: - Add some type filters to default exclusion filters: - - >>> import dill - >>> filters = dill.settings['dump_module']['filters'] - >>> filters.exclude.add_type('type') - >>> filters.exclude.add_type('Function') - >>> filters.exclude.add_type('ModuleType') - >>> filters - , , }), - include=FilterSet()> - """ - self.types.add(self.get_type(typename)) + cls._rtypemap = {cls._get_typekey(k): v for k, v in _dill._reverse_typemap.items()} + return cls._rtypemap[cls._get_typekey(typename)] FilterSet._fields = tuple(field.name for field in fields(FilterSet)) class _FilterSetDescriptor: @@ -211,8 +196,8 @@ class FilterRules: arbitrary logic. A `name` filter is specified by a simple string, e.g. 'some_var'. If its - value is not a valid Python identifier, it is treated as a regular - expression instead. + value is not a valid Python identifier, except for the special `type` case + below, it is treated as a regular expression instead. 
A `regex` filter is specified either by a string containing a regular expression, e.g. ``r'\w+_\d+'``, or by a :py:class:`re.Pattern` object. @@ -222,8 +207,10 @@ class FilterRules: assigned to multiple variables, just use ``id(obj)`` as an `id` filter. A `type` filter is specified by a type-object, e.g. ``list`` or - ``type(some_var)``. For adding `type` filters by the type name, see - :py:func:`FilterSet.add_type`. + ``type(some_var)``, or by a string with the format ``"type:"``, + where ```` is a type name (case insensitive) known by ``dill`` , + e.g. ``"type:function"`` or ``"type: FunctionType"``. These include all + the types defined in the module :py:module:`types` and many more. A `func` filter can be any callable that accepts a single argument and returns a boolean value, being it ``True`` if the object should be excluded diff --git a/dill/session.py b/dill/session.py index a375701f..7844354c 100644 --- a/dill/session.py +++ b/dill/session.py @@ -186,7 +186,7 @@ def _restore_modules(unpickler, main_module): for modname, name in main_module.__dict__.pop('__dill_imported_top_level', ()): main_module.__dict__[name] = _import_module(modname) -def _filter_vars(main_module, base_rules, exclude, include): +def _filter_vars(main_module, exclude, include, base_rules): """apply exclude/include filters from arguments *and* settings""" rules = FilterRules() mod_rules = base_rules.get(main_module.__name__, base_rules) @@ -334,7 +334,7 @@ def dump_module( if not isinstance(main, ModuleType): raise TypeError("%r is not a module" % main) original_main = main - main = _filter_vars(main, base_rules, exclude, include) + main = _filter_vars(main, exclude, include, base_rules) if refimported: main = _stash_modules(main, original_main) if main is not original_main: diff --git a/dill/tests/test_filtering.py b/dill/tests/test_filtering.py index 3a3444d5..c51e1549 100644 --- a/dill/tests/test_filtering.py +++ b/dill/tests/test_filtering.py @@ -65,9 +65,9 @@ def 
test_exclude_include(): def test_add_type(): type_rules = FilterRules() # Formats accepted (actually case insensitive): - type_rules.exclude.add_type('function') # 1. typename - type_rules.exclude.add_type('Type') # 2. Typename - type_rules.exclude.add_type('ModuleType') # 2. TypenameType + type_rules.exclude.add('type: function') # 1. typename + type_rules.exclude.add('type: Type ') # 2. Typename + type_rules.exclude.add('type:ModuleType') # 2. TypenameType NS_copy = NS.copy() NS_copy.update(F=test_basic_filtering, T=FilterRules, M=_dill) assert did_exclude(NS_copy, type_rules, excluded_subset={'F', 'T', 'M', 'Integer'}) From 10369b165c5fc269de8f2ba22134b3d6ff953ed8 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 26 Jul 2022 09:40:58 -0300 Subject: [PATCH 044/109] rename FilterRules.filter_vars() to FilterRules.apply_filters() --- dill/_utils.py | 2 +- dill/session.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dill/_utils.py b/dill/_utils.py index d4bdc6da..3c804791 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -326,7 +326,7 @@ def _apply_filters(self, filter_set, objects): objects = filterfalse(filter, objects) return objects - def filter_vars(self, namespace: Dict[str, Any]) -> Dict[str, Any]: + def apply_filters(self, namespace: Dict[str, Any]) -> Dict[str, Any]: """Apply filters to dictionary with names as keys.""" if not namespace or not (self.exclude or self.include): return namespace diff --git a/dill/session.py b/dill/session.py index 7844354c..8042a367 100644 --- a/dill/session.py +++ b/dill/session.py @@ -197,7 +197,7 @@ def _filter_vars(main_module, exclude, include, base_rules): if include is not None: rules.update([(INCLUDE, include)]) - namespace = rules.filter_vars(main_module.__dict__) + namespace = rules.apply_filters(main_module.__dict__) if namespace is main_module.__dict__: return main_module From 953b5e049af28fe55c33586decae1de4602e4323 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 26 Jul 2022 
20:46:20 -0300 Subject: [PATCH 045/109] fix is_pickled_module() --- dill/_dill.py | 2 +- dill/session.py | 38 ++++++++++++++++++++++++-------------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index ab560a54..9e2568e3 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -39,7 +39,7 @@ #XXX: get types from .objtypes ? import builtins as __builtin__ from pickle import _Pickler as StockPickler, Unpickler as StockUnpickler -from pickle import BINPUT, DICT, EMPTY_DICT, LONG_BINPUT, MARK, PUT, SETITEM +from pickle import DICT, EMPTY_DICT, MARK, SETITEM from struct import pack from _thread import LockType from _thread import RLock as RLockType diff --git a/dill/session.py b/dill/session.py index 626798e4..92861bbe 100644 --- a/dill/session.py +++ b/dill/session.py @@ -20,7 +20,7 @@ import sys import warnings -from dill import _dill, Pickler, Unpickler +from dill import _dill, Pickler, Unpickler, UnpicklingError from ._dill import ( BuiltinMethodType, FunctionType, MethodType, ModuleType, TypeType, _import_module, _is_builtin_module, _is_imported_module, _main_module, @@ -326,20 +326,30 @@ def dump_session(filename=str(TEMPDIR/'session.pkl'), main=None, byref=False, ** def _identify_module(file, main=None): """identify the name of the module stored in the given file-type object""" - from pickletools import genops - UNICODE = {'UNICODE', 'BINUNICODE', 'SHORT_BINUNICODE'} - found_import = False + import pickletools + NEUTRAL = {'PROTO', 'FRAME', 'PUT', 'BINPUT', 'MEMOIZE', 'MARK', 'STACK_GLOBAL'} + opcodes = ((opcode.name, arg) for opcode, arg, pos in pickletools.genops(file.peek(256)) + if opcode.name not in NEUTRAL) try: - for opcode, arg, pos in genops(file.peek(256)): - if not found_import: - if opcode.name in ('GLOBAL', 'SHORT_BINUNICODE') and \ - arg.endswith('_import_module'): - found_import = True - else: - if opcode.name in UNICODE: - return arg - else: - raise UnpicklingError("reached STOP without finding main module") 
+ opcode, arg = next(opcodes) + if (opcode, arg) == ('SHORT_BINUNICODE', 'dill._dill'): + # The file uses STACK_GLOBAL instead of GLOBAL. + opcode, arg = next(opcodes) + if not (opcode in ('SHORT_BINUNICODE', 'GLOBAL') and arg.split()[-1] == '_import_module'): + raise ValueError + opcode, arg = next(opcodes) + if not opcode in ('SHORT_BINUNICODE', 'BINUNICODE', 'UNICODE'): + raise ValueError + module_name = arg + if not ( + next(opcodes)[0] in ('TUPLE1', 'TUPLE') and + next(opcodes)[0] == 'REDUCE' and + next(opcodes)[0] in ('EMPTY_DICT', 'DICT') + ): + raise ValueError + return module_name + except StopIteration: + raise UnpicklingError("reached STOP without finding main module") from None except (NotImplementedError, ValueError) as error: # ValueError occours when the end of the chunk is reached (without a STOP). if isinstance(error, NotImplementedError) and main is not None: From d30e7c3bc59588f11674746fab9a14cb52a245ea Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Fri, 29 Jul 2022 20:37:35 -0300 Subject: [PATCH 046/109] add dill.read_settings() to read INI files --- dill/__init__.py | 2 +- dill/_utils.py | 4 +- dill/session.py | 12 ++-- dill/settings.py | 176 +++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 180 insertions(+), 14 deletions(-) diff --git a/dill/__init__.py b/dill/__init__.py index 3571f54e..dc984cca 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -37,7 +37,7 @@ from . 
import detect, logger, session, source, temp # get global settings -from .settings import settings +from .settings import settings, read_settings # make sure "trace" is turned off logger.trace(False) diff --git a/dill/_utils.py b/dill/_utils.py index 3c804791..319d33d4 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -170,7 +170,7 @@ def __set__(self, obj, value): if isinstance(value, FilterSet): setattr(obj, self._name, value) else: - setattr(obj, self._name, FilterSet(value)) + setattr(obj, self._name, FilterSet._from_iterable(value)) def __get__(self, obj, objtype=None): try: return getattr(obj, self._name) @@ -210,7 +210,7 @@ class FilterRules: ``type(some_var)``, or by a string with the format ``"type:"``, where ```` is a type name (case insensitive) known by ``dill`` , e.g. ``"type:function"`` or ``"type: FunctionType"``. These include all - the types defined in the module :py:module:`types` and many more. + the types defined in the module :py:mod:`types` and many more. A `func` filter can be any callable that accepts a single argument and returns a boolean value, being it ``True`` if the object should be excluded diff --git a/dill/session.py b/dill/session.py index 8042a367..31aff717 100644 --- a/dill/session.py +++ b/dill/session.py @@ -757,7 +757,7 @@ class ModuleFilters(FilterRules): >>> filters['parent.child'] = [] # use this syntax instead >>> filters.parent.child.grandchild = [(EXCLUDE, str)] # works fine """ - __slots__ = 'module', '_parent', '__dict__' + __slots__ = '_module', '_parent', '__dict__' _fields = tuple(x.lstrip('_') for x in FilterRules.__slots__) def __init__(self, rules: Union[Iterable[Rule], FilterRules] = None, @@ -767,12 +767,12 @@ def __init__(self, # Don't call super().__init__() if rules is not None: super().__init__(rules) - if parent is not None and parent.module != 'DEFAULT': - module = '%s.%s' % (parent.module, module) - super().__setattr__('module', module) + if parent is not None and parent._module != 'DEFAULT': + module 
= '%s.%s' % (parent._module, module) + super().__setattr__('_module', module) super().__setattr__('_parent', parent) def __repr__(self): - desc = "DEFAULT" if self.module == 'DEFAULT' else "for %r" % self.module + desc = "DEFAULT" if self._module == 'DEFAULT' else "for %r" % self._module return " None: + """Read dill settings from an INI file. + + Update the ``dill.settings`` dictionary with the contents of the INI file + ``filename``. Accepted file sections: + + - `dill`: general :py:mod:`dill` settings + - `dill.module`: settings for :py:func:`dill.dump_module` + - `filters`: default exclude/include filters for :py:func:`dill.dump_module` + - `filters.`: module-specific filters for + :py:func:`dill.dump_module`, where `` is the complete module + path in the form `module[.submodule...]` + + Accepted option values for general settings: + + - boolean options (case insensitive): yes, no, on, off, true, false + - `protocol`: DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, 0, 1, 2, 3, ... + - `fmode`: HANDLE_FMODE, 0, CONTENTS_FMODE, 1, FILE_FMODE, 2 + + .. Lexer 'pacmanconf' generates better highlighting than 'ini'. + .. code-block:: pacmanconf + + [dill] + # General settings + ## Stored in dill.settings. + protocol = HIGHEST_PROTOCOL + byref = yes + + [dill.dump_module] + # Settings for dill.dump_module() + ## Stored in dill.settings['dump_module']. + refimported = yes + + [filters] + # Default exclude/include filters for dill.dump_module() + ## Stored in dill.settings['dump_module']['filters']. + exclude.names = some_var, SomeClass + exclude.regexes = '_.+' + exclude.types = function, ModuleType, io.BytesIO + exclude.funcs = + lambda obj: type(obj.value) == int + dill.session.size_filter('10 KB') + include = _keep_this_var, '__.+__' + + [filters.some.module] + # Filter rules specific to the module 'some.module' + ## Reuse regex filters defined in the previous section. + ## Option 'include' is unset, will fall back to default 'include' filters. 
+ exclude = ${filters:exclude.regexes} + #include = + + [filters.another.module] + # Filter rules specifit to the module 'another.module' + ## Empty filter sets disable filtering for this module. + exclude = + include = + + Parameters: + filename: a path-like object or a readable stream. + + Tip: + The parser uses default syntax with extended interpolation enabled. + For details about the accepted INI format, see :py:mod:`configparser`. + """ + import configparser + from dill import DEFAULT_PROTOCOL, HANDLE_FMODE + from dill.session import ModuleFilters + + cp = configparser.ConfigParser( + dict_type=dict, # internal, in place of OrderedDict + empty_lines_in_values=False, # one value per line + interpolation=configparser.ExtendedInterpolation(), + ) + cp.read_dict(DEFAULT_SETTINGS) + if hasattr(filename, 'readline'): + cp.read_file(filename) + else: + cp.read(filename) + + # General settings. + section = cp['dill'] + new_settings = {k: section.getboolean(k) + for k, v in DEFAULT_SETTINGS['dill'].items() if type(v) == bool} + fmode = section.get('fmode') + protocol = section.get('protocol') + new_settings['fmode'] = int(FMODES.get(fmode, fmode)) + new_settings['protocol'] = int(STANDARD_PROTOCOLS.get(protocol, protocol)) + + # dump_module() settings. + new_settings['dump_module'] = { + 'refimported': cp.getboolean('dill.dump_module', 'refimported'), + 'filters': ModuleFilters(rules=()), + } + filters = new_settings['dump_module']['filters'] + if 'filters' in cp: + # Default filters. + _read_filters(cp['filters'], filters) + for module, section in cp.items(): + if not module.startswith('filters.'): + continue + module = module.partition('.')[-1] + assert all(x.isidentifier() for x in module.split('.')) + filters[module] = () # instantiate ModuleFilters and FilterSet's + _read_filters(section, filters[module]) + + # Update settings dictionary. 
+ settings.clear() + settings.update(new_settings) From f5359786779a7297f75e34fe7139ab1a208b19ce Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Fri, 29 Jul 2022 20:41:21 -0300 Subject: [PATCH 047/109] rename method in tests --- dill/tests/test_filtering.py | 2 +- dill/tests/test_session.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dill/tests/test_filtering.py b/dill/tests/test_filtering.py index c51e1549..172a0de5 100644 --- a/dill/tests/test_filtering.py +++ b/dill/tests/test_filtering.py @@ -25,7 +25,7 @@ def did_exclude(namespace, rules, excluded_subset): rules = FilterRules(rules) - filtered = rules.filter_vars(namespace) + filtered = rules.apply_filters(namespace) return set(namespace).difference(filtered) == excluded_subset def test_basic_filtering(): diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index f2a026ec..f671e0bc 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -292,7 +292,7 @@ def test_ipython_filter(): user_vars = set(user_ns_actual) def namespace_matches(keep_history, should_keep_vars): rules = FilterRules([(EXCLUDE, ipython_filter(keep_history=keep_history))]) - return set(rules.filter_vars(user_ns)) == user_vars | should_keep_vars + return set(rules.apply_filters(user_ns)) == user_vars | should_keep_vars assert namespace_matches(keep_history='input', should_keep_vars={'_i1'}) assert namespace_matches(keep_history='output', should_keep_vars={'_1'}) assert namespace_matches(keep_history='both', should_keep_vars={'_i1', '_1'}) From e5da1c8373e083fc64bcea7aaa83c040273e4a97 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 30 Jul 2022 09:39:13 -0300 Subject: [PATCH 048/109] deteail the effects of 'module' argument in load_module() and rename 'main' to 'module' on doctstrings --- dill/session.py | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/dill/session.py b/dill/session.py index 92861bbe..52fd0359 100644 
--- a/dill/session.py +++ b/dill/session.py @@ -349,13 +349,13 @@ def _identify_module(file, main=None): raise ValueError return module_name except StopIteration: - raise UnpicklingError("reached STOP without finding main module") from None + raise UnpicklingError("reached STOP without finding module") from None except (NotImplementedError, ValueError) as error: # ValueError occours when the end of the chunk is reached (without a STOP). if isinstance(error, NotImplementedError) and main is not None: # file is not peekable, but we have main. return None - raise UnpicklingError("unable to identify main module") from error + raise UnpicklingError("unable to identify module") from error def is_pickled_module(filename, importable: bool = True) -> bool: """Check if a file is a module state pickle file. @@ -393,10 +393,39 @@ def load_module( :py:class:`~types.ModuleType`). When restoring the state of a non-importable module-type object, the - current instance of this module may be passed as the argument ``main``. + current instance of this module may be passed as the argument ``module``. Otherwise, a new instance is created with :py:class:`~types.ModuleType` and returned. + Passing a `module` argument forces dill to verify that the module being + loaded is compatible with the argument value. Additionally, if the argument + is a module (instead of a module name), it supresses the return value. 
+ + This call loads ``math`` and returns it at the end: + + >>> import dill + >>> # load module -> restore state -> return module + >>> dill.load_module('math_session.pkl') + + + Passing the module name does the same as above, but also verifies that the + module loaded, restored and returned is indeed ``math``: + + >>> import dill + >>> # load module -> check name/kind -> restore state -> return module + >>> dill.load_module('math_session.pkl', module='math') + + >>> dill.load_module('math_session.pkl', module='cmath') + ValueError: can't update module 'cmath' with the saved state of module 'math' + + Passing the module itself instead of its name have the additional effect of + supressing the return value (and the module is already loaded at this point): + + >>> import dill + >>> import math + >>> # check name/kind -> restore state -> return None + >>> dill.load_module('math_session.pkl', module=math) + Parameters: filename: a path-like object or a readable stream. module: a module object or the name of an importable module; @@ -406,12 +435,12 @@ def load_module( Raises: :py:exc:`UnpicklingError`: if unpickling fails. - :py:exc:`ValueError`: if the argument ``main`` and module saved + :py:exc:`ValueError`: if the argument ``module`` and module saved at ``filename`` are incompatible. Returns: A module object, if the saved module is not :py:mod:`__main__` or - a module instance wasn't provided with the argument ``main``. + a module instance wasn't provided with the argument ``module``. 
Examples: From 2e4887c3d6e90fcd3eb2fa48619c93580dcf3337 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 30 Jul 2022 10:12:00 -0300 Subject: [PATCH 049/109] Better describe the side effects and the usage of the returned value from load_module() --- dill/session.py | 85 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 30 deletions(-) diff --git a/dill/session.py b/dill/session.py index 52fd0359..7d16b0ae 100644 --- a/dill/session.py +++ b/dill/session.py @@ -268,7 +268,7 @@ def dump_module( Note: Currently, ``dill.settings['byref']`` and ``dill.settings['recurse']`` - don't apply to this function.` + don't apply to this function. """ for old_par, par in [('main', 'module'), ('byref', 'refimported')]: if old_par in kwds: @@ -397,35 +397,6 @@ def load_module( Otherwise, a new instance is created with :py:class:`~types.ModuleType` and returned. - Passing a `module` argument forces dill to verify that the module being - loaded is compatible with the argument value. Additionally, if the argument - is a module (instead of a module name), it supresses the return value. 
- 
- This call loads ``math`` and returns it at the end: 
- 
- >>> import dill 
- >>> # load module -> restore state -> return module 
- >>> dill.load_module('math_session.pkl') 
- 
- 
- Passing the module name does the same as above, but also verifies that the 
- module loaded, restored and returned is indeed ``math``: 
- 
- >>> import dill 
- >>> # load module -> check name/kind -> restore state -> return module 
- >>> dill.load_module('math_session.pkl', module='math') 
- 
- >>> dill.load_module('math_session.pkl', module='cmath') 
- ValueError: can't update module 'cmath' with the saved state of module 'math' 
- 
- Passing the module itself instead of its name have the additional effect of 
- supressing the return value (and the module is already loaded at this point): 
- 
- >>> import dill 
- >>> import math 
- >>> # check name/kind -> restore state -> return None 
- >>> dill.load_module('math_session.pkl', module=math) 
- 
 Parameters: 
 filename: a path-like object or a readable stream. 
 module: a module object or the name of an importable module; 
@@ -442,6 +413,60 @@ def load_module( 
 A module object, if the saved module is not :py:mod:`__main__` or 
 a module instance wasn't provided with the argument ``module``. 
 
+ Passing an argument to ``module`` forces `dill` to verify that the module 
+ being loaded is compatible with the argument value. Additionally, if the 
+ argument is a module (instead of a module name), it suppresses the return 
+ value. Each case and behavior is exemplified below: 
+ 
+ 1. `module`: ``None`` --- This call loads a previously saved state of 
+ the module ``math`` and returns this at the end: 
+ 
+ >>> import dill 
+ >>> # load module -> restore state -> return module 
+ >>> dill.load_module('math_session.pkl') 
+ 
+ 
+ 2. 
`module`: ``str`` --- Passing the module name does the same as above, 
+ but also verifies that the module loaded, restored and returned is 
+ indeed ``math``: 
+ 
+ >>> import dill 
+ >>> # load module -> check name/kind -> restore state -> return module 
+ >>> dill.load_module('math_session.pkl', module='math') 
+ 
+ >>> dill.load_module('math_session.pkl', module='cmath') 
+ ValueError: can't update module 'cmath' with the saved state of module 'math' 
+ 
+ 3. `module`: ``ModuleType`` --- Passing the module itself instead of its 
+ name has the additional effect of suppressing the return value (and the 
+ module is already loaded at this point): 
+ 
+ >>> import dill 
+ >>> import math 
+ >>> # check name/kind -> restore state -> return None 
+ >>> dill.load_module('math_session.pkl', module=math) 
+ 
+ For imported modules, the return value is meant as a convenience, so that 
+ the function call can substitute an ``import`` statement. Therefore these 
+ statements: 
+ 
+ >>> import dill 
+ >>> math2 = dill.load_module('math_session.pkl', module='math') 
+ 
+ are equivalent to these: 
+ 
+ >>> import dill 
+ >>> import math as math2 
+ >>> dill.load_module('math_session.pkl', module=math2) 
+ 
+ Note that, in both cases, ``math2`` is just a reference to 
+ ``sys.modules['math']``: 
+ 
+ >>> import math 
+ >>> import sys 
+ >>> math is math2 is sys.modules['math'] 
+ True 
+ 
 Examples: 
 - Save the state of some modules: 

From be319c8baa3f883a0c7d290c0000f6b3b2bfe329 Mon Sep 17 00:00:00 2001
From: Leonardo Gama 
Date: Sat, 30 Jul 2022 15:07:02 -0300
Subject: [PATCH 050/109] describe session module behavior and use cases

---
 dill/session.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/dill/session.py b/dill/session.py
index 7d16b0ae..72e8c2c2 100644
--- a/dill/session.py
+++ b/dill/session.py
@@ -8,8 +8,30 @@
 # - https://github.com/uqfoundation/dill/blob/master/LICENSE
 """
 Pickle and restore the intepreter session or a module's state. 
+ 
+The functions :py:func:`dump_module`, :py:func:`load_module` and 
+:py:func:`load_module_asdict` are capable of saving and restoring, as long as 
+objects are pickleable, the complete state of a module. For imported modules 
+that are pickled, `dill` assumes that they are importable when unpickling. 
+ 
+Contrary to using :py:func:`dill.dump` and :py:func:`dill.load` to save and load 
+a module object, :py:func:`dill.dump_module` always tries to pickle the module by 
+value (including built-in modules). Also, options like 
+``dill.settings['byref']`` and ``dill.settings['recurse']`` don't affect its 
+behavior. 
+ 
+However, if a module contains references to objects originating from other 
+modules, that would prevent it from pickling or drastically increase its disk 
+size, they can be saved by reference instead of by value using the option 
+``refimported``. 
+ 
+With :py:func:`dump_module`, namespace filters may be used to restrict the list 
+of variables pickled to a subset of those in the module, based on their names or 
+values. Also, using :py:func:`load_module_asdict` allows one to load the 
+variables from different saved states of the same module into dictionaries. 
""" + __all__ = [ 'dump_module', 'load_module', 'load_module_asdict', 'is_pickled_module', 'dump_session', 'load_session' # backward compatibility From a9ea8830fa5d567fe69cf574b1cf7a41b8a134b7 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 1 Aug 2022 19:38:37 -0300 Subject: [PATCH 051/109] add Python License and copyright notice for modified code as specified by the licensce --- LICENSE | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++ dill/_dill.py | 9 +++++++-- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/LICENSE b/LICENSE index 6e9cde5a..a76105cd 100644 --- a/LICENSE +++ b/LICENSE @@ -33,3 +33,56 @@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +--------------------------------------------------------------------------- + +Copyright (c) 2001-2022 Python Software Foundation. +All Rights Reserved. + +PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 + +1. This LICENSE AGREEMENT is between the Python Software Foundation +("PSF"), and the Individual or Organization ("Licensee") accessing and +otherwise using this software ("Python") in source or binary form and +its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, PSF +hereby grants Licensee a nonexclusive, royalty-free, world-wide +license to reproduce, analyze, test, perform and/or display publicly, +prepare derivative works, distribute, and otherwise use Python alone +or in any derivative version, provided, however, that PSF's License +Agreement and PSF's notice of copyright, i.e., "Copyright (c) 2001, +2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, +2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022 Python +Software Foundation; All Rights Reserved" are retained in Python alone +or in any derivative version prepared by Licensee. + +3. 
In the event Licensee prepares a derivative work that is based on +or incorporates Python or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python. + +4. PSF is making Python available to Licensee on an "AS IS" +basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. Nothing in this License Agreement shall be deemed to create any +relationship of agency, partnership, or joint venture between PSF and +Licensee. This License Agreement does not grant permission to use PSF +trademarks or trade name in a trademark sense to endorse or promote +products or services of Licensee, or any third party. + +8. By copying, installing or otherwise using Python, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. diff --git a/dill/_dill.py b/dill/_dill.py index 9e2568e3..cafa0d94 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -460,9 +460,14 @@ def _save_module_dict(self, obj): """ Use object name in the module namespace as a last resource to try to save it by reference when pickling fails. - - Modified from Pickler.save_dict() and Pickler._batch_setitems(). """ + # Modified from Python Standard Library's pickle._Pickler.save_dict() + # and pickle._Pickler._batch_setitems(). 
+ # + # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved + # Changes summary: use SETITEM for all pickle protocols and + # conditionally pass an extra argument to a custom implementation of + # the method 'save'. if not self._refonfail: super().save_dict(obj) return From b7224313604ee2b3fca8a80479b7219c7ca8c3ec Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 2 Aug 2022 13:52:36 -0300 Subject: [PATCH 052/109] revert addition of PSF license; add link to license --- LICENSE | 53 --------------------------------------------------- dill/_dill.py | 3 ++- 2 files changed, 2 insertions(+), 54 deletions(-) diff --git a/LICENSE b/LICENSE index a76105cd..6e9cde5a 100644 --- a/LICENSE +++ b/LICENSE @@ -33,56 +33,3 @@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ---------------------------------------------------------------------------- - -Copyright (c) 2001-2022 Python Software Foundation. -All Rights Reserved. - -PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 - -1. This LICENSE AGREEMENT is between the Python Software Foundation -("PSF"), and the Individual or Organization ("Licensee") accessing and -otherwise using this software ("Python") in source or binary form and -its associated documentation. - -2. 
Subject to the terms and conditions of this License Agreement, PSF -hereby grants Licensee a nonexclusive, royalty-free, world-wide -license to reproduce, analyze, test, perform and/or display publicly, -prepare derivative works, distribute, and otherwise use Python alone -or in any derivative version, provided, however, that PSF's License -Agreement and PSF's notice of copyright, i.e., "Copyright (c) 2001, -2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, -2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022 Python -Software Foundation; All Rights Reserved" are retained in Python alone -or in any derivative version prepared by Licensee. - -3. In the event Licensee prepares a derivative work that is based on -or incorporates Python or any part thereof, and wants to make -the derivative work available to others as provided herein, then -Licensee hereby agrees to include in any such work a brief summary of -the changes made to Python. - -4. PSF is making Python available to Licensee on an "AS IS" -basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR -IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND -DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS -FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT -INFRINGE ANY THIRD PARTY RIGHTS. - -5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON -FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS -A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, -OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. - -6. This License Agreement will automatically terminate upon a material -breach of its terms and conditions. - -7. Nothing in this License Agreement shall be deemed to create any -relationship of agency, partnership, or joint venture between PSF and -Licensee. 
This License Agreement does not grant permission to use PSF -trademarks or trade name in a trademark sense to endorse or promote -products or services of Licensee, or any third party. - -8. By copying, installing or otherwise using Python, Licensee -agrees to be bound by the terms and conditions of this License -Agreement. diff --git a/dill/_dill.py b/dill/_dill.py index cafa0d94..564d7fe6 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -465,7 +465,8 @@ def _save_module_dict(self, obj): # and pickle._Pickler._batch_setitems(). # # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved - # Changes summary: use SETITEM for all pickle protocols and + # License Agreement: https://opensource.org/licenses/Python-2.0 + # Summary of changes: use SETITEM for all pickle protocols and # conditionally pass an extra argument to a custom implementation of # the method 'save'. if not self._refonfail: From 2a7e984873e1ffb4f8802d3e4885881d20c88ab5 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Wed, 3 Aug 2022 16:31:00 -0300 Subject: [PATCH 053/109] _open(): cover all the possible file opening modes --- dill/session.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dill/session.py b/dill/session.py index 72e8c2c2..fb8de0af 100644 --- a/dill/session.py +++ b/dill/session.py @@ -84,11 +84,13 @@ def peek(self, n): def _open(file, mode, *, peekable=False): """return a context manager with an opened file-like object""" import io - attr = 'write' if 'w' in mode else 'read' - was_open = hasattr(file, attr) + readonly = ('r' in mode and '+' not in mode) + if peekable and not readonly: + raise ValueError("the 'peekable' option is invalid for writable files") + was_open = hasattr(file, 'read' if readonly else 'write') if not was_open: file = open(file, mode) - if attr == 'read' and peekable and not hasattr(file, 'peek'): + if readonly and peekable and not hasattr(file, 'peek'): # Try our best to return the stream as an object with a peek() 
method. if hasattr(file, 'tell') and hasattr(file, 'seek'): file = _PeekableReader(file) From fa4fa85c7a00e03b3972d53f3f6a0ef3419117bd Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Wed, 3 Aug 2022 16:31:22 -0300 Subject: [PATCH 054/109] grammar --- dill/session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dill/session.py b/dill/session.py index fb8de0af..c8acbe41 100644 --- a/dill/session.py +++ b/dill/session.py @@ -229,7 +229,7 @@ def dump_module( be saved by reference. If this also fails, saving their parent objects by reference will be attempted recursively. In the worst case scenario, the module itself may be saved by reference. Note: - The file-like object must be seekable and truncable with this + The file-like object must be seekable and truncatable with this option set. **kwds: extra keyword arguments passed to :py:class:`Pickler()`. From 92318a7314bbd0bbed2fdc98398954c456387ad3 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Wed, 3 Aug 2022 17:32:42 -0300 Subject: [PATCH 055/109] better document Pickler.save() and Pickler._save_module_dict() --- dill/_dill.py | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 564d7fe6..0e7a1b20 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -419,7 +419,15 @@ def save_numpy_array(pickler, obj): dump.__doc__ = StockPickler.dump.__doc__ def save(self, obj, save_persistent_id=True, *, name=None): - """If self._refonfail is True, try to save object by reference if pickling fails.""" + # This method overrides StockPickler.save() and is called for every + # object pickled. When 'refonfail' is True, it tries to save the object + # by reference if pickling it fails with a common pickling error, as + # defined by the constant UNPICKLEABLE_ERRORS. 
If that also fails, then 
+ the exception is raised and, if this was called indirectly from another 
+ Pickler.save() call, the parent objects will try to be saved by 
+ reference recursively, until it succeeds or the exception propagates 
+ beyond the topmost save() call. The extra 'name' argument is passed 
+ to StockPickler.save_global(). 
 if not self._refonfail: 
 super().save(obj, save_persistent_id) 
 return 
@@ -430,7 +438,7 @@ def save(self, obj, save_persistent_id=True, *, name=None): 
 memo_size = len(self.memo) 
 try: 
 super().save(obj, save_persistent_id) 
- except UNPICKLEABLE_ERRORS + (AttributeError,) as error_stack: 
+ except (AttributeError, *UNPICKLEABLE_ERRORS) as error_stack: 
 # AttributeError may happen in the save_global() call from a child object. 
 if (type(error_stack) == AttributeError 
 and "no attribute '__name__'" not in error_stack.args[0]): 
@@ -441,6 +449,7 @@ def save(self, obj, save_persistent_id=True, *, name=None): 
 # Roll back memo. 
 for _ in range(len(self.memo) - memo_size): 
 self.memo.popitem() # LIFO order is guaranteed since 3.7 
+ # Try to save object by reference. 
 try: 
 if isinstance(obj, ModuleType) and \ 
 (_is_builtin_module(obj) or obj is sys.modules['dill']): 
 self.save_reduce(_import_module, (obj.__name__,), obj=obj) 
 else: 
 self.save_global(obj, name) 
@@ -457,21 +466,22 @@ def save(self, obj, save_persistent_id=True, *, name=None): 
 type(obj).__name__, id(obj), obj=obj) 
 
 def _save_module_dict(self, obj): 
+ """Save a module's dictionary. 
+ 
+ If an object doesn't have a '__name__' attribute, pass the object's name 
+ in the module's namespace to save(), so that it can be used with 
+ save_global() to increase the chances of finding the object for saving 
+ it by reference in the event of a failed serialization. 
 """ 
- Use object name in the module namespace as a last resource to try to 
- save it by reference when pickling fails. 
- """ 
- # Modified from Python Standard Library's pickle._Pickler.save_dict() 
- # and pickle._Pickler._batch_setitems(). 
- # - # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved - # License Agreement: https://opensource.org/licenses/Python-2.0 - # Summary of changes: use SETITEM for all pickle protocols and - # conditionally pass an extra argument to a custom implementation of - # the method 'save'. if not self._refonfail: super().save_dict(obj) return + # Modified from Python Standard Library's pickle._Pickler.save_dict() + # and pickle._Pickler._batch_setitems(). Summary of changes: use + # 'SETITEM' for all pickle protocols and conditionally pass an extra + # argument to a custom implementation of the method 'save'. + # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved + # License Agreement: https://opensource.org/licenses/Python-2.0 if self.bin: self.write(EMPTY_DICT) else: # proto 0 -- can't use EMPTY_DICT From 0e365f5c91a8ddbc0dd91a5c3fb9da3d41e442eb Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 4 Aug 2022 02:01:25 -0300 Subject: [PATCH 056/109] move session settings to session.py; changes to refonfail --- dill/_dill.py | 41 ++++++++++++++++++++++++++++++----------- dill/session.py | 31 +++++++++++++++++++------------ dill/settings.py | 4 ---- 3 files changed, 49 insertions(+), 27 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 635e9c12..d039be5e 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -241,7 +241,7 @@ def __reduce_ex__(self, protocol): #: Pickles the entire file (handle and contents), preserving mode and position. FILE_FMODE = 2 -# Exceptions commonly raised by unpicklable objects in the Standard Library. +# Exceptions commonly raised by unpickleable objects in the Standard Library. UNPICKLEABLE_ERRORS = (PicklingError, TypeError, ValueError, NotImplementedError) ### Shorthands (modified from python2.5/lib/pickle.py) @@ -438,13 +438,25 @@ def save(self, obj, save_persistent_id=True, *, name=None): # Store initial state. 
position = self._file_tell() memo_size = len(self.memo) + saved_as_global = False try: super().save(obj, save_persistent_id) except (AttributeError, *UNPICKLEABLE_ERRORS) as error_stack: # AttributeError may happen in the save_global() call from a child object. - if (type(error_stack) == AttributeError - and "no attribute '__name__'" not in error_stack.args[0]): + if type(error_stack) == AttributeError \ + and "no attribute '__name__'" not in error_stack.args[0]: raise + if self._session and obj is self._main: + warnings.warn( + "module %r being saved by reference due to unpickleable" + " objects in its namespace" % self._main.__name__, + PicklingWarning, + stacklevel=5, + ) + message = ( + "# X: fallback to save as global: <%s object at %#012x>" + % (type(obj).__name__, id(obj)) + ) # Roll back the stream. self._file_seek(position) self._file_truncate() @@ -452,20 +464,27 @@ def save(self, obj, save_persistent_id=True, *, name=None): for _ in range(len(self.memo) - memo_size): self.memo.popitem() # LIFO order is guaranteed since 3.7 # Try to save object by reference. 
+ if isinstance(obj, ModuleType) and \ + (_is_builtin_module(obj) or obj is sys.modules['dill']): + self.save_reduce(_import_module, (obj.__name__,), obj=obj) + logger.trace(self, message, obj=obj) + return + if self._session: + if name is None and not (hasattr(obj, '__name__') or hasattr(obj, '__qualname__')): + name = self._id_to_name.get(id(obj)) + if name is not None and self._main.__name__ not in {'__main__', '__main_mp__'}: + self.save_reduce(getattr, (self._main, name), obj=obj) + logger.trace(self, message, obj=obj) + return try: - if isinstance(obj, ModuleType) and \ - (_is_builtin_module(obj) or obj is sys.modules['dill']): - self.save_reduce(_import_module, (obj.__name__,), obj=obj) - else: - self.save_global(obj, name) + self.save_global(obj, name) + logger.trace(self, message, obj=obj) except (AttributeError, PicklingError) as error: if getattr(self, '_trace_stack', None) and id(obj) == self._trace_stack[-1]: # Roll back trace state. self._trace_stack.pop() self._size_stack.pop() raise error from error_stack - logger.trace(self, "# X: fallback to save_global: <%s object at %#012x>", - type(obj).__name__, id(obj), obj=obj) def _save_module_dict(self, obj): """Save a module's dictionary. @@ -562,7 +581,7 @@ def use_diff(on=True): Reduces size of pickles by only including object which have changed. Decreases pickle size but increases CPU time needed. - Also helps avoid some unpicklable objects. + Also helps avoid some unpickleable objects. MUST be called at start of script, otherwise changes will not be recorded. 
""" global _use_diff, diff diff --git a/dill/session.py b/dill/session.py index c8acbe41..38dcc626 100644 --- a/dill/session.py +++ b/dill/session.py @@ -57,6 +57,11 @@ TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) +settings = { + 'refimported': False, + 'refonfail' : True, +} + class _PeekableReader: """lightweight readable stream wrapper that implements peek()""" def __init__(self, stream): @@ -225,12 +230,13 @@ def dump_module( similar but independent from ``dill.settings[`byref`]``, as ``refimported`` refers to virtually all imported objects, while ``byref`` only affects select objects. - refonfail: if `True`, objects that fail to pickle by value will try to - be saved by reference. If this also fails, saving their parent - objects by reference will be attempted recursively. In the worst - case scenario, the module itself may be saved by reference. Note: - The file-like object must be seekable and truncatable with this - option set. + refonfail: if `True` (the default), objects that fail to pickle by value + will try to be saved by reference. If this also fails, saving their + parent objects by reference will be attempted recursively. In the + worst case scenario, the module itself may be saved by reference, + with a warning. Note: this option disables framing for pickle + protocol >= 4. Turning this off may improve unpickling speed, but + may cause a module to fail pickling. **kwds: extra keyword arguments passed to :py:class:`Pickler()`. 
Raises: @@ -305,12 +311,12 @@ def dump_module( refimported = kwds.pop('byref', refimported) module = kwds.pop('main', module) - from .settings import settings - protocol = settings['protocol'] + from .settings import settings as dill_settings + protocol = dill_settings['protocol'] if refimported is None: - refimported = settings['dump_module']['refimported'] + refimported = settings['refimported'] if refonfail is None: - refonfail = settings['dump_module']['refonfail'] + refonfail = settings['refonfail'] main = module if main is None: main = _main_module @@ -339,6 +345,7 @@ def dump_module( if pickler._file_seek is None or pickler._file_truncate is None: raise TypeError("file must have 'tell', 'seek' and 'truncate'" " attributes if the 'refonfail' option is set.") + pickler._id_to_name = {id(v): k for k, v in main.__dict__.items()} pickler.dump(main) return @@ -367,8 +374,8 @@ def _identify_module(file, main=None): module_name = arg if not ( next(opcodes)[0] in ('TUPLE1', 'TUPLE') and - next(opcodes)[0] == 'REDUCE' and - next(opcodes)[0] in ('EMPTY_DICT', 'DICT') + next(opcodes)[0] == 'REDUCE' #and + #next(opcodes)[0] in ('EMPTY_DICT', 'DICT') ): raise ValueError return module_name diff --git a/dill/settings.py b/dill/settings.py index df1d30a4..140bfb5d 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -21,10 +21,6 @@ 'fmode' : 0, #HANDLE_FMODE 'recurse' : False, 'ignore' : False, - 'dump_module' : { - 'refimported': False, - 'refonfail' : False, - }, } del DEFAULT_PROTOCOL From 9c54e34c52b78db71c3b67dab6da6a5a523241aa Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 4 Aug 2022 19:06:25 -0300 Subject: [PATCH 057/109] add _TruncatableWriter to handle 'refonfail' with non-seekable streams --- dill/session.py | 73 +++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/dill/session.py b/dill/session.py index 38dcc626..cb77a40d 100644 --- a/dill/session.py +++ b/dill/session.py @@ -37,10 +37,11 @@ 
'dump_session', 'load_session' # backward compatibility ] -import contextlib +import io import re import sys import warnings +from contextlib import AbstractContextManager, nullcontext, suppress from dill import _dill, Pickler, Unpickler, UnpicklingError from ._dill import ( @@ -62,10 +63,14 @@ 'refonfail' : True, } -class _PeekableReader: +class _PeekableReader(AbstractContextManager): """lightweight readable stream wrapper that implements peek()""" - def __init__(self, stream): + def __init__(self, stream, closing=True): self.stream = stream + self.closing = closing + def __exit__(self, *exc_info): + if self.closing: + self.stream.close() def read(self, n): return self.stream.read(n) def readline(self): @@ -86,31 +91,52 @@ def peek(self, n): except (AttributeError, OSError): raise NotImplementedError("stream is not peekable: %r", stream) from None -def _open(file, mode, *, peekable=False): +class _TruncatableWriter(io.BytesIO, AbstractContextManager): + """works as an unlimited buffer, writes to file on close""" + def __init__(self, stream, closing=True, *args, **kwds): + super().__init__(*args, **kwds) + self.stream = stream + self.closing = closing + def __exit__(self, *exc_info): + self.close() + def close(self): + self.stream.write(self.getvalue()) + with suppress(AttributeError): + self.stream.flush() + super().close() + if self.closing: + self.stream.close() + +def _open(file, mode, *, peekable=False, truncatable=False): """return a context manager with an opened file-like object""" - import io readonly = ('r' in mode and '+' not in mode) - if peekable and not readonly: + if not readonly and peekable: raise ValueError("the 'peekable' option is invalid for writable files") - was_open = hasattr(file, 'read' if readonly else 'write') - if not was_open: + if readonly and truncatable: + raise ValueError("the 'truncatable' option is invalid for read-only files") + should_close = not hasattr(file, 'read' if readonly else 'write') + if should_close: file = 
open(file, mode) - if readonly and peekable and not hasattr(file, 'peek'): - # Try our best to return the stream as an object with a peek() method. + # Wrap stream in a helper class if necessary. + if peekable and not hasattr(file, 'peek'): + # Try our best to return it as an object with a peek() method. if hasattr(file, 'tell') and hasattr(file, 'seek'): - file = _PeekableReader(file) + file = _PeekableReader(file, closing=should_close) else: try: file = io.BufferedReader(file) except Exception: - # Stream won't be peekable, but will fail gracefully in _identify_module(). - file = _PeekableReader(file) - if was_open: # should not close at exit - return contextlib.nullcontext(file) - elif type(file) == _PeekableReader: - return contextlib.closing(file) - else: + # It won't be peekable, but will fail gracefully in _identify_module(). + file = _PeekableReader(file, closing=should_close) + elif truncatable and ( + not hasattr(file, 'truncate') + or (hasattr(file, 'seekable') and not file.seekable()) + ): + file = _TruncatableWriter(file, closing=should_close) + if should_close or isinstance(file, (_PeekableReader, _TruncatableWriter)): return file + else: + return nullcontext(file) def _module_map(): """get map of imported modules""" @@ -327,7 +353,7 @@ def dump_module( original_main = main if refimported: main = _stash_modules(main) - with _open(filename, 'wb') as file: + with _open(filename, 'wb', truncatable=True) as file: pickler = Pickler(file, protocol, **kwds) if main is not original_main: pickler._original_main = original_main @@ -338,13 +364,8 @@ def dump_module( pickler._first_pass = True if refonfail: pickler._refonfail = True # False by default - pickler._file_seek = getattr(file, 'seek', None) - pickler._file_truncate = getattr(file, 'truncate', None) - if hasattr(file, 'seekable') and not file.seekable(): - pickler._file_seek = None - if pickler._file_seek is None or pickler._file_truncate is None: - raise TypeError("file must have 'tell', 'seek' and 
'truncate'" - " attributes if the 'refonfail' option is set.") + pickler._file_seek = file.seek + pickler._file_truncate = file.truncate pickler._id_to_name = {id(v): k for k, v in main.__dict__.items()} pickler.dump(main) return From ffdd180476b53a9b2e3244c2a781ab67acf7c010 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 4 Aug 2022 19:30:50 -0300 Subject: [PATCH 058/109] update 'refonfail' example --- dill/_dill.py | 1 - dill/session.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index d039be5e..0ab1ec81 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -438,7 +438,6 @@ def save(self, obj, save_persistent_id=True, *, name=None): # Store initial state. position = self._file_tell() memo_size = len(self.memo) - saved_as_global = False try: super().save(obj, save_persistent_id) except (AttributeError, *UNPICKLEABLE_ERRORS) as error_stack: diff --git a/dill/session.py b/dill/session.py index cb77a40d..a63bede0 100644 --- a/dill/session.py +++ b/dill/session.py @@ -298,9 +298,9 @@ def dump_module( >>> import dill >>> import os >>> os.altsep = '\\' - >>> dill.dump_module('os_session.pkl', module=os) + >>> dill.dump_module('os_session.pkl', module=os, refonfail=False) PicklingError: ... - >>> dill.dump_module('os_session.pkl', module=os, refonfail=True) + >>> dill.dump_module('os_session.pkl', module=os, refonfail=True) # the default - Restore the state of the saved modules: @@ -410,7 +410,7 @@ def _identify_module(file, main=None): raise UnpicklingError("unable to identify module") from error def is_pickled_module(filename, importable: bool = True) -> bool: - """Check if a file is a module state pickle file. + """Check if a file is a pickle file readable by :py:func:`load_module`. Parameters: filename: a path-like object or a readable stream. 
From e5006f710de161dedd35de0efd242b7725ea8205 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 4 Aug 2022 23:13:38 -0300 Subject: [PATCH 059/109] Merge branch 'document-session' into session-excludes --- dill/_dill.py | 344 +++++++++++++++++++++++------------- dill/_objects.py | 4 +- dill/logger.py | 31 ++-- dill/session.py | 257 +++++++++++++++++++++------ dill/source.py | 46 +++-- dill/tests/test_classdef.py | 9 +- dill/tests/test_selected.py | 19 ++ dill/tests/test_weakref.py | 47 ++--- docs/source/conf.py | 11 +- setup.py | 2 +- 10 files changed, 516 insertions(+), 254 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index c5335da4..35cfb6fa 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -39,6 +39,7 @@ #XXX: get types from .objtypes ? import builtins as __builtin__ from pickle import _Pickler as StockPickler, Unpickler as StockUnpickler +from pickle import DICT, EMPTY_DICT, GLOBAL, MARK, SETITEM from _thread import LockType from _thread import RLock as RLockType #from io import IOBase @@ -58,6 +59,7 @@ import marshal import gc # import zlib +import dataclasses from weakref import ReferenceType, ProxyType, CallableProxyType from collections import OrderedDict from functools import partial @@ -91,51 +93,24 @@ def __hook__(): from numpy import dtype as NumpyDType return True if NumpyArrayType: # then has numpy - def ndarraysubclassinstance(obj): - if type(obj) in (TypeType, ClassType): - return False # all classes return False - try: # check if is ndarray, and elif is subclass of ndarray - cls = getattr(obj, '__class__', None) - if cls is None: return False - elif cls is TypeType: return False - elif 'numpy.ndarray' not in str(getattr(cls, 'mro', int.mro)()): - return False - except ReferenceError: return False # handle 'R3' weakref in 3.x - except TypeError: return False + def ndarraysubclassinstance(obj_type): + if all((c.__module__, c.__name__) != ('numpy', 'ndarray') for c in obj_type.__mro__): + return False # anything below here is a numpy 
array (or subclass) instance __hook__() # import numpy (so the following works!!!) # verify that __reduce__ has not been overridden - NumpyInstance = NumpyArrayType((0,),'int8') - if id(obj.__reduce_ex__) == id(NumpyInstance.__reduce_ex__) and \ - id(obj.__reduce__) == id(NumpyInstance.__reduce__): return True - return False - def numpyufunc(obj): - if type(obj) in (TypeType, ClassType): - return False # all classes return False - try: # check if is ufunc - cls = getattr(obj, '__class__', None) - if cls is None: return False - elif cls is TypeType: return False - if 'numpy.ufunc' not in str(getattr(cls, 'mro', int.mro)()): - return False - except ReferenceError: return False # handle 'R3' weakref in 3.x - except TypeError: return False - # anything below here is a numpy ufunc + if obj_type.__reduce_ex__ is not NumpyArrayType.__reduce_ex__ \ + or obj_type.__reduce__ is not NumpyArrayType.__reduce__: + return False return True - def numpydtype(obj): - if type(obj) in (TypeType, ClassType): - return False # all classes return False - try: # check if is dtype - cls = getattr(obj, '__class__', None) - if cls is None: return False - elif cls is TypeType: return False - if 'numpy.dtype' not in str(getattr(obj, 'mro', int.mro)()): - return False - except ReferenceError: return False # handle 'R3' weakref in 3.x - except TypeError: return False + def numpyufunc(obj_type): + return any((c.__module__, c.__name__) == ('numpy', 'ufunc') for c in obj_type.__mro__) + def numpydtype(obj_type): + if all((c.__module__, c.__name__) != ('numpy', 'dtype') for c in obj_type.__mro__): + return False # anything below here is a numpy dtype __hook__() # import numpy (so the following works!!!) 
- return type(obj) is type(NumpyDType) # handles subclasses + return obj_type is type(NumpyDType) # handles subclasses else: def ndarraysubclassinstance(obj): return False def numpyufunc(obj): return False @@ -196,9 +171,7 @@ def get_file_type(*args, **kwargs): singletontypes = [] import inspect -import dataclasses - -from pickle import GLOBAL +import typing ### Shims for different versions of Python and dill @@ -238,6 +211,9 @@ def __reduce_ex__(self, protocol): #: Pickles the entire file (handle and contents), preserving mode and position. FILE_FMODE = 2 +# Exceptions commonly raised by unpickleable objects in the Standard Library. +UNPICKLEABLE_ERRORS = (PicklingError, TypeError, ValueError, NotImplementedError) + ### Shorthands (modified from python2.5/lib/pickle.py) def copy(obj, *args, **kwds): """ @@ -357,6 +333,8 @@ def _getopt(settings, key, arg=None, *, kwds=None): class Pickler(StockPickler): """python's Pickler extended to interpreter sessions""" dispatch = MetaCatchingDict(StockPickler.dispatch.copy()) + _refimported = False + _refonfail = False _session = False _first_pass = False _original_main = None @@ -371,57 +349,158 @@ def __init__(self, file, *args, **kwds): self._recurse = _getopt(settings, 'recurse', kwds=kwds) self._strictio = False #_getopt(settings, 'strictio', kwds=kwds) self._postproc = OrderedDict() - self._file = file - StockPickler.__init__(self, file, *args, **kwds) + self._file_tell = getattr(file, 'tell', None) # for logger and refonfail + super().__init__(file, *args, **kwds) - def dump(self, obj): #NOTE: if settings change, need to update attributes + def save(self, obj, save_persistent_id=True): # register if the object is a numpy ufunc # thanks to Paul Kienzle for pointing out ufuncs didn't pickle - if NumpyUfuncType and numpyufunc(obj): - @register(type(obj)) - def save_numpy_ufunc(pickler, obj): - logger.trace(pickler, "Nu: %s", obj) - name = getattr(obj, '__qualname__', getattr(obj, '__name__', None)) - 
StockPickler.save_global(pickler, obj, name=name) - logger.trace(pickler, "# Nu") - return - # NOTE: the above 'save' performs like: - # import copy_reg - # def udump(f): return f.__name__ - # def uload(name): return getattr(numpy, name) - # copy_reg.pickle(NumpyUfuncType, udump, uload) - # register if the object is a numpy dtype - if NumpyDType and numpydtype(obj): - @register(type(obj)) - def save_numpy_dtype(pickler, obj): - logger.trace(pickler, "Dt: %s", obj) - pickler.save_reduce(_create_dtypemeta, (obj.type,), obj=obj) - logger.trace(pickler, "# Dt") - return - # NOTE: the above 'save' performs like: - # import copy_reg - # def uload(name): return type(NumpyDType(name)) - # def udump(f): return uload, (f.type,) - # copy_reg.pickle(NumpyDTypeType, udump, uload) - # register if the object is a subclassed numpy array instance - if NumpyArrayType and ndarraysubclassinstance(obj): - @register(type(obj)) - def save_numpy_array(pickler, obj): - logger.trace(pickler, "Nu: (%s, %s)", obj.shape, obj.dtype) - npdict = getattr(obj, '__dict__', None) - f, args, state = obj.__reduce__() - pickler.save_reduce(_create_array, (f,args,state,npdict), obj=obj) - logger.trace(pickler, "# Nu") - return + obj_type = type(obj) + if NumpyArrayType and not (obj_type is type or obj_type in Pickler.dispatch): + if NumpyUfuncType and numpyufunc(obj_type): + @register(obj_type) + def save_numpy_ufunc(pickler, obj): + logger.trace(pickler, "Nu: %s", obj) + name = getattr(obj, '__qualname__', getattr(obj, '__name__', None)) + StockPickler.save_global(pickler, obj, name=name) + logger.trace(pickler, "# Nu") + return + # NOTE: the above 'save' performs like: + # import copy_reg + # def udump(f): return f.__name__ + # def uload(name): return getattr(numpy, name) + # copy_reg.pickle(NumpyUfuncType, udump, uload) + # register if the object is a numpy dtype + if NumpyDType and numpydtype(obj_type): + @register(obj_type) + def save_numpy_dtype(pickler, obj): + logger.trace(pickler, "Dt: %s", obj) 
+ pickler.save_reduce(_create_dtypemeta, (obj.type,), obj=obj) + logger.trace(pickler, "# Dt") + return + # NOTE: the above 'save' performs like: + # import copy_reg + # def uload(name): return type(NumpyDType(name)) + # def udump(f): return uload, (f.type,) + # copy_reg.pickle(NumpyDTypeType, udump, uload) + # register if the object is a subclassed numpy array instance + if NumpyArrayType and ndarraysubclassinstance(obj_type): + @register(obj_type) + def save_numpy_array(pickler, obj): + logger.trace(pickler, "Nu: (%s, %s)", obj.shape, obj.dtype, obj=obj) + npdict = getattr(obj, '__dict__', None) + f, args, state = obj.__reduce__() + pickler.save_reduce(_create_array, (f,args,state,npdict), obj=obj) + logger.trace(pickler, "# Nu") + return # end hack if GENERATOR_FAIL and type(obj) == GeneratorType: msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType raise PicklingError(msg) + StockPickler.save(self, obj, save_persistent_id) + + save.__doc__ = StockPickler.save.__doc__ + + def dump(self, obj): #NOTE: if settings change, need to update attributes logger.trace_setup(self) StockPickler.dump(self, obj) - dump.__doc__ = StockPickler.dump.__doc__ + def save(self, obj, save_persistent_id=True, *, name=None): + # This method overrides StockPickler.save() and is called for every + # object pickled. When 'refonfail' is True, it tries to save the object + # by reference if pickling it fails with a common pickling error, as + # defined by the constant UNPICKLEABLE_ERRORS. If that also fails, then + # the exception is risen and, if this was called indirectly from another + # Pickler.save() call, the parent objects will try to be saved by + # reference recursively, until it succeeds or the exception propagates + # beyond the topmost save() call. The extra 'name' argument is passed + # to StockPickler.save_global(). 
+ if not self._refonfail: + super().save(obj, save_persistent_id) + return + # Disable framing (right after the framer.init_framing() call at dump()). + self.framer.current_frame = None + # Store initial state. + position = self._file_tell() + memo_size = len(self.memo) + try: + super().save(obj, save_persistent_id) + except (AttributeError, *UNPICKLEABLE_ERRORS) as error_stack: + # AttributeError may happen in the save_global() call from a child object. + if type(error_stack) == AttributeError \ + and "no attribute '__name__'" not in error_stack.args[0]: + raise + if self._session and obj is self._main: + warnings.warn( + "module %r being saved by reference due to unpickleable" + " objects in its namespace" % self._main.__name__, + PicklingWarning, + stacklevel=5, + ) + message = ( + "# X: fallback to save as global: <%s object at %#012x>" + % (type(obj).__name__, id(obj)) + ) + # Roll back the stream. + self._file_seek(position) + self._file_truncate() + # Roll back memo. + for _ in range(len(self.memo) - memo_size): + self.memo.popitem() # LIFO order is guaranteed since 3.7 + # Try to save object by reference. + if isinstance(obj, ModuleType) and \ + (_is_builtin_module(obj) or obj is sys.modules['dill']): + self.save_reduce(_import_module, (obj.__name__,), obj=obj) + logger.trace(self, message, obj=obj) + return + if self._session: + if name is None and not (hasattr(obj, '__name__') or hasattr(obj, '__qualname__')): + name = self._id_to_name.get(id(obj)) + if name is not None and self._main.__name__ not in {'__main__', '__main_mp__'}: + self.save_reduce(getattr, (self._main, name), obj=obj) + logger.trace(self, message, obj=obj) + return + try: + self.save_global(obj, name) + logger.trace(self, message, obj=obj) + except (AttributeError, PicklingError) as error: + if getattr(self, '_trace_stack', None) and id(obj) == self._trace_stack[-1]: + # Roll back trace state. 
+ self._trace_stack.pop() + self._size_stack.pop() + raise error from error_stack + + def _save_module_dict(self, obj): + """Save a module's dictionary. + + If an object doesn't have a '__name__' attribute, pass the object's name + in the module's namespace to save(), so that it can be used with + save_global() to increase the chances of finding the object for saving + it by reference in the event of a failed serialization. + """ + if not self._refonfail: + super().save_dict(obj) + return + # Modified from Python Standard Library's pickle._Pickler.save_dict() + # and pickle._Pickler._batch_setitems(). Summary of changes: use + # 'SETITEM' for all pickle protocols and conditionally pass an extra + # argument to a custom implementation of the method 'save'. + # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved + # License Agreement: https://opensource.org/licenses/Python-2.0 + if self.bin: + self.write(EMPTY_DICT) + else: # proto 0 -- can't use EMPTY_DICT + self.write(MARK + DICT) + self.memoize(obj) + for k, v in obj.items(): + self.save(k) + if hasattr(v, '__name__') or hasattr(v, '__qualname__'): + self.save(v) + else: + self.save(v, name=k) + self.write(SETITEM) + class Unpickler(StockUnpickler): """python's Unpickler extended to interpreter sessions and more types""" from .settings import settings @@ -486,7 +565,7 @@ def use_diff(on=True): Reduces size of pickles by only including object which have changed. Decreases pickle size but increases CPU time needed. - Also helps avoid some unpicklable objects. + Also helps avoid some unpickleable objects. MUST be called at start of script, otherwise changes will not be recorded. 
""" global _use_diff, diff @@ -769,6 +848,13 @@ def _create_ftype(ftypeobj, func, args, kwds): args = () return ftypeobj(func, *args, **kwds) +def _create_typing_tuple(argz, *args): #NOTE: workaround python/cpython#94245 + if not argz: + return typing.Tuple[()].copy_with(()) + if argz == ((),): + return typing.Tuple[()] + return typing.Tuple[argz] + def _create_lock(locked, *args): #XXX: ignores 'blocking' from threading import Lock lock = Lock() @@ -1011,13 +1097,6 @@ def _get_attr(self, name): # stop recursive pickling return getattr(self, name, None) or getattr(__builtin__, name) -def _dict_from_dictproxy(dictproxy): - _dict = dictproxy.copy() # convert dictproxy to dict - _dict.pop('__dict__', None) - _dict.pop('__weakref__', None) - _dict.pop('__prepare__', None) - return _dict - def _import_module(import_name, safe=False): try: if import_name.startswith('__runtime__.'): @@ -1189,26 +1268,30 @@ def _repr_dict(obj): @register(dict) def save_module_dict(pickler, obj): - if is_dill(pickler, child=False) and obj == pickler._main.__dict__ and \ + pickler_is_dill = is_dill(pickler, child=False) + if pickler_is_dill and obj == pickler._main.__dict__ and \ not (pickler._session and pickler._first_pass): - logger.trace(pickler, "D1: %s", _repr_dict(obj)) # obj + logger.trace(pickler, "D1: %s", _repr_dict(obj), obj=obj) pickler.write(bytes('c__builtin__\n__main__\n', 'UTF-8')) logger.trace(pickler, "# D1") - elif (not is_dill(pickler, child=False)) and (obj == _main_module.__dict__): - logger.trace(pickler, "D3: %s", _repr_dict(obj)) # obj + elif (not pickler_is_dill) and (obj == _main_module.__dict__): + logger.trace(pickler, "D3: %s", _repr_dict(obj), obj=obj) pickler.write(bytes('c__main__\n__dict__\n', 'UTF-8')) #XXX: works in general? 
logger.trace(pickler, "# D3") elif '__name__' in obj and obj != _main_module.__dict__ \ and type(obj['__name__']) is str \ and obj is getattr(_import_module(obj['__name__'],True), '__dict__', None): - logger.trace(pickler, "D4: %s", _repr_dict(obj)) # obj + logger.trace(pickler, "D4: %s", _repr_dict(obj), obj=obj) pickler.write(bytes('c%s\n__dict__\n' % obj['__name__'], 'UTF-8')) logger.trace(pickler, "# D4") + elif pickler_is_dill and pickler._session and pickler._first_pass: + # we only care about session the first pass thru + pickler._first_pass = False + logger.trace(pickler, "D5: %s", _repr_dict(obj), obj=obj) + pickler._save_module_dict(obj) + logger.trace(pickler, "# D5") else: - logger.trace(pickler, "D2: %s", _repr_dict(obj)) # obj - if is_dill(pickler, child=False) and pickler._session: - # we only care about session the first pass thru - pickler._first_pass = False + logger.trace(pickler, "D2: %s", _repr_dict(obj), obj=obj) StockPickler.save_dict(pickler, obj) logger.trace(pickler, "# D2") return @@ -1284,6 +1367,23 @@ def save_classobj(pickler, obj): #FIXME: enable pickler._byref logger.trace(pickler, "# C2") return +@register(typing._GenericAlias) +def save_generic_alias(pickler, obj): + args = obj.__args__ + if type(obj.__reduce__()) is str: + logger.trace(pickler, "Ga0: %s", obj) + StockPickler.save_global(pickler, obj, name=obj.__reduce__()) + logger.trace(pickler, "# Ga0") + elif obj.__origin__ is tuple and (not args or args == ((),)): + logger.trace(pickler, "Ga1: %s", obj) + pickler.save_reduce(_create_typing_tuple, (args,), obj=obj) + logger.trace(pickler, "# Ga1") + else: + logger.trace(pickler, "Ga2: %s", obj) + StockPickler.save_reduce(pickler, *obj.__reduce__(), obj=obj) + logger.trace(pickler, "# Ga2") + return + @register(LockType) def save_lock(pickler, obj): logger.trace(pickler, "Lo: %s", obj) @@ -1507,7 +1607,7 @@ def save_cell(pickler, obj): if MAPPING_PROXY_TRICK: @register(DictProxyType) def save_dictproxy(pickler, obj): - 
logger.trace(pickler, "Mp: %s", _repr_dict(obj)) # obj + logger.trace(pickler, "Mp: %s", _repr_dict(obj), obj=obj) mapping = obj | _dictproxy_helper_instance pickler.save_reduce(DictProxyType, (mapping,), obj=obj) logger.trace(pickler, "# Mp") @@ -1515,7 +1615,7 @@ def save_dictproxy(pickler, obj): else: @register(DictProxyType) def save_dictproxy(pickler, obj): - logger.trace(pickler, "Mp: %s", _repr_dict(obj)) # obj + logger.trace(pickler, "Mp: %s", _repr_dict(obj), obj=obj) pickler.save_reduce(DictProxyType, (obj.copy(),), obj=obj) logger.trace(pickler, "# Mp") return @@ -1585,18 +1685,11 @@ def save_weakref(pickler, obj): @register(ProxyType) @register(CallableProxyType) def save_weakproxy(pickler, obj): + # Must do string substitution here and use %r to avoid ReferenceError. + logger.trace(pickler, "R2: %r" % obj, obj=obj) refobj = _locate_object(_proxy_helper(obj)) - try: - _t = "R2" - logger.trace(pickler, "%s: %s", _t, obj) - except ReferenceError: - _t = "R3" - logger.trace(pickler, "%s: %s", _t, sys.exc_info()[1]) - #callable = bool(getattr(refobj, '__call__', None)) - if type(obj) is CallableProxyType: callable = True - else: callable = False - pickler.save_reduce(_create_weakproxy, (refobj, callable), obj=obj) - logger.trace(pickler, "# %s", _t) + pickler.save_reduce(_create_weakproxy, (refobj, callable(obj)), obj=obj) + logger.trace(pickler, "# R2") return def _is_builtin_module(module): @@ -1622,7 +1715,7 @@ def save_module(pickler, obj): pass else: logger.trace(pickler, "M2: %s with diff", obj) - logger.trace(pickler, "Diff: %s", changed.keys()) + logger.info("Diff: %s", changed.keys()) pickler.save_reduce(_import_module, (obj.__name__,), obj=obj, state=changed) logger.trace(pickler, "# M2") @@ -1694,20 +1787,19 @@ def save_type(pickler, obj, postproc_list=None): obj_recursive = id(obj) in getattr(pickler, '_postproc', ()) incorrectly_named = not _locate_function(obj, pickler) if not _byref and not obj_recursive and incorrectly_named: # not a 
function, but the name was held over - if issubclass(type(obj), type): - # thanks to Tom Stepleton pointing out pickler._session unneeded - _t = 'T2' - logger.trace(pickler, "%s: %s", _t, obj) - _dict = _dict_from_dictproxy(obj.__dict__) - else: - _t = 'T3' - logger.trace(pickler, "%s: %s", _t, obj) - _dict = obj.__dict__ + # thanks to Tom Stepleton pointing out pickler._session unneeded + logger.trace(pickler, "T2: %s", obj) + _dict = obj.__dict__.copy() # convert dictproxy to dict #print (_dict) #print ("%s\n%s" % (type(obj), obj.__name__)) #print ("%s\n%s" % (obj.__bases__, obj.__dict__)) - for name in _dict.get("__slots__", []): + slots = _dict.get('__slots__', ()) + if type(slots) == str: slots = (slots,) # __slots__ accepts a single string + for name in slots: del _dict[name] + _dict.pop('__dict__', None) + _dict.pop('__weakref__', None) + _dict.pop('__prepare__', None) if obj_name != obj.__name__: if postproc_list is None: postproc_list = [] @@ -1715,7 +1807,7 @@ def save_type(pickler, obj, postproc_list=None): _save_with_postproc(pickler, (_create_type, ( type(obj), obj.__name__, obj.__bases__, _dict )), obj=obj, postproc_list=postproc_list) - logger.trace(pickler, "# %s", _t) + logger.trace(pickler, "# T2") else: logger.trace(pickler, "T4: %s", obj) if incorrectly_named: @@ -1965,14 +2057,14 @@ def pickles(obj,exact=False,safe=False,**kwds): """ if safe: exceptions = (Exception,) # RuntimeError, ValueError else: - exceptions = (TypeError, AssertionError, NotImplementedError, PicklingError, UnpicklingError) + exceptions = UNPICKLEABLE_ERRORS + (AssertionError, UnpicklingError) try: pik = copy(obj, **kwds) #FIXME: should check types match first, then check content if "exact" try: #FIXME: should be "(pik == obj).all()" for numpy comparison, though that'll fail if shapes differ result = bool(pik.all() == obj.all()) - except AttributeError: + except (AttributeError, TypeError): warnings.filterwarnings('ignore') result = pik == obj warnings.resetwarnings() diff 
--git a/dill/_objects.py b/dill/_objects.py index d9f9fe57..011f7f1f 100644 --- a/dill/_objects.py +++ b/dill/_objects.py @@ -188,10 +188,8 @@ class _Struct(ctypes.Structure): z['CFloatType'] = ctypes.c_float() z['CDoubleType'] = ctypes.c_double() z['CSizeTType'] = ctypes.c_size_t() - z = (sys.platform[:3] == 'win' or sys.platform[:6] == 'darwin') # non-'nux - z = a if (sys.hexversion >= 0x30b00b3 and not z) else x - z['CLibraryLoaderType'] = ctypes.cdll del z + a['CLibraryLoaderType'] = ctypes.cdll a['StructureType'] = _Struct # if not IS_PYPY: # a['BigEndianStructureType'] = ctypes.BigEndianStructure() diff --git a/dill/logger.py b/dill/logger.py index 9359d0e4..0e7ed4a5 100644 --- a/dill/logger.py +++ b/dill/logger.py @@ -50,7 +50,7 @@ import math import os from functools import partial -from typing import NoReturn, TextIO, Union +from typing import TextIO, Union import dill from ._utils import _format_bytes_size @@ -130,22 +130,26 @@ def trace_setup(self, pickler): if not dill._dill.is_dill(pickler, child=False): return if self.isEnabledFor(logging.INFO): - pickler._trace_depth = 1 + pickler._trace_stack = [] pickler._size_stack = [] else: - pickler._trace_depth = None - def trace(self, pickler, msg, *args, **kwargs): - if not hasattr(pickler, '_trace_depth'): + pickler._trace_stack = None + def trace(self, pickler, msg, *args, obj=None, **kwargs): + if not hasattr(pickler, '_trace_stack'): logger.info(msg, *args, **kwargs) return - if pickler._trace_depth is None: + if pickler._trace_stack is None: return extra = kwargs.get('extra', {}) pushed_obj = msg.startswith('#') + if not pushed_obj: + if obj is None: + obj = args[-1] + pickler._trace_stack.append(id(obj)) size = None try: # Streams are not required to be tellable. 
- size = pickler._file.tell() + size = pickler._file_tell() frame = pickler.framer.current_frame try: size += frame.tell() @@ -160,13 +164,11 @@ def trace(self, pickler, msg, *args, **kwargs): else: size -= pickler._size_stack.pop() extra['size'] = size - if pushed_obj: - pickler._trace_depth -= 1 - extra['depth'] = pickler._trace_depth + extra['depth'] = len(pickler._trace_stack) kwargs['extra'] = extra self.info(msg, *args, **kwargs) - if not pushed_obj: - pickler._trace_depth += 1 + if pushed_obj: + pickler._trace_stack.pop() class TraceFormatter(logging.Formatter): """ @@ -207,11 +209,12 @@ def format(self, record): return super().format(record) logger = logging.getLogger('dill') +logger.propagate = False adapter = TraceAdapter(logger) -stderr_handler = logging.StreamHandler() +stderr_handler = logging._StderrHandler() adapter.addHandler(stderr_handler) -def trace(arg: Union[bool, TextIO, str, os.PathLike] = None, *, mode: str = 'a') -> NoReturn: +def trace(arg: Union[bool, TextIO, str, os.PathLike] = None, *, mode: str = 'a') -> None: """print a trace through the stack when pickling; useful for debugging With a single boolean argument, enable or disable the tracing. diff --git a/dill/session.py b/dill/session.py index 31aff717..70d10ae9 100644 --- a/dill/session.py +++ b/dill/session.py @@ -8,6 +8,27 @@ # - https://github.com/uqfoundation/dill/blob/master/LICENSE """ Pickle and restore the intepreter session or a module's state. + +The functions :py:func:`dump_module`, :py:func:`load_module` and +:py:func:`load_module_asdict` are capable of saving and restoring, as long as +objects are pickleable, the complete state of a module. For imported modules +that are pickled, `dill` assumes that they are importable when unpickling. + +Contrary to using :py:func:`dill.dump` and :py:func:`dill.load` to save and load +a module object, :py:func:`dill.dump_module` always tries to pickle the module by
Also, options like +``dill.settings['byref']`` and ``dill.settings['recurse']`` don't affect its +behavior. + +However, if a module contains references to objects originating from other +modules, that would prevent it from pickling or drastically increase its disk +size, they can be saved by reference instead of by value using the option +``refimported``. + +With :py:func:`dump_module`, namespace filters may be used to restrict the list +of variables pickled to a subset of those in the module, based on their names or +values. Also, using :py:func:`load_module_asdict` allows one to load the +variables from different saved states of the same module into dictionaries. """ from __future__ import annotations @@ -22,11 +43,13 @@ logger = logging.getLogger('dill.session') import contextlib +import io import re import sys import warnings +from contextlib import AbstractContextManager, nullcontext, suppress -from dill import _dill, Pickler, Unpickler +from dill import _dill, Pickler, Unpickler, UnpicklingError from ._dill import ( BuiltinMethodType, FunctionType, MethodType, ModuleType, TypeType, _getopt, _import_module, _is_builtin_module, _is_imported_module, @@ -43,10 +66,14 @@ TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) -class _PeekableReader: +class _PeekableReader(AbstractContextManager): """lightweight readable stream wrapper that implements peek()""" - def __init__(self, stream): + def __init__(self, stream, closing=True): self.stream = stream + self.closing = closing + def __exit__(self, *exc_info): + if self.closing: + self.stream.close() def read(self, n): return self.stream.read(n) def readline(self): @@ -68,29 +95,52 @@ def peek(self, n): except (AttributeError, OSError): raise NotImplementedError("stream is not peekable: %r", stream) from None -def _open(file, mode, *, peekable=False): +class _TruncatableWriter(io.BytesIO, AbstractContextManager): + """works as an unlimited buffer, writes to file on close""" + def __init__(self, stream, closing=True, 
*args, **kwds): + super().__init__(*args, **kwds) + self.stream = stream + self.closing = closing + def __exit__(self, *exc_info): + self.close() + def close(self): + self.stream.write(self.getvalue()) + with suppress(AttributeError): + self.stream.flush() + super().close() + if self.closing: + self.stream.close() + +def _open(file, mode, *, peekable=False, truncatable=False): """return a context manager with an opened file-like object""" - import io - attr = 'write' if 'w' in mode else 'read' - was_open = hasattr(file, attr) - if not was_open: + readonly = ('r' in mode and '+' not in mode) + if not readonly and peekable: + raise ValueError("the 'peekable' option is invalid for writable files") + if readonly and truncatable: + raise ValueError("the 'truncatable' option is invalid for read-only files") + should_close = not hasattr(file, 'read' if readonly else 'write') + if should_close: file = open(file, mode) - if attr == 'read' and peekable and not hasattr(file, 'peek'): - # Try our best to return the stream as an object with a peek() method. + # Wrap stream in a helper class if necessary. + if peekable and not hasattr(file, 'peek'): + # Try our best to return it as an object with a peek() method. if hasattr(file, 'tell') and hasattr(file, 'seek'): - file = _PeekableReader(file) + file = _PeekableReader(file, closing=should_close) else: try: file = io.BufferedReader(file) except Exception: - # Stream won't be peekable, but will fail gracefully in _identify_module(). - file = _PeekableReader(file) - if was_open: # should not close at exit - return contextlib.nullcontext(file) - elif type(file) == _PeekableReader: - return contextlib.closing(file) - else: + # It won't be peekable, but will fail gracefully in _identify_module(). 
+ file = _PeekableReader(file, closing=should_close) + elif truncatable and ( + not hasattr(file, 'truncate') + or (hasattr(file, 'seekable') and not file.seekable()) + ): + file = _TruncatableWriter(file, closing=should_close) + if should_close or isinstance(file, (_PeekableReader, _TruncatableWriter)): return file + else: + return nullcontext(file) def _module_map(): """get map of imported modules""" @@ -218,15 +268,16 @@ def _filter_vars(main_module, exclude, include, base_rules): def dump_module( filename = str(TEMPDIR/'session.pkl'), - module: Union[ModuleType, str] = None, + module: Optional[Union[ModuleType, str]] = None, *, - refimported: bool = None, - exclude: Union[Filter, Iterable[Filter]] = None, - include: Union[Filter, Iterable[Filter]] = None, - base_rules: ModuleFilters = None, + refimported: Optional[bool] = None, + refonfail: Optional[bool] = None, + exclude: Optional[Union[Filter, Iterable[Filter]]] = None, + include: Optional[Union[Filter, Iterable[Filter]]] = None, + base_rules: Optional[ModuleFilters] = None, **kwds ) -> None: - """Pickle the current state of :py:mod:`__main__` or another module to a file. + R"""Pickle the current state of :py:mod:`__main__` or another module to a file. Save the contents of :py:mod:`__main__` (e.g. from an interactive interpreter session), an imported module, or a module-type object (e.g. @@ -249,6 +300,13 @@ def dump_module( similar but independent from ``dill.settings[`byref`]``, as ``refimported`` refers to virtually all imported objects, while ``byref`` only affects select objects. + refonfail: if `True` (the default), objects that fail to pickle by value + will try to be saved by reference. If this also fails, saving their + parent objects by reference will be attempted recursively. In the + worst case scenario, the module itself may be saved by reference, + with a warning. Note: this option disables framing for pickle + protocol >= 4. 
Turning this off may improve unpickling speed, but + may cause a module to fail pickling. exclude: here be dragons include: here be dragons base_rules: here be dragons @@ -282,6 +340,15 @@ def dump_module( >>> foo.sin = math.sin >>> dill.dump_module('foo_session.pkl', module=foo, refimported=True) + - Save the state of a module with unpickleable objects: + + >>> import dill + >>> import os + >>> os.altsep = '\\' + >>> dill.dump_module('os_session.pkl', module=os, refonfail=False) + PicklingError: ... + >>> dill.dump_module('os_session.pkl', module=os, refonfail=True) # the default + - Restore the state of the saved modules: >>> import dill @@ -294,6 +361,9 @@ def dump_module( >>> foo = dill.load_module('foo_session.pkl') >>> [foo.sin(x) for x in foo.values] [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] + >>> os = dill.load_module('os_session.pkl') + >>> print(os.altsep.join('path')) + p\a\t\h - Save current session but exclude some variables: @@ -307,7 +377,7 @@ def dump_module( Note: Currently, ``dill.settings['byref']`` and ``dill.settings['recurse']`` - don't apply to this function.` + don't apply to this function. 
""" for old_par, par in [('main', 'module'), ('byref', 'refimported')]: if old_par in kwds: @@ -320,10 +390,10 @@ def dump_module( refimported = kwds.pop('byref', refimported) module = kwds.pop('main', module) - from .settings import settings - protocol = settings['protocol'] - refimported = _getopt(settings, 'dump_module.refimported', refimported) - base_rules = _getopt(settings, 'dump_module.filters', base_rules) + from .settings import settings as dill_settings + protocol = dill_settings['protocol'] + refimported = _getopt(settings, 'refimported', refimported) + base_rules = _getopt(settings, 'filters', base_rules) if type(base_rules) != ModuleFilters: base_rules = ModuleFilters(base_rules) main = module @@ -347,7 +417,7 @@ def dump_module( if getattr(main, '__loader__', None) is None and _is_imported_module(original_main): # Trick _is_imported_module() to force saving this as an imported module. main.__loader__ = True # will be discarded by _dill.save_module() - with _open(filename, 'wb') as file: + with _open(filename, 'wb', truncatable=True) as file: pickler = Pickler(file, protocol, **kwds) pickler._main = main #FIXME: dill.settings are disabled pickler._byref = False # disable pickling by name reference @@ -356,6 +426,11 @@ def dump_module( pickler._first_pass = True if main is not original_main: pickler._original_main = original_main + if refonfail: + pickler._refonfail = True # False by default + pickler._file_seek = file.seek + pickler._file_truncate = file.truncate + pickler._id_to_name = {id(v): k for k, v in main.__dict__.items()} pickler.dump(main) return @@ -367,31 +442,39 @@ def dump_session(filename=str(TEMPDIR/'session.pkl'), main=None, byref=False, ** def _identify_module(file, main=None): """identify the name of the module stored in the given file-type object""" - from pickletools import genops - UNICODE = {'UNICODE', 'BINUNICODE', 'SHORT_BINUNICODE'} - found_import = False + import pickletools + NEUTRAL = {'PROTO', 'FRAME', 'PUT', 'BINPUT', 
'MEMOIZE', 'MARK', 'STACK_GLOBAL'} + opcodes = ((opcode.name, arg) for opcode, arg, pos in pickletools.genops(file.peek(256)) + if opcode.name not in NEUTRAL) try: - for opcode, arg, pos in genops(file.peek(256)): - if not found_import: - # Find the first '_import_module' constructor. - if opcode.name in ('GLOBAL', 'SHORT_BINUNICODE') and \ - arg.endswith('_import_module'): - found_import = True - else: - # Just after that, the argument is the main module name. - if opcode.name in UNICODE: - return arg - else: - raise UnpicklingError("reached STOP without finding main module") + opcode, arg = next(opcodes) + if (opcode, arg) == ('SHORT_BINUNICODE', 'dill._dill'): + # The file uses STACK_GLOBAL instead of GLOBAL. + opcode, arg = next(opcodes) + if not (opcode in ('SHORT_BINUNICODE', 'GLOBAL') and arg.split()[-1] == '_import_module'): + raise ValueError + opcode, arg = next(opcodes) + if not opcode in ('SHORT_BINUNICODE', 'BINUNICODE', 'UNICODE'): + raise ValueError + module_name = arg + if not ( + next(opcodes)[0] in ('TUPLE1', 'TUPLE') and + next(opcodes)[0] == 'REDUCE' #and + #next(opcodes)[0] in ('EMPTY_DICT', 'DICT') + ): + raise ValueError + return module_name + except StopIteration: + raise UnpicklingError("reached STOP without finding module") from None except (NotImplementedError, ValueError) as error: # ValueError occours when the end of the chunk is reached (without a STOP). if isinstance(error, NotImplementedError) and main is not None: # The file is not peekable, but we have the argument main. return None - raise UnpicklingError("unable to identify main module") from error + raise UnpicklingError("unable to identify module") from error def is_pickled_module(filename, importable: bool = True) -> bool: - """Check if file is a pickled module state. + """Check if a file is a pickle file readable by :py:func:`load_module`. Parameters: filename: a path-like object or a readable stream. 
@@ -439,7 +522,7 @@ def is_pickled_module(filename, importable: bool = True) -> bool: def load_module( filename = str(TEMPDIR/'session.pkl'), - module: Union[ModuleType, str] = None, + module: Optional[Union[ModuleType, str]] = None, **kwds ) -> Optional[ModuleType]: """Update the selected module (default is :py:mod:`__main__`) with @@ -451,7 +534,7 @@ def load_module( :py:class:`~types.ModuleType`). When restoring the state of a non-importable module-type object, the - current instance of this module may be passed as the argument ``main``. + current instance of this module may be passed as the argument ``module``. Otherwise, a new instance is created with :py:class:`~types.ModuleType` and returned. @@ -464,12 +547,66 @@ def load_module( Raises: :py:exc:`UnpicklingError`: if unpickling fails. - :py:exc:`ValueError`: if the argument ``main`` and module saved + :py:exc:`ValueError`: if the argument ``module`` and module saved at ``filename`` are incompatible. Returns: A module object, if the saved module is not :py:mod:`__main__` or - a module instance wasn't provided with the argument ``main``. + a module instance wasn't provided with the argument ``module``. + + Passing an argument to ``module`` forces `dill` to verify that the module + being loaded is compatible with the argument value. Additionally, if the + argument is a module (instead of a module name), it supresses the return + value. Each case and behavior is exemplified below: + + 1. `module`: ``None`` --- This call loads a previously saved state of + the module ``math`` and returns this at the end: + + >>> import dill + >>> # load module -> restore state -> return module + >>> dill.load_module('math_session.pkl') + + + 2. 
`module`: ``str`` --- Passing the module name does the same as above, + but also verifies that the module loaded, restored and returned is + indeed ``math``: + + >>> import dill + >>> # load module -> check name/kind -> restore state -> return module + >>> dill.load_module('math_session.pkl', module='math') + + >>> dill.load_module('math_session.pkl', module='cmath') + ValueError: can't update module 'cmath' with the saved state of module 'math' + + 3. `module`: ``ModuleType`` --- Passing the module itself instead of its + name have the additional effect of supressing the return value (and the + module is already loaded at this point): + + >>> import dill + >>> import math + >>> # check name/kind -> restore state -> return None + >>> dill.load_module('math_session.pkl', module=math) + + For imported modules, the return value is meant as a convenience, so that + the function call can substitute an ``import`` statement. Therefore these + statements: + + >>> import dill + >>> math2 = dill.load_module('math_session.pkl', module='math') + + are equivalent to these: + + >>> import dill + >>> import math as math2 + >>> dill.load_module('math_session.pkl', module=math2) + + Note that, in both cases, ``math2`` is just a reference to + ``sys.modules['math']``: + + >>> import math + >>> import sys + >>> math is math2 is sys.modules['math'] + True Examples: @@ -547,6 +684,7 @@ def load_module( raise TypeError("both 'module' and 'main' arguments were used") module = kwds.pop('main') + main = module with _open(filename, 'rb', peekable=True) as file: #FIXME: dill.settings are disabled unpickler = Unpickler(file, **kwds) @@ -820,6 +958,16 @@ def get_filters(self, rule_type: RuleType): raise return self._parent.get_filters(rule_type) + +## Default settings ## + +settings = { + 'refimported': False, + 'refonfail': True, + 'filters': ModuleFilters(rules=()), +} + + ## Session filter factories ## def ipython_filter(*, keep_history: str = 'input') -> Callable[NamedObject, bool]: @@ 
-888,10 +1036,7 @@ def not_interactive_var(obj): return not_interactive_var -## Variables set in this module to avoid circular import problems. ## - -from .settings import settings -settings['dump_module']['filters'] = ModuleFilters(rules=()) +## Variables set in this module to avoid circular import problems ## # Internal exports for backward compatibility with dill v0.3.5.1 for name in ( @@ -900,4 +1045,4 @@ def not_interactive_var(obj): ): setattr(_dill, name, globals()[name]) -del name, settings +del name diff --git a/dill/source.py b/dill/source.py index 7e70a635..229a3575 100644 --- a/dill/source.py +++ b/dill/source.py @@ -12,7 +12,7 @@ """ Extensions to python's 'inspect' module, which can be used to retrieve information from live python objects. The methods -defined in this module are augmented to facilitate access to +defined in this module are augmented to facilitate access to source code of interactively defined functions and classes, as well as provide access to source code for objects defined in a file. 
@@ -29,6 +29,8 @@ ismodule, istraceback) from tokenize import TokenError +from ._dill import IS_IPYTHON + def isfrommain(obj): "check if object was built in __main__" @@ -41,7 +43,7 @@ def isfrommain(obj): def isdynamic(obj): "check if object was built in the interpreter" try: file = getfile(obj) - except TypeError: file = None + except TypeError: file = None if file == '' and isfrommain(obj): return True return False @@ -112,10 +114,32 @@ def findsource(object): module = getmodule(object) try: file = getfile(module) - except TypeError: file = None + except TypeError: file = None + is_module_main = (module and module.__name__ == '__main__' and not file) + if IS_IPYTHON and is_module_main: + #FIXME: quick fix for functions and classes in IPython interpreter + try: + file = getfile(object) + sourcefile = getsourcefile(object) + except TypeError: + if isclass(object): + for object_method in filter(isfunction, object.__dict__.values()): + # look for a method of the class + file_candidate = getfile(object_method) + if not file_candidate.startswith(' indent or spaces < 0: spaces = indent for i in range(len(lines) if all else 1): #FIXME: works... but shouldn't outdent 2nd+ lines of multiline doc @@ -498,7 +522,7 @@ def _outdent(lines, spaces=None, all=True): def outdent(code, spaces=None, all=True): '''outdent a block of code (default is to strip all leading whitespace)''' - indent = indentsize(code) + indent = indentsize(code) if spaces is None or spaces > indent or spaces < 0: spaces = indent #XXX: will this delete '\n' in some cases? 
if not all: return code[spaces:] @@ -554,7 +578,7 @@ def dumpsource(object, alias='', new=False, enclose=True): else: stub = alias pre = '%s = ' % stub if alias else alias - + # if a 'new' instance is not needed, then just dump and load if not new or not _isinstance(object): code += pre + 'dill.loads(%s)\n' % pik @@ -829,7 +853,7 @@ def _closuredimport(func, alias='', builtin=False): re.match(pat, line)] if not candidate: mod = getname(getmodule(fobj)) - #HACK: get file containing 'inner' function; is func there? + #HACK: get file containing 'inner' function; is func there? lines,_ = findsource(fobj) candidate = [line for line in lines \ if getname(fobj) in line and re.match(pat, line)] diff --git a/dill/tests/test_classdef.py b/dill/tests/test_classdef.py index 8edf5daf..8480fc90 100644 --- a/dill/tests/test_classdef.py +++ b/dill/tests/test_classdef.py @@ -128,8 +128,8 @@ def test_dtype(): import numpy as np dti = np.dtype('int') - assert np.dtype == dill.loads(dill.dumps(np.dtype)) - assert dti == dill.loads(dill.dumps(dti)) + assert np.dtype == dill.copy(np.dtype) + assert dti == dill.copy(dti) except ImportError: pass @@ -139,8 +139,7 @@ def test_array_nested(): x = np.array([1]) y = (x,) - dill.dumps(x) - assert y == dill.loads(dill.dumps(y)) + assert y == dill.copy(y) except ImportError: pass @@ -198,7 +197,7 @@ def test(cls): # test slots class Y(object): - __slots__ = ['y'] + __slots__ = ('y', '__weakref__') def __init__(self, y): self.y = y diff --git a/dill/tests/test_selected.py b/dill/tests/test_selected.py index 59ff2ce1..2f0eda73 100644 --- a/dill/tests/test_selected.py +++ b/dill/tests/test_selected.py @@ -98,9 +98,28 @@ def test_frame_related(): assert ok if verbose: print ("") +def test_typing(): + import typing + x = typing.Any + assert x == dill.copy(x) + x = typing.Dict[int, str] + assert x == dill.copy(x) + x = typing.List[int] + assert x == dill.copy(x) + x = typing.Tuple[int, str] + assert x == dill.copy(x) + x = typing.Tuple[int] + assert x 
== dill.copy(x) + x = typing.Tuple[()] + assert x == dill.copy(x) + x = typing.Tuple[()].copy_with(()) + assert x == dill.copy(x) + return + if __name__ == '__main__': test_frame_related() test_dict_contents() test_class() test_class_descriptors() + test_typing() diff --git a/dill/tests/test_weakref.py b/dill/tests/test_weakref.py index 0e99f3ea..943b73d9 100644 --- a/dill/tests/test_weakref.py +++ b/dill/tests/test_weakref.py @@ -14,15 +14,7 @@ class _class: def _method(self): pass -class _class2: - def __call__(self): - pass - -class _newclass(object): - def _method(self): - pass - -class _newclass2(object): +class _callable_class: def __call__(self): pass @@ -32,42 +24,33 @@ def _function(): def test_weakref(): o = _class() - oc = _class2() - n = _newclass() - nc = _newclass2() + oc = _callable_class() f = _function - z = _class - x = _newclass + x = _class + # ReferenceType r = weakref.ref(o) - dr = weakref.ref(_class()) - p = weakref.proxy(o) - dp = weakref.proxy(_class()) - c = weakref.proxy(oc) - dc = weakref.proxy(_class2()) + d_r = weakref.ref(_class()) + fr = weakref.ref(f) + xr = weakref.ref(x) - m = weakref.ref(n) - dm = weakref.ref(_newclass()) - t = weakref.proxy(n) - dt = weakref.proxy(_newclass()) - d = weakref.proxy(nc) - dd = weakref.proxy(_newclass2()) + # ProxyType + p = weakref.proxy(o) + d_p = weakref.proxy(_class()) - fr = weakref.ref(f) + # CallableProxyType + cp = weakref.proxy(oc) + d_cp = weakref.proxy(_callable_class()) fp = weakref.proxy(f) - #zr = weakref.ref(z) #XXX: weakrefs not allowed for classobj objects - #zp = weakref.proxy(z) #XXX: weakrefs not allowed for classobj objects - xr = weakref.ref(x) xp = weakref.proxy(x) - objlist = [r,dr,m,dm,fr,xr, p,dp,t,dt, c,dc,d,dd, fp,xp] + objlist = [r,d_r,fr,xr, p,d_p, cp,d_cp,fp,xp] #dill.detect.trace(True) for obj in objlist: res = dill.detect.errors(obj) if res: - print ("%s" % res) - #print ("%s:\n %s" % (obj, res)) + print ("%r:\n %s" % (obj, res)) # else: # print ("PASS: %s" % obj) 
assert not res diff --git a/docs/source/conf.py b/docs/source/conf.py index 20fff7f4..e732a27a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -73,13 +73,12 @@ 'special-members': True, 'show-inheritance': True, 'exclude-members': ( #NOTE: this is a single string concatenation - '__dict__,' # implementation detail (may be verbose) - '__slots__,' # implementation detail - '__module__,' # implementation detail - '__weakref__,' # built-in automatic attribute, mostly meaningless - '__annotations__,' # redundant with signature documentation + '__dict__,' # implementation detail (may be verbose) + '__slots__,' # implementation detail + '__module__,' # implementation detail + '__annotations__,' # redundant with signature documentation '__dataclass_fields__,' # dataclass automatic attribute, redundant - '_abc_impl,' # implementation detail + '_abc_impl,' # implementation detail ) } autodoc_typehints = 'description' diff --git a/setup.py b/setup.py index bd182e23..b28e04f1 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ author_email = AUTHOR_EMAIL, maintainer = __author__, maintainer_email = AUTHOR_EMAIL, - license = '3-clause BSD', + license = 'BSD-3-Clause', platforms = ['Linux', 'Windows', 'Mac'], url = 'https://github.com/uqfoundation/dill', download_url = 'https://pypi.org/project/dill/#files', From f60d239156295876699221b77c9c13f8b0a625a5 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 13 Aug 2022 22:14:01 -0300 Subject: [PATCH 060/109] merge the two save() methods and save_module_dict() with _save_module_dict() --- dill/_dill.py | 123 ++++++++++++++++++++++++------------------------ dill/session.py | 1 - 2 files changed, 61 insertions(+), 63 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 91a0b51f..bfcd6a67 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -326,8 +326,11 @@ class UnpicklingWarning(PickleWarning, UnpicklingError): class Pickler(StockPickler): """python's Pickler extended to interpreter sessions""" dispatch = 
MetaCatchingDict(StockPickler.dispatch.copy()) - _refonfail = False + _refimported = False + _refonfail = False # True in session.settings _session = False + _first_pass = False + _original_main = None from .settings import settings def __init__(self, file, *args, **kwds): @@ -346,12 +349,23 @@ def __init__(self, file, *args, **kwds): self._postproc = OrderedDict() self._file_tell = getattr(file, 'tell', None) # for logger and refonfail - def save(self, obj, save_persistent_id=True): - # register if the object is a numpy ufunc - # thanks to Paul Kienzle for pointing out ufuncs didn't pickle + def save(self, obj, save_persistent_id=True, *, name=None): + # This method overrides StockPickler.save() and is called for every + # object pickled. When 'refonfail' is True, it tries to save the object + # by reference if pickling it fails with a common pickling error, as + # defined by the constant UNPICKLEABLE_ERRORS. If that also fails, then + # the exception is risen and, if this was called indirectly from another + # Pickler.save() call, the parent objects will try to be saved by + # reference recursively, until it succeeds or the exception propagates + # beyond the topmost save() call. The extra 'name' argument is passed + # to StockPickler.save_global(). 
+ + # numpy hack obj_type = type(obj) if NumpyArrayType and not (obj_type is type or obj_type in Pickler.dispatch): - if NumpyUfuncType and numpyufunc(obj_type): + # register if the object is a numpy ufunc + # thanks to Paul Kienzle for pointing out ufuncs didn't pickle + if numpyufunc(obj_type): @register(obj_type) def save_numpy_ufunc(pickler, obj): logger.trace(pickler, "Nu: %s", obj) @@ -365,7 +379,7 @@ def save_numpy_ufunc(pickler, obj): # def uload(name): return getattr(numpy, name) # copy_reg.pickle(NumpyUfuncType, udump, uload) # register if the object is a numpy dtype - if NumpyDType and numpydtype(obj_type): + if numpydtype(obj_type): @register(obj_type) def save_numpy_dtype(pickler, obj): logger.trace(pickler, "Dt: %s", obj) @@ -378,7 +392,7 @@ def save_numpy_dtype(pickler, obj): # def udump(f): return uload, (f.type,) # copy_reg.pickle(NumpyDTypeType, udump, uload) # register if the object is a subclassed numpy array instance - if NumpyArrayType and ndarraysubclassinstance(obj_type): + if ndarraysubclassinstance(obj_type): @register(obj_type) def save_numpy_array(pickler, obj): logger.trace(pickler, "Nu: (%s, %s)", obj.shape, obj.dtype, obj=obj) @@ -387,32 +401,17 @@ def save_numpy_array(pickler, obj): pickler.save_reduce(_create_array, (f,args,state,npdict), obj=obj) logger.trace(pickler, "# Nu") return - # end hack - if GENERATOR_FAIL and type(obj) == GeneratorType: + # end numpy hack + + if GENERATOR_FAIL and obj_type is GeneratorType: msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType raise PicklingError(msg) - StockPickler.save(self, obj, save_persistent_id) - - save.__doc__ = StockPickler.save.__doc__ - - def dump(self, obj): #NOTE: if settings change, need to update attributes - logger.trace_setup(self) - StockPickler.dump(self, obj) - dump.__doc__ = StockPickler.dump.__doc__ - def save(self, obj, save_persistent_id=True, *, name=None): - # This method overrides StockPickler.save() and is called for every - # 
object pickled. When 'refonfail' is True, it tries to save the object - # by reference if pickling it fails with a common pickling error, as - # defined by the constant UNPICKLEABLE_ERRORS. If that also fails, then - # the exception is risen and, if this was called indirectly from another - # Pickler.save() call, the parent objects will try to be saved by - # reference recursively, until it succeeds or the exception propagates - # beyond the topmost save() call. The extra 'name' argument is passed - # to StockPickler.save_global(). if not self._refonfail: super().save(obj, save_persistent_id) return + + # Save with 'refonfail'. # Disable framing (right after the framer.init_framing() call at dump()). self.framer.current_frame = None # Store initial state. @@ -464,36 +463,13 @@ def save(self, obj, save_persistent_id=True, *, name=None): self._trace_stack.pop() self._size_stack.pop() raise error from error_stack + return + save.__doc__ = StockPickler.save.__doc__ - def _save_module_dict(self, obj): - """Save a module's dictionary. - - If an object doesn't have a '__name__' attribute, pass the object's name - in the module's namespace to save(), so that it can be used with - save_global() to increase the chances of finding the object for saving - it by reference in the event of a failed serialization. - """ - if not self._refonfail: - super().save_dict(obj) - return - # Modified from Python Standard Library's pickle._Pickler.save_dict() - # and pickle._Pickler._batch_setitems(). Summary of changes: use - # 'SETITEM' for all pickle protocols and conditionally pass an extra - # argument to a custom implementation of the method 'save'. 
- # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved - # License Agreement: https://opensource.org/licenses/Python-2.0 - if self.bin: - self.write(EMPTY_DICT) - else: # proto 0 -- can't use EMPTY_DICT - self.write(MARK + DICT) - self.memoize(obj) - for k, v in obj.items(): - self.save(k) - if hasattr(v, '__name__') or hasattr(v, '__qualname__'): - self.save(v) - else: - self.save(v, name=k) - self.write(SETITEM) + def dump(self, obj): #NOTE: if settings change, need to update attributes + logger.trace_setup(self) + StockPickler.dump(self, obj) + dump.__doc__ = StockPickler.dump.__doc__ class Unpickler(StockUnpickler): """python's Unpickler extended to interpreter sessions and more types""" @@ -1279,16 +1255,39 @@ def save_module_dict(pickler, obj): logger.trace(pickler, "D4: %s", _repr_dict(obj), obj=obj) pickler.write(bytes('c%s\n__dict__\n' % obj['__name__'], 'UTF-8')) logger.trace(pickler, "# D4") - elif pickler_is_dill and pickler._session and pickler._first_pass: + elif not (pickler_is_dill and pickler._session and pickler._first_pass and pickler._refonfail): # we only care about session the first pass thru - pickler._first_pass = False - logger.trace(pickler, "D5: %s", _repr_dict(obj), obj=obj) - pickler._save_module_dict(obj) - logger.trace(pickler, "# D5") - else: + if pickler_is_dill and pickler._first_pass: + pickler._first_pass = False logger.trace(pickler, "D2: %s", _repr_dict(obj), obj=obj) StockPickler.save_dict(pickler, obj) logger.trace(pickler, "# D2") + else: + # If an object doesn't have a '__name__' attribute, pass the object's name + # in the module's namespace to save(), so that it can be used with + # save_global() to increase the chances of finding the object for saving + # it by reference in the event of a failed serialization. 
+ pickler._first_pass = False + logger.trace(pickler, "D5: %s", _repr_dict(obj), obj=obj) + # Modified from Python Standard Library's pickle._Pickler.save_dict() + # and pickle._Pickler._batch_setitems(). Summary of changes: use + # 'SETITEM' for all pickle protocols and conditionally pass an extra + # argument to a custom implementation of the method 'save'. + # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved + # License Agreement: https://opensource.org/licenses/Python-2.0 + if pickler.bin: + pickler.write(EMPTY_DICT) + else: # proto 0 -- can't use EMPTY_DICT + pickler.write(MARK + DICT) + pickler.memoize(obj) + for k, v in obj.items(): + pickler.save(k) + if hasattr(v, '__name__') or hasattr(v, '__qualname__'): + pickler.save(v) + else: + pickler.save(v, name=k) + pickler.write(SETITEM) + logger.trace(pickler, "# D5") return diff --git a/dill/session.py b/dill/session.py index feecc147..608abbcb 100644 --- a/dill/session.py +++ b/dill/session.py @@ -366,7 +366,6 @@ def dump_module( pickler._refonfail = True # False by default pickler._file_seek = file.seek pickler._file_truncate = file.truncate - pickler._id_to_name = {id(v): k for k, v in main.__dict__.items()} pickler.dump(main) return From 4fe577b6c3eb693a870bc7dc932b78c7ac63f3dc Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 13 Aug 2022 22:21:00 -0300 Subject: [PATCH 061/109] minor --- dill/_dill.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index bfcd6a67..938f4728 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -39,8 +39,7 @@ #XXX: get types from .objtypes ? 
import builtins as __builtin__ from pickle import _Pickler as StockPickler, Unpickler as StockUnpickler -from pickle import DICT, EMPTY_DICT, MARK, SETITEM -from struct import pack +from pickle import DICT, EMPTY_DICT, GLOBAL, MARK, SETITEM from _thread import LockType from _thread import RLock as RLockType #from io import IOBase @@ -174,8 +173,6 @@ def get_file_type(*args, **kwargs): import dataclasses import typing -from pickle import GLOBAL - ### Shims for different versions of Python and dill class Sentinel(object): From d059d842fb37f1c7b2b55fd2d60709f59d4cd390 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 13 Aug 2022 23:09:56 -0300 Subject: [PATCH 062/109] grammar; keep __weakref__ attribute in docs --- dill/session.py | 30 +++++++++++++++--------------- docs/source/conf.py | 2 +- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/dill/session.py b/dill/session.py index 608abbcb..36964eb7 100644 --- a/dill/session.py +++ b/dill/session.py @@ -15,18 +15,18 @@ that are pickled, `dill` assumes that they are importable when unpickling. Contrary of using :py:func:`dill.dump` and :py:func:`dill.load` to save and load -a module object, :py:func:`dill.dump_module` always try to pickle the module by -value (including built-in modules). Also, options like +a module object, :py:func:`dill.dump_module` always tries to pickle the module +by value (including built-in modules). Also, options like ``dill.settings['byref']`` and ``dill.settings['recurse']`` don't affect its behavior. However, if a module contains references to objects originating from other modules, that would prevent it from pickling or drastically increase its disk -size, they can be saved by reference instead of by value using the option +size, they can be saved by reference instead of by value, using the option ``refimported``. 
With :py:func:`dump_module`, namespace filters may be used to restrict the list -of variables pickled to a subset of those in the module, based on their names or +of pickled variables to a subset of those in the module, based on their names or values. Also, using :py:func:`load_module_asdict` allows one to load the variables from different saved states of the same module into dictionaries. """ @@ -261,8 +261,8 @@ def dump_module( parent objects by reference will be attempted recursively. In the worst case scenario, the module itself may be saved by reference, with a warning. Note: this option disables framing for pickle - protocol >= 4. Turning this off may improve unpickling speed, but - may cause a module to fail pickling. + protocol >= 4. Turning it off may improve unpickling speed, but may + cause a module to fail pickling. **kwds: extra keyword arguments passed to :py:class:`Pickler()`. Raises: @@ -470,7 +470,7 @@ def load_module( value. Each case and behavior is exemplified below: 1. `module`: ``None`` --- This call loads a previously saved state of - the module ``math`` and returns this at the end: + the module ``math`` and returns it (the module object) at the end: >>> import dill >>> # load module -> restore state -> return module @@ -478,7 +478,7 @@ def load_module( 2. `module`: ``str`` --- Passing the module name does the same as above, - but also verifies that the module loaded, restored and returned is + but also verifies that the module being loaded, restored and returned is indeed ``math``: >>> import dill @@ -489,7 +489,7 @@ def load_module( ValueError: can't update module 'cmath' with the saved state of module 'math' 3. 
`module`: ``ModuleType`` --- Passing the module itself instead of its - name have the additional effect of supressing the return value (and the + name has the additional effect of suppressing the return value (and the module is already loaded at this point): >>> import dill @@ -715,22 +715,22 @@ def load_module_asdict( raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") with _open(filename, 'rb', peekable=True) as file: main_name = _identify_module(file) - old_main = sys.modules.get(main_name) + original_main = sys.modules.get(main_name) main = ModuleType(main_name) if update: - if old_main is None: - old_main = _import_module(main_name) - main.__dict__.update(old_main.__dict__) + if original_main is None: + original_main = _import_module(main_name) + main.__dict__.update(original_main.__dict__) else: main.__builtins__ = __builtin__ try: sys.modules[main_name] = main load_module(file, **kwds) finally: - if old_main is None: + if original_main is None: del sys.modules[main_name] else: - sys.modules[main_name] = old_main + sys.modules[main_name] = original_main main.__session__ = str(filename) return main.__dict__ diff --git a/docs/source/conf.py b/docs/source/conf.py index ff34cd55..72c6fdfe 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -72,7 +72,7 @@ 'private-members': True, 'special-members': True, 'show-inheritance': True, - 'exclude-members': '__dict__, __module__, __slots__, __weakref__', + 'exclude-members': '__dict__, __module__, __slots__', } autodoc_typehints = 'description' napoleon_include_init_with_doc = True From 35cd4a0e0a14c084dda4a0fef80f4fc3449dcd9f Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 13 Aug 2022 23:35:20 -0300 Subject: [PATCH 063/109] Merge branch 'document-session' into session-excludes --- dill/_dill.py | 120 +++++++++++++++++++++++------------------------- dill/session.py | 22 ++++----- 2 files changed, 69 insertions(+), 73 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py 
index 35cfb6fa..6aeb68ab 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -334,7 +334,7 @@ class Pickler(StockPickler): """python's Pickler extended to interpreter sessions""" dispatch = MetaCatchingDict(StockPickler.dispatch.copy()) _refimported = False - _refonfail = False + _refonfail = False # True in session.settings _session = False _first_pass = False _original_main = None @@ -352,12 +352,23 @@ def __init__(self, file, *args, **kwds): self._file_tell = getattr(file, 'tell', None) # for logger and refonfail super().__init__(file, *args, **kwds) - def save(self, obj, save_persistent_id=True): - # register if the object is a numpy ufunc - # thanks to Paul Kienzle for pointing out ufuncs didn't pickle + def save(self, obj, save_persistent_id=True, *, name=None): + # This method overrides StockPickler.save() and is called for every + # object pickled. When 'refonfail' is True, it tries to save the object + # by reference if pickling it fails with a common pickling error, as + # defined by the constant UNPICKLEABLE_ERRORS. If that also fails, then + # the exception is risen and, if this was called indirectly from another + # Pickler.save() call, the parent objects will try to be saved by + # reference recursively, until it succeeds or the exception propagates + # beyond the topmost save() call. The extra 'name' argument is passed + # to StockPickler.save_global(). 
+ + # numpy hack obj_type = type(obj) if NumpyArrayType and not (obj_type is type or obj_type in Pickler.dispatch): - if NumpyUfuncType and numpyufunc(obj_type): + # register if the object is a numpy ufunc + # thanks to Paul Kienzle for pointing out ufuncs didn't pickle + if numpyufunc(obj_type): @register(obj_type) def save_numpy_ufunc(pickler, obj): logger.trace(pickler, "Nu: %s", obj) @@ -371,7 +382,7 @@ def save_numpy_ufunc(pickler, obj): # def uload(name): return getattr(numpy, name) # copy_reg.pickle(NumpyUfuncType, udump, uload) # register if the object is a numpy dtype - if NumpyDType and numpydtype(obj_type): + if numpydtype(obj_type): @register(obj_type) def save_numpy_dtype(pickler, obj): logger.trace(pickler, "Dt: %s", obj) @@ -384,7 +395,7 @@ def save_numpy_dtype(pickler, obj): # def udump(f): return uload, (f.type,) # copy_reg.pickle(NumpyDTypeType, udump, uload) # register if the object is a subclassed numpy array instance - if NumpyArrayType and ndarraysubclassinstance(obj_type): + if ndarraysubclassinstance(obj_type): @register(obj_type) def save_numpy_array(pickler, obj): logger.trace(pickler, "Nu: (%s, %s)", obj.shape, obj.dtype, obj=obj) @@ -393,32 +404,17 @@ def save_numpy_array(pickler, obj): pickler.save_reduce(_create_array, (f,args,state,npdict), obj=obj) logger.trace(pickler, "# Nu") return - # end hack - if GENERATOR_FAIL and type(obj) == GeneratorType: + # end numpy hack + + if GENERATOR_FAIL and obj_type is GeneratorType: msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType raise PicklingError(msg) - StockPickler.save(self, obj, save_persistent_id) - - save.__doc__ = StockPickler.save.__doc__ - - def dump(self, obj): #NOTE: if settings change, need to update attributes - logger.trace_setup(self) - StockPickler.dump(self, obj) - dump.__doc__ = StockPickler.dump.__doc__ - def save(self, obj, save_persistent_id=True, *, name=None): - # This method overrides StockPickler.save() and is called for every - # 
object pickled. When 'refonfail' is True, it tries to save the object - # by reference if pickling it fails with a common pickling error, as - # defined by the constant UNPICKLEABLE_ERRORS. If that also fails, then - # the exception is risen and, if this was called indirectly from another - # Pickler.save() call, the parent objects will try to be saved by - # reference recursively, until it succeeds or the exception propagates - # beyond the topmost save() call. The extra 'name' argument is passed - # to StockPickler.save_global(). if not self._refonfail: super().save(obj, save_persistent_id) return + + # Save with 'refonfail'. # Disable framing (right after the framer.init_framing() call at dump()). self.framer.current_frame = None # Store initial state. @@ -470,36 +466,13 @@ def save(self, obj, save_persistent_id=True, *, name=None): self._trace_stack.pop() self._size_stack.pop() raise error from error_stack + return + save.__doc__ = StockPickler.save.__doc__ - def _save_module_dict(self, obj): - """Save a module's dictionary. - - If an object doesn't have a '__name__' attribute, pass the object's name - in the module's namespace to save(), so that it can be used with - save_global() to increase the chances of finding the object for saving - it by reference in the event of a failed serialization. - """ - if not self._refonfail: - super().save_dict(obj) - return - # Modified from Python Standard Library's pickle._Pickler.save_dict() - # and pickle._Pickler._batch_setitems(). Summary of changes: use - # 'SETITEM' for all pickle protocols and conditionally pass an extra - # argument to a custom implementation of the method 'save'. 
- # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved - # License Agreement: https://opensource.org/licenses/Python-2.0 - if self.bin: - self.write(EMPTY_DICT) - else: # proto 0 -- can't use EMPTY_DICT - self.write(MARK + DICT) - self.memoize(obj) - for k, v in obj.items(): - self.save(k) - if hasattr(v, '__name__') or hasattr(v, '__qualname__'): - self.save(v) - else: - self.save(v, name=k) - self.write(SETITEM) + def dump(self, obj): #NOTE: if settings change, need to update attributes + logger.trace_setup(self) + StockPickler.dump(self, obj) + dump.__doc__ = StockPickler.dump.__doc__ class Unpickler(StockUnpickler): """python's Unpickler extended to interpreter sessions and more types""" @@ -1284,16 +1257,39 @@ def save_module_dict(pickler, obj): logger.trace(pickler, "D4: %s", _repr_dict(obj), obj=obj) pickler.write(bytes('c%s\n__dict__\n' % obj['__name__'], 'UTF-8')) logger.trace(pickler, "# D4") - elif pickler_is_dill and pickler._session and pickler._first_pass: + elif not (pickler_is_dill and pickler._session and pickler._first_pass and pickler._refonfail): # we only care about session the first pass thru - pickler._first_pass = False - logger.trace(pickler, "D5: %s", _repr_dict(obj), obj=obj) - pickler._save_module_dict(obj) - logger.trace(pickler, "# D5") - else: + if pickler_is_dill and pickler._first_pass: + pickler._first_pass = False logger.trace(pickler, "D2: %s", _repr_dict(obj), obj=obj) StockPickler.save_dict(pickler, obj) logger.trace(pickler, "# D2") + else: + # If an object doesn't have a '__name__' attribute, pass the object's name + # in the module's namespace to save(), so that it can be used with + # save_global() to increase the chances of finding the object for saving + # it by reference in the event of a failed serialization. 
+ pickler._first_pass = False + logger.trace(pickler, "D5: %s", _repr_dict(obj), obj=obj) + # Modified from Python Standard Library's pickle._Pickler.save_dict() + # and pickle._Pickler._batch_setitems(). Summary of changes: use + # 'SETITEM' for all pickle protocols and conditionally pass an extra + # argument to a custom implementation of the method 'save'. + # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved + # License Agreement: https://opensource.org/licenses/Python-2.0 + if pickler.bin: + pickler.write(EMPTY_DICT) + else: # proto 0 -- can't use EMPTY_DICT + pickler.write(MARK + DICT) + pickler.memoize(obj) + for k, v in obj.items(): + pickler.save(k) + if hasattr(v, '__name__') or hasattr(v, '__qualname__'): + pickler.save(v) + else: + pickler.save(v, name=k) + pickler.write(SETITEM) + logger.trace(pickler, "# D5") return diff --git a/dill/session.py b/dill/session.py index 70d10ae9..a0feda0d 100644 --- a/dill/session.py +++ b/dill/session.py @@ -15,18 +15,18 @@ that are pickled, `dill` assumes that they are importable when unpickling. Contrary of using :py:func:`dill.dump` and :py:func:`dill.load` to save and load -a module object, :py:func:`dill.dump_module` always try to pickle the module by -value (including built-in modules). Also, options like +a module object, :py:func:`dill.dump_module` always tries to pickle the module +by value (including built-in modules). Also, options like ``dill.settings['byref']`` and ``dill.settings['recurse']`` don't affect its behavior. However, if a module contains references to objects originating from other modules, that would prevent it from pickling or drastically increase its disk -size, they can be saved by reference instead of by value using the option +size, they can be saved by reference instead of by value, using the option ``refimported``. 
With :py:func:`dump_module`, namespace filters may be used to restrict the list -of variables pickled to a subset of those in the module, based on their names or +of pickled variables to a subset of those in the module, based on their names or values. Also, using :py:func:`load_module_asdict` allows one to load the variables from different saved states of the same module into dictionaries. """ @@ -305,8 +305,8 @@ def dump_module( parent objects by reference will be attempted recursively. In the worst case scenario, the module itself may be saved by reference, with a warning. Note: this option disables framing for pickle - protocol >= 4. Turning this off may improve unpickling speed, but - may cause a module to fail pickling. + protocol >= 4. Turning it off may improve unpickling speed, but may + cause a module to fail pickling. exclude: here be dragons include: here be dragons base_rules: here be dragons @@ -394,7 +394,8 @@ def dump_module( protocol = dill_settings['protocol'] refimported = _getopt(settings, 'refimported', refimported) base_rules = _getopt(settings, 'filters', base_rules) - if type(base_rules) != ModuleFilters: base_rules = ModuleFilters(base_rules) + if not isinstance(base_rules, ModuleFilters): + base_rules = ModuleFilters(base_rules) main = module if main is None: @@ -430,7 +431,6 @@ def dump_module( pickler._refonfail = True # False by default pickler._file_seek = file.seek pickler._file_truncate = file.truncate - pickler._id_to_name = {id(v): k for k, v in main.__dict__.items()} pickler.dump(main) return @@ -560,7 +560,7 @@ def load_module( value. Each case and behavior is exemplified below: 1. `module`: ``None`` --- This call loads a previously saved state of - the module ``math`` and returns this at the end: + the module ``math`` and returns it (the module object) at the end: >>> import dill >>> # load module -> restore state -> return module @@ -568,7 +568,7 @@ def load_module( 2. 
`module`: ``str`` --- Passing the module name does the same as above, - but also verifies that the module loaded, restored and returned is + but also verifies that the module being loaded, restored and returned is indeed ``math``: >>> import dill @@ -579,7 +579,7 @@ def load_module( ValueError: can't update module 'cmath' with the saved state of module 'math' 3. `module`: ``ModuleType`` --- Passing the module itself instead of its - name have the additional effect of supressing the return value (and the + name has the additional effect of suppressing the return value (and the module is already loaded at this point): >>> import dill From 20f04e2a35a069561e339393579e5f140f7188bc Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sun, 14 Aug 2022 23:28:53 -0300 Subject: [PATCH 064/109] settings updates and tests --- dill/__init__.py | 2 +- dill/_utils.py | 31 +++++-- dill/session.py | 20 ++++- dill/settings.py | 41 +++++---- dill/tests/test_session.py | 6 +- dill/tests/test_settings.py | 171 ++++++++++++++++++++++++++++++++++++ 6 files changed, 241 insertions(+), 30 deletions(-) create mode 100644 dill/tests/test_settings.py diff --git a/dill/__init__.py b/dill/__init__.py index dc984cca..3dd3ab02 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -37,7 +37,7 @@ from . 
import detect, logger, session, source, temp # get global settings -from .settings import settings, read_settings +from .settings import settings, read_settings, reset_settings # make sure "trace" is turned off logger.trace(False) diff --git a/dill/_utils.py b/dill/_utils.py index 319d33d4..b8e2598d 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -48,19 +48,19 @@ def _format_bytes_size(size: Union[int, float]) -> Tuple[int, str]: Rule = Tuple[RuleType, Union[Filter, Iterable[Filter]]] class NamedObject: - """Simple container class for a variable name and value.""" + """Simple container for a variable's name and value used by filter functions.""" __slots__ = 'name', 'value' - def __init__(self, name_value): + def __init__(self, name_value: Tuple[str, Any]): self.name, self.value = name_value - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: """ Prevent simple bugs from writing `lambda obj: obj == 'literal'` instead of `lambda obj: obj.value == 'literal' in a filter definition.` """ - if type(other) != NamedObject: + if type(other) is not NamedObject: raise TypeError("'==' not supported between instances of 'NamedObject' and %r" % type(other).__name__) - return super().__eq__(other) + return self.value is other.value and self.name == other.name def _iter(filters): if isinstance(filters, str): @@ -78,9 +78,11 @@ class FilterSet(MutableSet): ids: Set[int] = field(default_factory=set) types: Set[type] = field(default_factory=set) funcs: Set[Callable] = field(default_factory=set) + _fields = None _rtypemap = None _typename_regex = re.compile(r'\w+(?=Type$)|\w+$', re.IGNORECASE) + def _match_type(self, filter: Filter) -> Tuple[filter, str]: filter_type = type(filter) if filter_type == str: @@ -103,6 +105,7 @@ def _match_type(self, filter: Filter) -> Tuple[filter, str]: else: raise ValueError("invalid filter: %r" % filter) return filter, getattr(self, field) + # Mandatory MutableSet methods. 
@classmethod def _from_iterable(cls, it): @@ -122,6 +125,7 @@ def add(self, filter): def discard(self, filter): filter, filter_set = self._match_type(filter) filter_set.discard(filter) + # Overwrite generic methods (optimization). def remove(self, filter): filter, filter_set = self._match_type(filter) @@ -144,6 +148,7 @@ def __ior__(self, filters): for filter in filters: self.add(filter) return self + # Extra methods. def update(self, filters): self |= filters @@ -158,6 +163,7 @@ def get_type(cls, typename: str) -> type: if cls._rtypemap is None: cls._rtypemap = {cls._get_typekey(k): v for k, v in _dill._reverse_typemap.items()} return cls._rtypemap[cls._get_typekey(typename)] + FilterSet._fields = tuple(field.name for field in fields(FilterSet)) class _FilterSetDescriptor: @@ -254,11 +260,13 @@ class FilterRules: __slots__ = '_exclude', '_include' exclude = _FilterSetDescriptor() include = _FilterSetDescriptor() + def __init__(self, rules: Union[Iterable[Rule], FilterRules] = None): self._exclude = FilterSet() self._include = FilterSet() if rules is not None: self.update(rules) + def __repr__(self): desc = [" 78 else " " return sep.join(desc) + ">" + + def __eq__(self, other): + if not isinstance(other, FilterRules): + return NotImplemented + MISSING = object() + self_exclude = getattr(self, 'exclude', MISSING) + self_include = getattr(self, 'include', MISSING) + other_exclude = getattr(other, 'exclude', MISSING) + other_include = getattr(other, 'include', MISSING) + return self_exclude == other_exclude and self_include == other_include + # Proxy add(), discard(), remove() and clear() to FilterSets. 
def __proxy__(self, method, filter, *, rule_type=RuleType.EXCLUDE): if not isinstance(rule_type, RuleType): @@ -287,9 +306,11 @@ def __proxy__(self, method, filter, *, rule_type=RuleType.EXCLUDE): add = partialmethod(__proxy__, 'add') discard = partialmethod(__proxy__, 'discard') remove = partialmethod(__proxy__, 'remove') + def clear(self): self.exclude.clear() self.include.clear() + def update(self, rules: Union[Iterable[Rule], FilterRules]): """Update both FilterSets from a list of (RuleType, Filter) rules.""" if isinstance(rules, FilterRules): diff --git a/dill/session.py b/dill/session.py index a0feda0d..c6e092ae 100644 --- a/dill/session.py +++ b/dill/session.py @@ -40,7 +40,7 @@ ] import logging -logger = logging.getLogger('dill.session') +logger = logging.getLogger(__name__) import contextlib import io @@ -339,6 +339,7 @@ def dump_module( >>> import math >>> foo.sin = math.sin >>> dill.dump_module('foo_session.pkl', module=foo, refimported=True) + FIXME: here be dragons - Save the state of a module with unpickleable objects: @@ -393,6 +394,7 @@ def dump_module( from .settings import settings as dill_settings protocol = dill_settings['protocol'] refimported = _getopt(settings, 'refimported', refimported) + refonfail = _getopt(settings, 'refonfail', refonfail) base_rules = _getopt(settings, 'filters', base_rules) if not isinstance(base_rules, ModuleFilters): base_rules = ModuleFilters(base_rules) @@ -431,6 +433,7 @@ def dump_module( pickler._refonfail = True # False by default pickler._file_seek = file.seek pickler._file_truncate = file.truncate + pickler._id_to_name = {id(v): k for k, v in main.__dict__.items()} pickler.dump(main) return @@ -942,6 +945,13 @@ def __getitem__(self, name): return mod_rules else: return mod_rules[submodules] + def __eq__(self, other): + if isinstance(other, ModuleFilters): + return super().__eq__(other) and self._module == other._module + elif isinstance(other, FilterRules): + return super().__eq__(other) + else: + return 
NotImplemented def get(self, name: str, default: ModuleFilters = None): try: return self[name] @@ -959,7 +969,7 @@ def get_filters(self, rule_type: RuleType): return self._parent.get_filters(rule_type) -## Default settings ## +## Session settings ## settings = { 'refimported': False, @@ -967,6 +977,12 @@ def get_filters(self, rule_type: RuleType): 'filters': ModuleFilters(rules=()), } +# For read_settings(): +from .settings import DEFAULT_SETTINGS +DEFAULT_SETTINGS[__name__] = settings.copy() +del DEFAULT_SETTINGS[__name__]['filters'] +del DEFAULT_SETTINGS + ## Session filter factories ## diff --git a/dill/settings.py b/dill/settings.py index 4847c071..39944009 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -9,7 +9,7 @@ global settings for Pickler """ -__all__ = ['settings', 'read_settings'] +__all__ = ['settings', 'read_settings', 'reset_settings'] from pickle import DEFAULT_PROTOCOL, HIGHEST_PROTOCOL @@ -21,21 +21,14 @@ 'fmode' : 0, #HANDLE_FMODE 'recurse' : False, 'ignore' : False, - 'dump_module' : { - 'filters' : None, #ModuleFilters(rules=()) # set in dill.session - 'refimported' : False, - }, } ### Config file reader (INI format) ### DEFAULT_SETTINGS = { 'dill': settings.copy(), - 'dill.dump_module': settings['dump_module'].copy() + 'dill.session': None, # set in dill.session } -del DEFAULT_SETTINGS['dill']['dump_module'] -del DEFAULT_SETTINGS['dill.dump_module']['filters'] - FMODES = dict(HANDLE_FMODE=0, CONTENTS_FMODE=1, FILE_FMODE=2) STANDARD_PROTOCOLS = dict(DEFAULT_PROTOCOL=DEFAULT_PROTOCOL, HIGHEST_PROTOCOL=HIGHEST_PROTOCOL) @@ -105,6 +98,7 @@ def read_settings(filename) -> None: - `protocol`: DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, 0, 1, 2, 3, ... - `fmode`: HANDLE_FMODE, 0, CONTENTS_FMODE, 1, FILE_FMODE, 2 + .. IMPORTANT: The demo config file below is used in test_settings.py .. Lexer 'pacmanconf' generates better highlighting than 'ini'. .. 
code-block:: pacmanconf @@ -114,10 +108,10 @@ def read_settings(filename) -> None: protocol = HIGHEST_PROTOCOL byref = yes - [dill.dump_module] + [dill.session] # Settings for dill.dump_module() - ## Stored in dill.settings['dump_module']. - refimported = yes + ## Stored in dill.session.settings. + refonfail = no [filters] # Default exclude/include filters for dill.dump_module() @@ -152,7 +146,7 @@ def read_settings(filename) -> None: """ import configparser from dill import DEFAULT_PROTOCOL, HANDLE_FMODE - from dill.session import ModuleFilters + from dill.session import ModuleFilters, settings as session_settings cp = configparser.ConfigParser( dict_type=dict, # internal, in place of OrderedDict @@ -174,12 +168,10 @@ def read_settings(filename) -> None: new_settings['fmode'] = int(FMODES.get(fmode, fmode)) new_settings['protocol'] = int(STANDARD_PROTOCOLS.get(protocol, protocol)) - # dump_module() settings. - new_settings['dump_module'] = { - 'refimported': cp.getboolean('dill.dump_module', 'refimported'), - 'filters': ModuleFilters(rules=()), - } - filters = new_settings['dump_module']['filters'] + # Session settings (for dump_module). + section = cp['dill.session'] + new_session_settings = {k: section.getboolean(k) for k in DEFAULT_SETTINGS['dill.session']} + filters = new_session_settings['filters'] = ModuleFilters(rules=()) if 'filters' in cp: # Default filters. _read_filters(cp['filters'], filters) @@ -194,3 +186,14 @@ def read_settings(filename) -> None: # Update settings dictionary. settings.clear() settings.update(new_settings) + session_settings.clear() + session_settings.update(new_session_settings) + +def reset_settings() -> None: + "Reset all the dill settings to its default values." 
+ from dill.session import ModuleFilters, settings as session_settings + settings.clear() + settings.update(DEFAULT_SETTINGS['dill']) + session_settings.clear() + session_settings.update(DEFAULT_SETTINGS['dill.session']) + session_settings['filters'] = ModuleFilters(rules=()) diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index f671e0bc..3cc14c7d 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -22,7 +22,7 @@ # Child process # ################### -def _error_line(error, obj, refimported): +def _error_line(obj, refimported): import traceback line = traceback.format_exc().splitlines()[-2].replace('[obj]', '['+repr(obj)+']') return "while testing (with refimported=%s): %s" % (refimported, line.lstrip()) @@ -54,7 +54,7 @@ def test_modules(refimported): assert __main__.complex_log is cmath.log except AssertionError as error: - error.args = (_error_line(error, obj, refimported),) + error.args = (_error_line(obj, refimported),) raise test_modules(refimported) @@ -145,7 +145,7 @@ def _test_objects(main, globals_copy, refimported): assert selfref is __main__ except AssertionError as error: - error.args = (_error_line(error, obj, refimported),) + error.args = (_error_line(obj, refimported),) raise def test_session_main(refimported): diff --git a/dill/tests/test_settings.py b/dill/tests/test_settings.py new file mode 100644 index 00000000..9c202604 --- /dev/null +++ b/dill/tests/test_settings.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python + +# Author: Leonardo Gama (@leogama) +# Copyright (c) 2022 The Uncertainty Quantification Foundation. +# License: 3-clause BSD. 
The full license text is available at: +# - https://github.com/uqfoundation/dill/blob/master/LICENSE + +import io +import re +import sys +import textwrap +import warnings +from pickletools import optimize +from types import ModuleType + +import dill +from dill.session import ModuleFilters, settings as session_settings +from dill.settings import DEFAULT_SETTINGS + +regex = r' +\[dill].+(?=\n +Parameters:$)' +config_demo = re.search(regex, dill.read_settings.__doc__, re.DOTALL | re.MULTILINE).group() +config_demo = textwrap.dedent(config_demo) + +def test_read_settings(): + dill.read_settings(io.StringIO(config_demo)) + + # dill general settings + dill_default = DEFAULT_SETTINGS['dill'] + assert dill.settings['recurse'] is dill_default['recurse'] # unchanged + assert dill.settings['byref'] is (not dill_default['byref']) # here and below: changed + assert dill.settings['protocol'] != dill_default['protocol'] + assert dill.settings['protocol'] == dill.HIGHEST_PROTOCOL # value passed as text + + # session settings (excluding filters) + session_default = DEFAULT_SETTINGS['dill.session'] + assert session_settings['refimported'] is session_default['refimported'] # unchanged + assert session_settings['refonfail'] is (not session_default['refonfail']) # changed + + # session default filters + filters = session_settings['filters'] + assert type(filters) is dill.session.ModuleFilters + assert filters._module == 'DEFAULT' + assert len(filters.exclude) == 8 and len(filters.include) == 2 + assert filters.exclude.regexes == {re.compile(r'_.+')} + assert io.BytesIO in filters.exclude.types + for filter in filters.exclude.funcs: # it's a set, we don't know the order + if isinstance(filter, dill._utils.size_filter): + assert filter.limit == 10000 + else: + obj1 = dill.session.NamedObject(('bool', True)) + obj2 = dill.session.NamedObject(('int', 1)) + assert filter(obj1) is False + assert filter(obj2) is True + ## include: different types of filters in the same entry. 
+ assert len(filters.include.names) == len(filters.include.regexes) == 1 + + # module specific filters + assert filters['some.module']._module == 'some.module' + assert filters['some.module'].exclude.regexes == filters.exclude.regexes + assert not hasattr(filters['some.module'], 'include') # not set, fall back to parent + ## 'some': parent placeholder + assert filters['some']._module == 'some' + assert not hasattr(filters['some'], 'exclude') and not hasattr(filters['some'], 'include') + ## 'another.module': empty filters, overwrite default filters + assert len(filters['another.module'].exclude) == len(filters['another.module'].include) == 0 + +def test_reset_settings(): + dill.reset_settings() + assert dill.settings == DEFAULT_SETTINGS['dill'] + settings_copy = session_settings.copy() + del settings_copy['filters'] + assert settings_copy == DEFAULT_SETTINGS['dill.session'] + assert session_settings['filters'] == ModuleFilters(rules=()) + +class Test: + def __init__(self): + pass + +def test_settings(): + # byref and recurse + for option in ('byref', 'recurse'): + dill.reset_settings() + NON_DEFAULT = not dill.settings[option] + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + pickle1 = dill.dumps(Test) # default + pickle2 = dill.dumps(Test, **{option: NON_DEFAULT}) + dill.settings[option] = NON_DEFAULT + try: + assert pickle1 != pickle2 + assert dill.dumps(Test) == pickle2 + except AssertionError as error: + error.args = ("while testing option %r" % option,) + raise + + # ignore + dill.reset_settings() + NON_DEFAULT = not dill.settings['ignore'] + obj = Test() + copy1 = dill.copy(obj) # default + copy2 = dill.copy(obj, ignore=NON_DEFAULT) + dill.settings['ignore'] = NON_DEFAULT + copy3 = dill.copy(obj) + default_res = type(copy1) is Test + non_default_res = type(copy2) is Test + assert default_res is not non_default_res + assert (type(copy3) is Test) is non_default_res + + # protocol + # Only protocol zero doesn't have an opcode for empty tuple. 
+    dill.reset_settings()
+    EMPTY_TUPLE_0 = b'(t.'
+    assert dill.dumps(()) != EMPTY_TUPLE_0
+    dill.settings['protocol'] = 0
+    assert dill.dumps(()) == EMPTY_TUPLE_0
+
+    # fmode
+    dill.reset_settings()
+    dill.settings['protocol'] = 0
+    for fmode in (dill.HANDLE_FMODE, dill.CONTENTS_FMODE):
+        dill.settings['fmode'] = fmode
+        dump = optimize(dill.dumps(sys.stdin)) # remove memoize opcodes
+        assert dump.endswith(str(fmode).encode() + b'\nV\ntR.')
+
+    # session.refimported
+    dill.reset_settings()
+    module = ModuleType('__test__')
+    module.BUILTIN_CONSTANTS = dill.session.BUILTIN_CONSTANTS
+    NON_DEFAULT = not session_settings['refimported']
+    ## default
+    buf = io.BytesIO()
+    dill.dump_module(buf, module) # refimported=DEFAULT
+    buf.seek(0)
+    copy1 = dill.load_module(buf)
+    ## non-default
+    buf = io.BytesIO()
+    dill.dump_module(buf, module, refimported=NON_DEFAULT)
+    buf.seek(0)
+    copy2 = dill.load_module(buf)
+    ## non-default (settings)
+    session_settings['refimported'] = NON_DEFAULT
+    buf = io.BytesIO()
+    dill.dump_module(buf, module)
+    buf.seek(0)
+    copy3 = dill.load_module(buf)
+    ## tuple was saved by reference?
+ default_res = copy1.BUILTIN_CONSTANTS is dill.session.BUILTIN_CONSTANTS + non_default_res = copy2.BUILTIN_CONSTANTS is dill.session.BUILTIN_CONSTANTS + test_res = copy3.BUILTIN_CONSTANTS is dill.session.BUILTIN_CONSTANTS + assert default_res is not non_default_res + assert test_res is non_default_res + + # session.refonfail + dill.reset_settings() + assert session_settings['refonfail'] is True + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + + dill.dump_module(io.BytesIO(), sys) # should work + session_settings['refonfail'] = False + try: + dill.dump_module(io.BytesIO(), sys) + except Exception: + pass + else: + raise("saving 'sys' without 'refonfail' should have failed") + +if __name__ == '__main__': + test_read_settings() + test_reset_settings() + test_settings() From d2a023bd6a41f819af860fd3b402c5c6b5e87ce0 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sun, 14 Aug 2022 23:45:07 -0300 Subject: [PATCH 065/109] fix test that doesn't apply to PyPy --- dill/tests/test_settings.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dill/tests/test_settings.py b/dill/tests/test_settings.py index 9c202604..c84966f2 100644 --- a/dill/tests/test_settings.py +++ b/dill/tests/test_settings.py @@ -65,6 +65,9 @@ def test_read_settings(): assert len(filters['another.module'].exclude) == len(filters['another.module'].include) == 0 def test_reset_settings(): + dill.settings['byref'] = 'anything' + session_settings['refimported'] = 'something else' + session_settings['filters'].add('a_name') dill.reset_settings() assert dill.settings == DEFAULT_SETTINGS['dill'] settings_copy = session_settings.copy() @@ -166,6 +169,7 @@ def test_settings(): raise("saving 'sys' without 'refonfail' should have failed") if __name__ == '__main__': - test_read_settings() + if not dill._dill.IS_PYPY: + test_read_settings() test_reset_settings() test_settings() From 3c7291e4df0ed9b82ec393ae28a2c2bb7f7a5dcc Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: 
Mon, 15 Aug 2022 00:32:58 -0300 Subject: [PATCH 066/109] minor changes to settings docs --- dill/settings.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dill/settings.py b/dill/settings.py index 39944009..3676170e 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -94,12 +94,12 @@ def read_settings(filename) -> None: Accepted option values for general settings: - - boolean options (case insensitive): yes, no, on, off, true, false + - boolean options (case insensitive): False, True - `protocol`: DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, 0, 1, 2, 3, ... - `fmode`: HANDLE_FMODE, 0, CONTENTS_FMODE, 1, FILE_FMODE, 2 - .. IMPORTANT: The demo config file below is used in test_settings.py - .. Lexer 'pacmanconf' generates better highlighting than 'ini'. + Example of a settings file: + .. code-block:: pacmanconf [dill] @@ -132,7 +132,7 @@ def read_settings(filename) -> None: #include = [filters.another.module] - # Filter rules specifit to the module 'another.module' + # Filter rules specific to the module 'another.module' ## Empty filter sets disable filtering for this module. 
exclude = include = From f5b9ecf787d476283575b064dbf760e8097b87f1 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 15 Aug 2022 22:52:15 -0300 Subject: [PATCH 067/109] review code; complement annotation; clean up imports; some extra methods --- dill/_utils.py | 130 +++++++++++++++++++++++++++-------------------- dill/session.py | 98 ++++++++++++++++++++--------------- dill/settings.py | 22 ++++---- 3 files changed, 142 insertions(+), 108 deletions(-) diff --git a/dill/_utils.py b/dill/_utils.py index b8e2598d..7783e04a 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -14,18 +14,17 @@ __all__ = ['FilterRules', 'Filter', 'RuleType', 'size_filter', 'EXCLUDE', 'INCLUDE'] import math -import random import re import warnings from dataclasses import dataclass, field, fields -from collections import namedtuple from collections.abc import MutableSet from enum import Enum from functools import partialmethod from itertools import chain, filterfalse -from statistics import mean -from types import ModuleType -from typing import Any, Callable, Dict, Iterable, Pattern, Set, Tuple, Union +from typing import ( + Any, Callable, Dict, Iterable, Iterator, + Optional, Pattern, Set, Tuple, Union, +) from dill import _dill @@ -35,18 +34,17 @@ def _format_bytes_size(size: Union[int, float]) -> Tuple[int, str]: power_of_2 = math.trunc(size).bit_length() - 1 magnitude = min(power_of_2 - power_of_2 % 10, 80) # 2**80 == 1 YiB if magnitude: - size = ((size >> magnitude-1) + 1) >> 1 # rounding trick: 1535 -> 1K; 1536 -> 2K - unit = "%siB" % "KMGTPEZY"[magnitude // 10] + # Rounding trick: 1535 (1024 + 511) -> 1K; 1536 -> 2K + size = ((size >> magnitude-1) + 1) >> 1 + unit = "%siB" % "KMGTPEZY"[(magnitude // 10) - 1] return size, unit -# Namespace filtering. + +## Namespace filtering. 
## RuleType = Enum('RuleType', 'EXCLUDE INCLUDE', module=__name__) EXCLUDE, INCLUDE = RuleType.EXCLUDE, RuleType.INCLUDE -Filter = Union[str, Pattern[str], int, type, Callable] -Rule = Tuple[RuleType, Union[Filter, Iterable[Filter]]] - class NamedObject: """Simple container for a variable's name and value used by filter functions.""" __slots__ = 'name', 'value' @@ -62,6 +60,10 @@ def __eq__(self, other: Any) -> bool: type(other).__name__) return self.value is other.value and self.name == other.name +FilterFunction = Callable[[NamedObject], bool] +Filter = Union[str, Pattern[str], int, type, FilterFunction] +Rule = Tuple[RuleType, Union[Filter, Iterable[Filter]]] + def _iter(filters): if isinstance(filters, str): return None @@ -77,15 +79,16 @@ class FilterSet(MutableSet): regexes: Set[Pattern[str]] = field(default_factory=set) ids: Set[int] = field(default_factory=set) types: Set[type] = field(default_factory=set) - funcs: Set[Callable] = field(default_factory=set) + funcs: Set[FilterFunction] = field(default_factory=set) + # Initialized later. _fields = None _rtypemap = None - _typename_regex = re.compile(r'\w+(?=Type$)|\w+$', re.IGNORECASE) def _match_type(self, filter: Filter) -> Tuple[filter, str]: + """identify the filter's type and convert it to standard internal format""" filter_type = type(filter) - if filter_type == str: + if filter_type is str: if filter.isidentifier(): field = 'names' elif filter.startswith('type:'): @@ -94,9 +97,9 @@ def _match_type(self, filter: Filter) -> Tuple[filter, str]: else: filter = re.compile(filter) field = 'regexes' - elif filter_type == re.Pattern: + elif filter_type is re.Pattern: field = 'regexes' - elif filter_type == int: + elif filter_type is int: field = 'ids' elif isinstance(filter, type): field = 'types' @@ -108,55 +111,60 @@ def _match_type(self, filter: Filter) -> Tuple[filter, str]: # Mandatory MutableSet methods. 
@classmethod - def _from_iterable(cls, it): + def _from_iterable(cls, it: Iterable[Filter]) -> FilterSet: obj = cls() obj |= it return obj - def __contains__(self, filter): + def __bool__(self) -> bool: + return any(getattr(self, field) for field in self._fields) + def __len__(self) -> int: + return sum(len(getattr(self, field)) for field in self._fields) + def __contains__(self, filter: Filter) -> bool: filter, filter_set = self._match_type(filter) return filter in filter_set - def __iter__(self): + def __iter__(self) -> Iterator[Filter]: return chain.from_iterable(getattr(self, field) for field in self._fields) - def __len__(self): - return sum(len(getattr(self, field)) for field in self._fields) - def add(self, filter): + def add(self, filter: Filter) -> None: filter, filter_set = self._match_type(filter) filter_set.add(filter) - def discard(self, filter): + def discard(self, filter: Filter) -> None: filter, filter_set = self._match_type(filter) filter_set.discard(filter) # Overwrite generic methods (optimization). - def remove(self, filter): + def remove(self, filter: Filter) -> None: filter, filter_set = self._match_type(filter) filter_set.remove(filter) - def clear(self): + def clear(self) -> None: for field in self._fields: getattr(self, field).clear() - def __or__(self, other): - if not isinstance(other, Iterable): - return NotImplemented + def __or__(self, other: Iterable[Filter]) -> FilterSet: obj = self.copy() obj |= other return obj __ror__ = __or__ - def __ior__(self, filters): - if isinstance(filters, FilterSet): + def __ior__(self, other: Iterable[Filter]) -> FilterSet: + if not isinstance(other, Iterable): + return NotImplemented + if isinstance(other, FilterSet): for field in self._fields: - getattr(self, field).update(getattr(filters, field)) + getattr(self, field).update(getattr(other, field)) else: - for filter in filters: + for filter in other: self.add(filter) return self # Extra methods. 
- def update(self, filters): + def update(self, filters: Iterable[Filters]) -> None: self |= filters - def copy(self): + def copy(self) -> FilterSet: return FilterSet(*(getattr(self, field).copy() for field in self._fields)) + + # Convert type name to type. + TYPENAME_REGEX = re.compile(r'\w+(?=Type$)|\w+$', re.IGNORECASE) @classmethod def _get_typekey(cls, typename: str) -> str: - return cls._typename_regex.match(typename).group().lower() + return cls.TYPENAME_REGEX.match(typename).group().lower() @classmethod def get_type(cls, typename: str) -> type: """retrieve a type registered in ``dill``'s "reverse typemap"'""" @@ -257,7 +265,7 @@ class FilterRules: rules work as an allowlist** instead, and only the variables matched by the include filters are kept. """ - __slots__ = '_exclude', '_include' + __slots__ = '_exclude', '_include', '__weakref__' exclude = _FilterSetDescriptor() include = _FilterSetDescriptor() @@ -267,7 +275,9 @@ def __init__(self, rules: Union[Iterable[Rule], FilterRules] = None): if rules is not None: self.update(rules) - def __repr__(self): + def __repr__(self) -> str: + """Compact representation of FilterSet.""" + COLUMNS = 78 desc = [" 78: + if len(set_desc) > COLUMNS: set_desc = ["FilterSet("] + re.findall(r'\w+={.+?}', set_desc) set_desc = ",\n ".join(set_desc) + "\n )" set_desc = "%s=%s" % (attr, set_desc) @@ -284,10 +294,10 @@ def __repr__(self): desc.append(set_desc) if len(desc) == 1: desc += ["NOT SET"] - sep = "\n " if sum(len(x) for x in desc) > 78 else " " + sep = "\n " if sum(len(x) for x in desc) > COLUMNS else " " return sep.join(desc) + ">" - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if not isinstance(other, FilterRules): return NotImplemented MISSING = object() @@ -298,7 +308,13 @@ def __eq__(self, other): return self_exclude == other_exclude and self_include == other_include # Proxy add(), discard(), remove() and clear() to FilterSets. 
- def __proxy__(self, method, filter, *, rule_type=RuleType.EXCLUDE): + def __proxy__(self, + method: str, + filter: Filter, + *, + rule_type: RuleType = RuleType.EXCLUDE, + ) -> None: + """Call 'method' over FilterSet specified by 'rule_type'.""" if not isinstance(rule_type, RuleType): raise ValueError("invalid rule type: %r (must be one of %r)" % (rule_type, list(RuleType))) filter_set = getattr(self, rule_type.name.lower()) @@ -306,12 +322,11 @@ def __proxy__(self, method, filter, *, rule_type=RuleType.EXCLUDE): add = partialmethod(__proxy__, 'add') discard = partialmethod(__proxy__, 'discard') remove = partialmethod(__proxy__, 'remove') - - def clear(self): + def clear(self) -> None: self.exclude.clear() self.include.clear() - def update(self, rules: Union[Iterable[Rule], FilterRules]): + def update(self, rules: Union[Iterable[Rule], FilterRules]) -> None: """Update both FilterSets from a list of (RuleType, Filter) rules.""" if isinstance(rules, FilterRules): for field in FilterSet._fields: @@ -330,7 +345,10 @@ def update(self, rules: Union[Iterable[Rule], FilterRules]): else: self.add(filter, rule_type=rule_type) - def _apply_filters(self, filter_set, objects): + def _apply_filters(self, + filter_set: FilterSet, + objects: Iterable[NamedObject] + ) -> Iterator[NamedObject]: filters = [] types_list = tuple(filter_set.types) # Apply broader/cheaper filters first. @@ -355,17 +373,17 @@ def apply_filters(self, namespace: Dict[str, Any]) -> Dict[str, Any]: namespace_copy = namespace.copy() all_objs = [NamedObject(item) for item in namespace_copy.items()] - if not self.exclude: - # Treat this rule set as an allowlist. - exclude_objs = all_objs - else: + if self.exclude: include_names = {obj.name for obj in self._apply_filters(self.exclude, all_objs)} exclude_objs = [obj for obj in all_objs if obj.name not in include_names] + else: + # Treat this rule set as an allowlist. 
+ exclude_objs = all_objs if self.include and exclude_objs: exclude_objs = list(self._apply_filters(self.include, exclude_objs)) + if not exclude_objs: return namespace - if len(exclude_objs) == len(namespace): warnings.warn( "the exclude/include rules applied have excluded all %d items" % len(all_objs), @@ -384,7 +402,10 @@ def apply_filters(self, namespace: Dict[str, Any]) -> Dict[str, Any]: import collections import collections.abc +import random +from statistics import mean from sys import getsizeof +from types import ModuleType class size_filter: """Create a filter function with a limit for estimated object size. @@ -403,15 +424,14 @@ class size_filter: Note: Doesn't work on PyPy. See ``help(sys.getsizeof)``. """ - __slots__ = 'limit', 'recursive' # Cover "true" collections from 'builtins', 'collections' and 'collections.abc'. COLLECTION_TYPES = ( list, tuple, collections.deque, collections.UserList, - collections.abc.Mapping, - collections.abc.Set, + collections.abc.Mapping, # dict, OrderedDict, UserDict, etc. + collections.abc.Set, # set, frozenset ) MINIMUM_SIZE = getsizeof(None, 16) MISSING_SLOT = object() @@ -419,7 +439,7 @@ class size_filter: def __init__(self, limit: Union[int, float, str], recursive: bool = True, - ) -> Callable[NamedObject, bool]: + ) -> FilterFunction: if _dill.IS_PYPY: raise NotImplementedError("size_filter() is not implemented for PyPy") self.limit = limit @@ -464,7 +484,7 @@ def __repr__(self): ) @classmethod - def estimate_size(cls, obj: Any, memo: set = None) -> int: + def estimate_size(cls, obj: Any, memo: Optional[set] = None) -> int: if memo is None: memo = set() obj_id = id(obj) diff --git a/dill/session.py b/dill/session.py index c6e092ae..0561dc87 100644 --- a/dill/session.py +++ b/dill/session.py @@ -58,8 +58,8 @@ from ._utils import FilterRules, FilterSet, size_filter, EXCLUDE, INCLUDE # Type hints. 
-from typing import Callable, Iterable, Optional, Union -from ._utils import Filter, NamedObject, RuleType +from typing import Iterable, Optional, Union +from ._utils import Filter, FilterFunction, NamedObject, RuleType import pathlib import tempfile @@ -239,9 +239,9 @@ def _restore_modules(unpickler, main_module): def _filter_vars(main_module, exclude, include, base_rules): """apply exclude/include filters from arguments *and* settings""" rules = FilterRules() - mod_rules = base_rules.get(main_module.__name__, base_rules) - rules.exclude |= mod_rules.get_filters(EXCLUDE) - rules.include |= mod_rules.get_filters(INCLUDE) + mod_filters = base_rules.get(main_module.__name__, base_rules) + rules.exclude |= mod_filters.get_filters(EXCLUDE) + rules.include |= mod_filters.get_filters(INCLUDE) if exclude is not None: rules.update([(EXCLUDE, exclude)]) if include is not None: @@ -447,9 +447,9 @@ def _identify_module(file, main=None): """identify the name of the module stored in the given file-type object""" import pickletools NEUTRAL = {'PROTO', 'FRAME', 'PUT', 'BINPUT', 'MEMOIZE', 'MARK', 'STACK_GLOBAL'} - opcodes = ((opcode.name, arg) for opcode, arg, pos in pickletools.genops(file.peek(256)) - if opcode.name not in NEUTRAL) try: + opcodes = ((opcode.name, arg) for opcode, arg, pos in pickletools.genops(file.peek(256)) + if opcode.name not in NEUTRAL) opcode, arg = next(opcodes) if (opcode, arg) == ('SHORT_BINUNICODE', 'dill._dill'): # The file uses STACK_GLOBAL instead of GLOBAL. @@ -470,7 +470,7 @@ def _identify_module(file, main=None): except StopIteration: raise UnpicklingError("reached STOP without finding module") from None except (NotImplementedError, ValueError) as error: - # ValueError occours when the end of the chunk is reached (without a STOP). + # ValueError also occours when the end of the chunk is reached (without a STOP). if isinstance(error, NotImplementedError) and main is not None: # The file is not peekable, but we have the argument main. 
return None @@ -689,12 +689,7 @@ def load_module( main = module with _open(filename, 'rb', peekable=True) as file: - #FIXME: dill.settings are disabled - unpickler = Unpickler(file, **kwds) - unpickler._session = True - # Resolve main. - main = module pickle_main = _identify_module(file, main) if main is None: main = pickle_main @@ -728,6 +723,9 @@ def load_module( ) # Load the module's state. + #FIXME: dill.settings are disabled + unpickler = Unpickler(file, **kwds) + unpickler._session = True try: if not is_main_imported: # This is for find_class() to be able to locate it. @@ -753,6 +751,7 @@ def load_session(filename=str(TEMPDIR/'session.pkl'), main=None, **kwds): def load_module_asdict( filename = str(TEMPDIR/'session.pkl'), + *, update: bool = False, **kwds ) -> dict: @@ -812,6 +811,7 @@ def load_module_asdict( main_name = _identify_module(file) original_main = sys.modules.get(main_name) main = ModuleType(main_name) + del main.__doc__, main.__package__, main.__spec__ if update: if original_main is None: original_main = _import_module(main_name) @@ -899,36 +899,48 @@ class ModuleFilters(FilterRules): >>> filters.parent.child.grandchild = [(EXCLUDE, str)] # works fine """ __slots__ = '_module', '_parent', '__dict__' - _fields = tuple(x.lstrip('_') for x in FilterRules.__slots__) + def __init__(self, - rules: Union[Iterable[Rule], FilterRules] = None, + rules: Union[Iterable[Rule], FilterRules, None] = None, module: str = 'DEFAULT', parent: ModuleFilters = None, ): - # Don't call super().__init__() if rules is not None: super().__init__(rules) + # else: don't initialize FilterSets. 
if parent is not None and parent._module != 'DEFAULT': module = '%s.%s' % (parent._module, module) + # Bypass self.__setattr__() super().__setattr__('_module', module) super().__setattr__('_parent', parent) - def __repr__(self): + + def __repr__(self) -> str: desc = "DEFAULT" if self._module == 'DEFAULT' else "for %r" % self._module return " bool: + if isinstance(other, ModuleFilters): + return super().__eq__(other) and self._module == other._module + elif isinstance(other, FilterRules): + return super().__eq__(other) + else: + return NotImplemented + + def __setattr__(self, name: str, value: Any) -> None: if name in FilterRules.__slots__: # Don't interfere with superclass attributes. super().__setattr__(name, value) - elif name in self._fields: - if not any(hasattr(self, x) for x in FilterRules.__slots__): - # Initialize other. This is not a placeholder anymore. + elif name in ('exclude', 'include'): + if not (hasattr(self, 'exclude') or hasattr(self, 'include')): + # This was a placeholder node. Initialize 'other'. other = 'include' if name == 'exclude' else 'exclude' super().__setattr__(other, ()) super().__setattr__(name, value) else: # Create a child node for submodule 'name'. - super().__setattr__(name, ModuleFilters(rules=value, module=name, parent=self)) - def __setitem__(self, name, value): + mod_filters = ModuleFilters(rules=value, module=name, parent=self) + super().__setattr__(name, mod_filters) + # Proxy __setitem__ and __getitem__ to self.__dict__ through attributes. + def __setitem__(self, name: str, value: Union[Iterable[Rule], FilterRules, None]) -> None: if '.' not in name: setattr(self, name, value) else: @@ -936,34 +948,36 @@ def __setitem__(self, name, value): if module not in self.__dict__: # Create a placeholder node, like logging.PlaceHolder. 
setattr(self, module, None) - mod_rules = getattr(self, module) - mod_rules[submodules] = value - def __getitem__(self, name): + mod_filters = getattr(self, module) + mod_filters[submodules] = value + def __getitem__(self, name: str) -> ModuleFilters: module, _, submodules = name.partition('.') - mod_rules = getattr(self, module) + mod_filters = getattr(self, module) if not submodules: - return mod_rules + return mod_filters else: - return mod_rules[submodules] - def __eq__(self, other): - if isinstance(other, ModuleFilters): - return super().__eq__(other) and self._module == other._module - elif isinstance(other, FilterRules): - return super().__eq__(other) - else: - return NotImplemented - def get(self, name: str, default: ModuleFilters = None): + return mod_filters[submodules] + + def keys(self) -> List[str]: + values = self.__dict__.values() + # Don't include placeholder nodes. + keys = [x._module for x in values if hasattr(x, 'exclude') or hasattr(x, 'include')] + for mod_filters in values: + keys += mod_filters.keys() + keys.sort() + return keys + def get(self, name: str, default: Optional[ModuleFilters] = None) -> ModuleFilters: try: return self[name] except AttributeError: return default - def get_filters(self, rule_type: RuleType): + def get_filters(self, rule_type: RuleType) -> FilterSet: + """Get exclude/include filters. If not set, fall back to parent module's or default filters.""" if not isinstance(rule_type, RuleType): raise ValueError("invalid rule type: %r (must be one of %r)" % (rule_type, list(RuleType))) try: return getattr(self, rule_type.name.lower()) except AttributeError: - # 'self' is a placeholder, 'exclude' and 'include' are unset. 
if self._parent is None: raise return self._parent.get_filters(rule_type) @@ -986,7 +1000,7 @@ def get_filters(self, rule_type: RuleType): ## Session filter factories ## -def ipython_filter(*, keep_history: str = 'input') -> Callable[NamedObject, bool]: +def ipython_filter(*, keep_history: str = 'input') -> FilterFunction: """Filter factory to exclude IPython hidden variables. When saving the session with :py:func:`dump_module` in an IPython @@ -1032,8 +1046,8 @@ def ipython_filter(*, keep_history: str = 'input') -> Callable[NamedObject, bool # Code snippet adapted from IPython.core.magics.namespace.who_ls() user_ns = ipython_shell.user_ns user_ns_hidden = ipython_shell.user_ns_hidden - nonmatching = object() # This can never be in user_ns - interactive_vars = {x for x in user_ns if user_ns[x] is not user_ns_hidden.get(x, nonmatching)} + NONMATCHING = object() # This can never be in user_ns + interactive_vars = {x for x in user_ns if user_ns[x] is not user_ns_hidden.get(x, NONMATCHING)} # Input and output history hidden variables. history_regex = [] diff --git a/dill/settings.py b/dill/settings.py index 3676170e..0d51262b 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # # Author: Mike McKerns (mmckerns @caltech and @uqfoundation) +# Author: Leonardo Gama (@leogama) # Copyright (c) 2008-2016 California Institute of Technology. # Copyright (c) 2016-2022 The Uncertainty Quantification Foundation. # License: 3-clause BSD. 
The full license text is available at: @@ -35,7 +36,7 @@ del DEFAULT_PROTOCOL, HIGHEST_PROTOCOL def _split_option(option, strip_quotes=False): - """split option text by commas or newlines""" + """Split option text by commas *or* newlines.""" import re if not option: return [] # empty value option = option.strip(",\n") # strip leading/trailing commas and empty lines @@ -48,7 +49,7 @@ def _read_filters(section, module_filters): for rule_type in (EXCLUDE, INCLUDE): rule = rule_type.name.lower() if not any(option.lower().startswith(rule) for option in section): - # If no filters, let it fall back to parent module or default filters. + # If no filters, let it fall back to parent module's or default filters. delattr(module_filters, rule) continue for option in (rule, '%s.names' % rule, '%s.regexes' % rule): @@ -66,8 +67,7 @@ def _read_filters(section, module_filters): module_filters.add(filter, rule_type=rule_type) if '%s.funcs' % rule in section: for code in section['%s.funcs' % rule].strip('\n').splitlines(): - if code.startswith('(') and code.endswith(')'): - code = code[1:-1] + code = code.strip() globals_ = {} if not code.startswith('lambda'): name = code.partition('(')[0] @@ -145,7 +145,6 @@ def read_settings(filename) -> None: For details about the accepted INI format, see :py:mod:`configparser`. """ import configparser - from dill import DEFAULT_PROTOCOL, HANDLE_FMODE from dill.session import ModuleFilters, settings as session_settings cp = configparser.ConfigParser( @@ -162,11 +161,12 @@ def read_settings(filename) -> None: # General settings. 
section = cp['dill'] new_settings = {k: section.getboolean(k) - for k, v in DEFAULT_SETTINGS['dill'].items() if type(v) == bool} - fmode = section.get('fmode') - protocol = section.get('protocol') - new_settings['fmode'] = int(FMODES.get(fmode, fmode)) - new_settings['protocol'] = int(STANDARD_PROTOCOLS.get(protocol, protocol)) + for k, v in DEFAULT_SETTINGS['dill'].items() if type(v) is bool} + for option, named_opts in [('fmode', FMODES), ('protocol', STANDARD_PROTOCOLS)]: + try: + new_settings[option] = section.getint(option) + except ValueError: + new_settings[option] = named_opts[section.get(option)] # Session settings (for dump_module). section = cp['dill.session'] @@ -180,7 +180,7 @@ def read_settings(filename) -> None: continue module = module.partition('.')[-1] assert all(x.isidentifier() for x in module.split('.')) - filters[module] = () # instantiate ModuleFilters and FilterSet's + filters[module] = () # instantiate ModuleFilters and FilterSets _read_filters(section, filters[module]) # Update settings dictionary. 
From 9ec9ae0450003036e8ce1196b9acba4dd3c2fc8a Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 16 Aug 2022 19:56:51 -0300 Subject: [PATCH 068/109] Move file-related functions and classes from session to _utils - Rename _TruncatableWriter to _SeekableWriter --- dill/_utils.py | 104 +++++++++++++++++++++++++++++++++++++++++++----- dill/session.py | 87 ++-------------------------------------- 2 files changed, 99 insertions(+), 92 deletions(-) diff --git a/dill/_utils.py b/dill/_utils.py index 7783e04a..4aed5c5e 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -11,23 +11,31 @@ from __future__ import annotations -__all__ = ['FilterRules', 'Filter', 'RuleType', 'size_filter', 'EXCLUDE', 'INCLUDE'] +__all__ = [ + 'Filter', 'FilterFunction', 'FilterRules', 'FilterSet', 'NamedObject', + 'Rule', 'RuleType', 'size_filter', 'EXCLUDE', 'INCLUDE', +] +import contextlib +import io import math import re import warnings from dataclasses import dataclass, field, fields -from collections.abc import MutableSet +from collections import abc +from contextlib import suppress from enum import Enum from functools import partialmethod from itertools import chain, filterfalse + +from dill import _dill # _dill is not completely loaded + +# Type hints. from typing import ( Any, Callable, Dict, Iterable, Iterator, Optional, Pattern, Set, Tuple, Union, ) -from dill import _dill - def _format_bytes_size(size: Union[int, float]) -> Tuple[int, str]: """Return bytes size text representation in human-redable form.""" unit = "B" @@ -40,7 +48,87 @@ def _format_bytes_size(size: Union[int, float]) -> Tuple[int, str]: return size, unit -## Namespace filtering. 
## +## File-related utilities ## + +class _PeekableReader(contextlib.AbstractContextManager): + """lightweight readable stream wrapper that implements peek()""" + def __init__(self, stream, closing=True): + self.stream = stream + self.closing = closing + def __exit__(self, *exc_info): + if self.closing: + self.stream.close() + def read(self, n): + return self.stream.read(n) + def readline(self): + return self.stream.readline() + def tell(self): + return self.stream.tell() + def close(self): + return self.stream.close() + def peek(self, n): + stream = self.stream + try: + if hasattr(stream, 'flush'): + stream.flush() + position = stream.tell() + stream.seek(position) # assert seek() works before reading + chunk = stream.read(n) + stream.seek(position) + return chunk + except (AttributeError, OSError): + raise NotImplementedError("stream is not peekable: %r", stream) from None + +class _SeekableWriter(io.BytesIO, contextlib.AbstractContextManager): + """works as an unlimited buffer, writes to file on close""" + def __init__(self, stream, closing=True, *args, **kwds): + super().__init__(*args, **kwds) + self.stream = stream + self.closing = closing + def __exit__(self, *exc_info): + self.close() + def close(self): + self.stream.write(self.getvalue()) + with suppress(AttributeError): + self.stream.flush() + super().close() + if self.closing: + self.stream.close() + +def _open(file, mode, *, peekable=False, seekable=False): + """return a context manager with an opened file-like object""" + readonly = ('r' in mode and '+' not in mode) + if not readonly and peekable: + raise ValueError("the 'peekable' option is invalid for writable files") + if readonly and seekable: + raise ValueError("the 'seekable' option is invalid for read-only files") + should_close = not hasattr(file, 'read' if readonly else 'write') + if should_close: + file = open(file, mode) + # Wrap stream in a helper class if necessary. 
+ if peekable and not hasattr(file, 'peek'): + # Try our best to return it as an object with a peek() method. + if hasattr(file, 'tell') and hasattr(file, 'seek'): + file = _PeekableReader(file, closing=should_close) + else: + try: + file = io.BufferedReader(file) + except Exception: + # It won't be peekable, but will fail gracefully in _identify_module(). + file = _PeekableReader(file, closing=should_close) + elif seekable and ( + not hasattr(file, 'seek') + or not hasattr(file, 'truncate') + or (hasattr(file, 'seekable') and not file.seekable()) + ): + file = _SeekableWriter(file, closing=should_close) + if should_close or isinstance(file, (_PeekableReader, _SeekableWriter)): + return file + else: + return contextlib.nullcontext(file) + + +## Namespace filtering ## RuleType = Enum('RuleType', 'EXCLUDE INCLUDE', module=__name__) EXCLUDE, INCLUDE = RuleType.EXCLUDE, RuleType.INCLUDE @@ -73,7 +161,7 @@ def _iter(filters): return None @dataclass -class FilterSet(MutableSet): +class FilterSet(abc.MutableSet): """A superset of exclude/include filter sets.""" names: Set[str] = field(default_factory=set) regexes: Set[Pattern[str]] = field(default_factory=set) @@ -396,9 +484,7 @@ def apply_filters(self, namespace: Dict[str, Any]) -> Dict[str, Any]: return namespace_copy -###################### -# Filter factories # -###################### +## Filter factories ## import collections import collections.abc diff --git a/dill/session.py b/dill/session.py index 0561dc87..2378e16f 100644 --- a/dill/session.py +++ b/dill/session.py @@ -42,12 +42,9 @@ import logging logger = logging.getLogger(__name__) -import contextlib -import io import re import sys import warnings -from contextlib import AbstractContextManager, nullcontext, suppress from dill import _dill, Pickler, Unpickler, UnpicklingError from ._dill import ( @@ -55,93 +52,17 @@ _getopt, _import_module, _is_builtin_module, _is_imported_module, _main_module, _reverse_typemap, __builtin__, ) -from ._utils import FilterRules, 
FilterSet, size_filter, EXCLUDE, INCLUDE +from ._utils import FilterRules, FilterSet, _open, size_filter, EXCLUDE, INCLUDE # Type hints. from typing import Iterable, Optional, Union -from ._utils import Filter, FilterFunction, NamedObject, RuleType +from ._utils import Filter, FilterFunction, NamedObject, Rule, RuleType import pathlib import tempfile TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) -class _PeekableReader(AbstractContextManager): - """lightweight readable stream wrapper that implements peek()""" - def __init__(self, stream, closing=True): - self.stream = stream - self.closing = closing - def __exit__(self, *exc_info): - if self.closing: - self.stream.close() - def read(self, n): - return self.stream.read(n) - def readline(self): - return self.stream.readline() - def tell(self): - return self.stream.tell() - def close(self): - return self.stream.close() - def peek(self, n): - stream = self.stream - try: - if hasattr(stream, 'flush'): - stream.flush() - position = stream.tell() - stream.seek(position) # assert seek() works before reading - chunk = stream.read(n) - stream.seek(position) - return chunk - except (AttributeError, OSError): - raise NotImplementedError("stream is not peekable: %r", stream) from None - -class _TruncatableWriter(io.BytesIO, AbstractContextManager): - """works as an unlimited buffer, writes to file on close""" - def __init__(self, stream, closing=True, *args, **kwds): - super().__init__(*args, **kwds) - self.stream = stream - self.closing = closing - def __exit__(self, *exc_info): - self.close() - def close(self): - self.stream.write(self.getvalue()) - with suppress(AttributeError): - self.stream.flush() - super().close() - if self.closing: - self.stream.close() - -def _open(file, mode, *, peekable=False, truncatable=False): - """return a context manager with an opened file-like object""" - readonly = ('r' in mode and '+' not in mode) - if not readonly and peekable: - raise ValueError("the 'peekable' option is invalid for 
writable files") - if readonly and truncatable: - raise ValueError("the 'truncatable' option is invalid for read-only files") - should_close = not hasattr(file, 'read' if readonly else 'write') - if should_close: - file = open(file, mode) - # Wrap stream in a helper class if necessary. - if peekable and not hasattr(file, 'peek'): - # Try our best to return it as an object with a peek() method. - if hasattr(file, 'tell') and hasattr(file, 'seek'): - file = _PeekableReader(file, closing=should_close) - else: - try: - file = io.BufferedReader(file) - except Exception: - # It won't be peekable, but will fail gracefully in _identify_module(). - file = _PeekableReader(file, closing=should_close) - elif truncatable and ( - not hasattr(file, 'truncate') - or (hasattr(file, 'seekable') and not file.seekable()) - ): - file = _TruncatableWriter(file, closing=should_close) - if should_close or isinstance(file, (_PeekableReader, _TruncatableWriter)): - return file - else: - return nullcontext(file) - def _module_map(): """get map of imported modules""" from collections import defaultdict @@ -420,7 +341,7 @@ def dump_module( if getattr(main, '__loader__', None) is None and _is_imported_module(original_main): # Trick _is_imported_module() to force saving this as an imported module. 
main.__loader__ = True # will be discarded by _dill.save_module() - with _open(filename, 'wb', truncatable=True) as file: + with _open(filename, 'wb', seekable=True) as file: pickler = Pickler(file, protocol, **kwds) pickler._main = main #FIXME: dill.settings are disabled pickler._byref = False # disable pickling by name reference @@ -1058,7 +979,7 @@ def ipython_filter(*, keep_history: str = 'input') -> FilterFunction: interactive_vars |= {'_oh', 'Out', '_', '__', '___'} history_regex.append(re.compile(r'_\d+')) - def not_interactive_var(obj): + def not_interactive_var(obj: NamedObject) -> bool: if any(regex.fullmatch(obj.name) for regex in history_regex): return False return obj.name not in interactive_vars From 86b59d69527b79d41810632b971f842b9073e73d Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sun, 21 Aug 2022 22:48:32 -0300 Subject: [PATCH 069/109] complete refonfail behavior for unpickleables, plus tests --- dill/_dill.py | 312 ++++++++++++++++++++++++++----------- dill/logger.py | 8 +- dill/session.py | 105 +++++-------- dill/tests/test_session.py | 94 ++++++++++- 4 files changed, 362 insertions(+), 157 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 6aeb68ab..07f48e91 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -60,13 +60,13 @@ import gc # import zlib import dataclasses +import weakref from weakref import ReferenceType, ProxyType, CallableProxyType from collections import OrderedDict -from functools import partial +from functools import partial, wraps from operator import itemgetter, attrgetter GENERATOR_FAIL = False import importlib.machinery -EXTENSION_SUFFIXES = tuple(importlib.machinery.EXTENSION_SUFFIXES) try: import ctypes HAS_CTYPES = True @@ -163,12 +163,12 @@ def get_file_type(*args, **kwargs): try: IS_IPYTHON = __IPYTHON__ # is True ExitType = None # IPython.core.autocall.ExitAutocall - singletontypes = ['exit', 'quit', 'get_ipython'] + ipython_singletons = ('exit', 'quit', 'get_ipython') except NameError: IS_IPYTHON = 
False try: ExitType = type(exit) # apparently 'exit' can be removed except NameError: ExitType = None - singletontypes = [] + ipython_singletons = () import inspect import typing @@ -337,7 +337,6 @@ class Pickler(StockPickler): _refonfail = False # True in session.settings _session = False _first_pass = False - _original_main = None from .settings import settings def __init__(self, file, *args, **kwds): @@ -352,7 +351,7 @@ def __init__(self, file, *args, **kwds): self._file_tell = getattr(file, 'tell', None) # for logger and refonfail super().__init__(file, *args, **kwds) - def save(self, obj, save_persistent_id=True, *, name=None): + def save(self, obj, save_persistent_id=True): # This method overrides StockPickler.save() and is called for every # object pickled. When 'refonfail' is True, it tries to save the object # by reference if pickling it fails with a common pickling error, as @@ -414,7 +413,8 @@ def save_numpy_array(pickler, obj): super().save(obj, save_persistent_id) return - # Save with 'refonfail'. + ## Save with 'refonfail' ## + # Disable framing (right after the framer.init_framing() call at dump()). self.framer.current_frame = None # Store initial state. @@ -422,18 +422,7 @@ def save_numpy_array(pickler, obj): memo_size = len(self.memo) try: super().save(obj, save_persistent_id) - except (AttributeError, *UNPICKLEABLE_ERRORS) as error_stack: - # AttributeError may happen in the save_global() call from a child object. - if type(error_stack) == AttributeError \ - and "no attribute '__name__'" not in error_stack.args[0]: - raise - if self._session and obj is self._main: - warnings.warn( - "module %r being saved by reference due to unpickleable" - " objects in its namespace" % self._main.__name__, - PicklingWarning, - stacklevel=5, - ) + except UNPICKLEABLE_ERRORS as error_stack: message = ( "# X: fallback to save as global: <%s object at %#012x>" % (type(obj).__name__, id(obj)) @@ -444,28 +433,35 @@ def save_numpy_array(pickler, obj): # Roll back memo. 
for _ in range(len(self.memo) - memo_size): self.memo.popitem() # LIFO order is guaranteed since 3.7 - # Try to save object by reference. - if isinstance(obj, ModuleType) and \ - (_is_builtin_module(obj) or obj is sys.modules['dill']): + # Handle modules specially. + if self._session and obj is self._main: + if not _is_builtin_module(self._main): + raise self.save_reduce(_import_module, (obj.__name__,), obj=obj) logger.trace(self, message, obj=obj) - return - if self._session: - if name is None and not (hasattr(obj, '__name__') or hasattr(obj, '__qualname__')): - name = self._id_to_name.get(id(obj)) - if name is not None and self._main.__name__ not in {'__main__', '__main_mp__'}: - self.save_reduce(getattr, (self._main, name), obj=obj) - logger.trace(self, message, obj=obj) - return - try: - self.save_global(obj, name) + warnings.warn( + "module %r saved by reference due to the unpickleable " + "variable %r" % (self._main.__name__, error_stack.name), + PicklingWarning, + stacklevel=5, + ) + elif (isinstance(obj, ModuleType) + and (_is_builtin_module(obj) or obj is sys.modules['dill'])): + self.save_reduce(_import_module, (obj.__name__,), obj=obj) logger.trace(self, message, obj=obj) - except (AttributeError, PicklingError) as error: - if getattr(self, '_trace_stack', None) and id(obj) == self._trace_stack[-1]: + # Try to save object by reference. + elif hasattr(obj, '__name__') or hasattr(obj, '__qualname__'): + try: + self.save_global(obj) + logger.trace(self, message, obj=obj) + except PicklingError as error: # Roll back trace state. - self._trace_stack.pop() - self._size_stack.pop() - raise error from error_stack + logger.roll_back(self, obj) + raise error from error_stack + else: + # Roll back trace state. 
+ logger.roll_back(self, obj) + raise return save.__doc__ = StockPickler.save.__doc__ @@ -1235,61 +1231,144 @@ def save_code(pickler, obj): logger.trace(pickler, "# Co") return +def _module_map(main_module): + """get map of imported modules""" + from collections import defaultdict + from types import SimpleNamespace + modmap = SimpleNamespace( + by_name=defaultdict(list), + by_id=defaultdict(list), + top_level={}, # top-level modules + package = _module_package(main_module), + ) + for modname, module in sys.modules.items(): + if (modname in ('__main__', '__mp_main__') or module is main_module + or not isinstance(module, ModuleType)): + continue + if '.' not in modname: + modmap.top_level[id(module)] = modname + for objname, modobj in module.__dict__.items(): + modmap.by_name[objname].append((modobj, modname)) + modmap.by_id[id(modobj)].append((objname, modname)) + return modmap + +def _lookup_module(modmap, name, obj, lookup_by_id=True) -> typing.Tuple[str, str, bool]: + """Lookup name or id of obj if module is imported. + + Lookup for objects identical to 'obj' at modules in 'modmpap'. If multiple + copies are found in different modules, return the one from the module with + higher probability of being available at unpickling time, according to the + hierarchy: + + 1. Standard Library modules + 2. modules of the same package as the module being saved (if it's part of a module) + 3. installed modules in general + 4. non-installed modules + + Returns: + A 3-tuple containing the module's name, the object's name in the module, + and a boolean flag, which is `True` if the module falls under categories + (1) to (3) from the hierarchy, or `False` if it's in category (4). 
+ """ + for map, by_id in [(modmap.by_name, False), (modmap.by_id, True)]: + if by_id and not lookup_by_id: + break + _2nd_choice = _3rd_choice = _4th_choice = None + key = id(obj) if by_id else name + for other, modname in map[key]: + if by_id or other is obj: + other_module = sys.modules[modname] + other_name = other if by_id else name + # Prefer modules imported earlier (first found). + if _is_stdlib_module(other_module): + return modname, other_name, True + elif modmap.package and modmap.package == _module_package(other_module): + if _2nd_choice: continue + _2nd_choice = modname, other_name, True + elif not _2nd_choice: + # Don't call _is_builtin_module() unnecessarily. + if _is_builtin_module(other_module): + if _3rd_choice: continue + _3rd_choice = modname, other_name, True + else: + if _4th_choice: continue + _4th_choice = modname, other_name, False # unsafe + found = _2nd_choice or _3rd_choice or _4th_choice + if found: + return found + return None, None, None + +def _global_string(modname, name): + return GLOBAL + bytes('%s\n%s\n' % (modname, name), 'UTF-8') + +def _save_module_dict(pickler, obj): + # If an object doesn't have a '__name__' attribute, pass the object's name + # in the module's namespace to save(), so that it can be used with + # save_global() to increase the chances of finding the object for saving + # it by reference in the event of a failed serialization. 
+ main = getattr(pickler, '_original_main', pickler._main) + modmap = getattr(pickler, '_modmap', None) # cached from _stash_modules() + is_builtin = _is_builtin_module(main) + pickler.write(MARK + DICT) # don't need to memoize + for name, value in obj.items(): + pickler.save(name) + try: + pickler.save(value) + except UNPICKLEABLE_ERRORS as error_stack: + if modmap is None: + modmap = _module_map(main) + modname, objname, installed = _lookup_module(modmap, name, value) + if modname and (installed or not is_builtin): + pickler.write(_global_string(modname, objname)) + elif is_builtin: + pickler.write(_global_string(main.__name__, name)) + else: + error = PicklingError("can't save variable %r as global" % name) + error.name = name + raise error from error_stack + pickler.memoize(value) + pickler.write(SETITEM) + def _repr_dict(obj): """make a short string representation of a dictionary""" return "<%s object at %#012x>" % (type(obj).__name__, id(obj)) @register(dict) def save_module_dict(pickler, obj): - pickler_is_dill = is_dill(pickler, child=False) - if pickler_is_dill and obj == pickler._main.__dict__ and \ - not (pickler._session and pickler._first_pass): + is_pickler_dill = is_dill(pickler, child=False) + if (is_pickler_dill + and obj is pickler._main.__dict__ + and not (pickler._session and pickler._first_pass)): logger.trace(pickler, "D1: %s", _repr_dict(obj), obj=obj) pickler.write(bytes('c__builtin__\n__main__\n', 'UTF-8')) logger.trace(pickler, "# D1") - elif (not pickler_is_dill) and (obj == _main_module.__dict__): + elif not is_pickler_dill and obj is _main_module.__dict__: logger.trace(pickler, "D3: %s", _repr_dict(obj), obj=obj) pickler.write(bytes('c__main__\n__dict__\n', 'UTF-8')) #XXX: works in general? 
logger.trace(pickler, "# D3") - elif '__name__' in obj and obj != _main_module.__dict__ \ - and type(obj['__name__']) is str \ - and obj is getattr(_import_module(obj['__name__'],True), '__dict__', None): + elif (is_pickler_dill + and pickler._session + and pickler._refonfail + and obj is pickler._main_dict_copy): + logger.trace(pickler, "D5: %s", _repr_dict(obj), obj=obj) + # we only care about session the first pass thru + pickler.first_pass = False + _save_module_dict(pickler, obj) + logger.trace(pickler, "# D3") + elif ('__name__' in obj + and obj is not _main_module.__dict__ + and type(obj['__name__']) is str + and obj is getattr(_import_module(obj['__name__'], safe=True), '__dict__', None)): logger.trace(pickler, "D4: %s", _repr_dict(obj), obj=obj) pickler.write(bytes('c%s\n__dict__\n' % obj['__name__'], 'UTF-8')) logger.trace(pickler, "# D4") - elif not (pickler_is_dill and pickler._session and pickler._first_pass and pickler._refonfail): - # we only care about session the first pass thru - if pickler_is_dill and pickler._first_pass: - pickler._first_pass = False + else: logger.trace(pickler, "D2: %s", _repr_dict(obj), obj=obj) + if is_pickler_dill: + # we only care about session the first pass thru + pickler._first_pass = False StockPickler.save_dict(pickler, obj) logger.trace(pickler, "# D2") - else: - # If an object doesn't have a '__name__' attribute, pass the object's name - # in the module's namespace to save(), so that it can be used with - # save_global() to increase the chances of finding the object for saving - # it by reference in the event of a failed serialization. - pickler._first_pass = False - logger.trace(pickler, "D5: %s", _repr_dict(obj), obj=obj) - # Modified from Python Standard Library's pickle._Pickler.save_dict() - # and pickle._Pickler._batch_setitems(). Summary of changes: use - # 'SETITEM' for all pickle protocols and conditionally pass an extra - # argument to a custom implementation of the method 'save'. 
- # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved - # License Agreement: https://opensource.org/licenses/Python-2.0 - if pickler.bin: - pickler.write(EMPTY_DICT) - else: # proto 0 -- can't use EMPTY_DICT - pickler.write(MARK + DICT) - pickler.memoize(obj) - for k, v in obj.items(): - pickler.save(k) - if hasattr(v, '__name__') or hasattr(v, '__qualname__'): - pickler.save(v) - else: - pickler.save(v, name=k) - pickler.write(SETITEM) - logger.trace(pickler, "# D5") return @@ -1688,19 +1767,70 @@ def save_weakproxy(pickler, obj): logger.trace(pickler, "# R2") return +def _weak_cache(func=None, *, defaults=None): + if defaults is None: + defaults = {} + if func is None: + return partial(_weak_cache, defaults=defaults) + cache = weakref.WeakKeyDictionary() + @wraps(func) + def wrapper(referent): + try: + return defaults[referent] + except KeyError: + try: + return cache[referent] + except KeyError: + value = func(referent) + cache[referent] = value + return value + return wrapper + +PYTHONPATH_PREFIXES = {getattr(sys, attr) for attr in ( + 'base_prefix', 'prefix', 'base_exec_prefix', 'exec_prefix', + 'real_prefix', # for old virtualenv versions + ) if hasattr(sys, attr)} +PYTHONPATH_PREFIXES = tuple(os.path.realpath(path) for path in PYTHONPATH_PREFIXES) +EXTENSION_SUFFIXES = tuple(importlib.machinery.EXTENSION_SUFFIXES) +if OLD310: + STDLIB_PREFIX = os.path.dirname(os.path.realpath(os.__file__)) + +@_weak_cache(defaults={None: True}) def _is_builtin_module(module): - if not hasattr(module, "__file__"): return True + if module.__name__ in ('__main__', '__mp_main__'): + return False + mod_path = getattr(module, '__file__', None) + if not mod_path: + return True # If a module file name starts with prefix, it should be a builtin # module, so should always be pickled as a reference. 
- names = ["base_prefix", "base_exec_prefix", "exec_prefix", "prefix", "real_prefix"] - return any(os.path.realpath(module.__file__).startswith(os.path.realpath(getattr(sys, name))) - for name in names if hasattr(sys, name)) or \ - module.__file__.endswith(EXTENSION_SUFFIXES) or \ - 'site-packages' in module.__file__ + mod_path = os.path.realpath(mod_path) + return ( + any(mod_path.startswith(prefix) for prefix in PYTHONPATH_PREFIXES) + or mod_path.endswith(EXTENSION_SUFFIXES) + or 'site-packages' in mod_path + ) + +@_weak_cache(defaults={None: False}) +def _is_stdlib_module(module): + first_level = module.__name__.partition('.')[0] + if OLD310: + if first_level in sys.builtin_module_names: + return True + mod_path = getattr(module, '__file__', '') + if mod_path: + mod_path = os.path.realpath(mod_path) + return mod_path.startswith(STDLIB_PREFIX) + else: + return first_level in sys.stdlib_module_names def _is_imported_module(module): return getattr(module, '__loader__', None) is not None or module in sys.modules.values() +def _module_package(module): + package = getattr(module, '__package__', None) + return package.partition('.')[0] if package else None + @register(ModuleType) def save_module(pickler, obj): if False: #_use_diff: @@ -1722,15 +1852,22 @@ def save_module(pickler, obj): logger.trace(pickler, "# M1") else: builtin_mod = _is_builtin_module(obj) - if obj.__name__ not in ("builtins", "dill", "dill._dill") and not builtin_mod or \ - is_dill(pickler, child=True) and obj is pickler._main: + is_session_main = is_dill(pickler, child=True) and obj is pickler._main + if (obj.__name__ not in ("builtins", "dill", "dill._dill") and not builtin_mod + or is_session_main): logger.trace(pickler, "M1: %s", obj) - _main_dict = obj.__dict__.copy() #XXX: better no copy? option to copy? - [_main_dict.pop(item, None) for item in singletontypes - + ["__builtins__", "__loader__"]] + # Hack for handling module-type objects in load_module(). 
mod_name = obj.__name__ if _is_imported_module(obj) else '__runtime__.%s' % obj.__name__ - pickler.save_reduce(_import_module, (mod_name,), obj=obj, - state=_main_dict) + # Second references are saved as __builtin__.__main__ in save_module_dict(). + main_dict = obj.__dict__.copy() + for item in ('__builtins__', '__loader__'): + main_dict.pop(item, None) + for item in ipython_singletons: + if getattr(item, '__module__', '').startswith('IPython'): + main_dict.pop(item, None) + if is_session_main: + pickler._main_dict_copy = main_dict + pickler.save_reduce(_import_module, (mod_name,), obj=obj, state=main_dict) logger.trace(pickler, "# M1") elif obj.__name__ == "dill._dill": logger.trace(pickler, "M2: %s", obj) @@ -1740,7 +1877,6 @@ def save_module(pickler, obj): logger.trace(pickler, "M2: %s", obj) pickler.save_reduce(_import_module, (obj.__name__,), obj=obj) logger.trace(pickler, "# M2") - return return @register(TypeType) diff --git a/dill/logger.py b/dill/logger.py index 0e7ed4a5..d27787d8 100644 --- a/dill/logger.py +++ b/dill/logger.py @@ -129,7 +129,7 @@ def trace_setup(self, pickler): # Called by Pickler.dump(). 
if not dill._dill.is_dill(pickler, child=False): return - if self.isEnabledFor(logging.INFO): + elif self.isEnabledFor(logging.INFO): pickler._trace_stack = [] pickler._size_stack = [] else: @@ -138,7 +138,7 @@ def trace(self, pickler, msg, *args, obj=None, **kwargs): if not hasattr(pickler, '_trace_stack'): logger.info(msg, *args, **kwargs) return - if pickler._trace_stack is None: + elif pickler._trace_stack is None: return extra = kwargs.get('extra', {}) pushed_obj = msg.startswith('#') @@ -169,6 +169,10 @@ def trace(self, pickler, msg, *args, obj=None, **kwargs): self.info(msg, *args, **kwargs) if pushed_obj: pickler._trace_stack.pop() + def roll_back(self, pickler, obj): + if pickler._trace_stack and id(obj) == pickler._trace_stack[-1]: + pickler._trace_stack.pop() + pickler._size_stack.pop() class TraceFormatter(logging.Formatter): """ diff --git a/dill/session.py b/dill/session.py index 2378e16f..895eaeb7 100644 --- a/dill/session.py +++ b/dill/session.py @@ -50,7 +50,7 @@ from ._dill import ( BuiltinMethodType, FunctionType, MethodType, ModuleType, TypeType, _getopt, _import_module, _is_builtin_module, _is_imported_module, - _main_module, _reverse_typemap, __builtin__, + _lookup_module, _main_module, _module_map, _reverse_typemap, __builtin__, ) from ._utils import FilterRules, FilterSet, _open, size_filter, EXCLUDE, INCLUDE @@ -63,25 +63,6 @@ TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) -def _module_map(): - """get map of imported modules""" - from collections import defaultdict - from types import SimpleNamespace - modmap = SimpleNamespace( - by_name=defaultdict(list), - by_id=defaultdict(list), - top_level={}, # top-level modules - ) - for modname, module in sys.modules.items(): - if modname in ('__main__', '__mp_main__') or not isinstance(module, ModuleType): - continue - if '.' 
not in modname: - modmap.top_level[id(module)] = modname - for objname, modobj in module.__dict__.items(): - modmap.by_name[objname].append((modobj, modname)) - modmap.by_id[id(modobj)].append((modobj, objname, modname)) - return modmap - # Unique objects (with no duplicates) that may be imported with "import as". IMPORTED_AS_TYPES = (ModuleType, TypeType, FunctionType, MethodType, BuiltinMethodType) if 'PyCapsuleType' in _reverse_typemap: @@ -93,42 +74,31 @@ def _module_map(): r'concurrent\.futures(\.\w+)?', r'multiprocessing(\.\w+)?' )] -def _lookup_module(modmap, name, obj, main_module): - """lookup name or id of obj if module is imported""" - for modobj, modname in modmap.by_name[name]: - if modobj is obj and sys.modules[modname] is not main_module: - return modname, name - __module__ = getattr(obj, '__module__', None) - if isinstance(obj, IMPORTED_AS_TYPES) or (__module__ is not None - and any(regex.fullmatch(__module__) for regex in IMPORTED_AS_MODULES)): - for modobj, objname, modname in modmap.by_id[id(obj)]: - if sys.modules[modname] is not main_module: - return modname, objname - return None, None - BUILTIN_CONSTANTS = (None, False, True, NotImplemented) -def _stash_modules(main_module, original_main): +def _stash_modules(main_module): """pop imported variables to be saved by reference in the __dill_imported* attributes""" - modmap = _module_map() + modmap = _module_map(main_module) newmod = ModuleType(main_module.__name__) - + original = {} imported = [] imported_as = [] imported_top_level = [] # keep separated for backward compatibility - original = {} + for name, obj in main_module.__dict__.items(): - # Self-references. - if obj is main_module: - original[name] = newmod - elif obj is main_module.__dict__: - original[name] = newmod.__dict__ # Avoid incorrectly matching a singleton value in another package (e.g. __doc__ == None). 
- elif (any(obj is constant for constant in BUILTIN_CONSTANTS) # must compare by identity - or isinstance(obj, ModuleType) and _is_builtin_module(obj)): # always saved by ref + if (any(obj is constant for constant in BUILTIN_CONSTANTS) # must compare by identity + or isinstance(obj, ModuleType) and _is_builtin_module(obj) # always saved by ref + or obj is main_module or obj is main_module.__dict__): original[name] = obj else: - source_module, objname = _lookup_module(modmap, name, obj, main_module=original_main) + modname = getattr(obj, '__module__', None) + lookup_by_id = ( + isinstance(obj, IMPORTED_AS_TYPES) + or modname is not None + and any(regex.fullmatch(modname) for regex in IMPORTED_AS_MODULES) + ) + source_module, objname, _ = _lookup_module(modmap, name, obj, lookup_by_id) if source_module is not None: if objname == name: imported.append((source_module, name)) @@ -145,9 +115,9 @@ def _stash_modules(main_module, original_main): newmod.__dill_imported = imported newmod.__dill_imported_as = imported_as newmod.__dill_imported_top_level = imported_top_level - return newmod + return newmod, modmap else: - return main_module + return main_module, modmap def _restore_modules(unpickler, main_module): for modname, name in main_module.__dict__.pop('__dill_imported', ()): @@ -180,13 +150,25 @@ def _filter_vars(main_module, exclude, include, base_rules): newmod = ModuleType(main_module.__name__) newmod.__dict__.update(namespace) - for name, obj in namespace.items(): - if obj is main_module: - setattr(newmod, name, newmod) - elif obj is main_module.__dict__: - setattr(newmod, name, newmod.__dict__) return newmod +def _fix_module_namespace(main, original_main): + # Self-references. + for name, obj in main.__dict__.items(): + if obj is original_main: + setattr(main, name, main) + elif obj is original_main.__dict__: + setattr(main, name, main.__dict__) + # Some empty attributes like __doc__ may have been added by ModuleType(). 
+ added_names = set(main.__dict__) + added_names.difference_update(original_main.__dict__) + added_names.difference_update('__dill_imported%s' % s for s in ('', '_as', '_top_level')) + for name in added_names: + delattr(main, name) + # Trick _is_imported_module(), forcing main to be saved as an imported module. + if getattr(main, '__loader__', None) is None and _is_imported_module(original_main): + main.__loader__ = True # will be discarded by _dill.save_module() + def dump_module( filename = str(TEMPDIR/'session.pkl'), module: Optional[Union[ModuleType, str]] = None, @@ -330,17 +312,7 @@ def dump_module( original_main = main main = _filter_vars(main, exclude, include, base_rules) if refimported: - main = _stash_modules(main, original_main) - if main is not original_main: - # Some empty attributes like __doc__ may have been added by ModuleType(). - added_names = set(main.__dict__) - added_names.difference_update(original_main.__dict__) - added_names.difference_update('__dill_imported%s' % s for s in ('', '_as', '_top_level')) - for name in added_names: - delattr(main, name) - if getattr(main, '__loader__', None) is None and _is_imported_module(original_main): - # Trick _is_imported_module() to force saving this as an imported module. - main.__loader__ = True # will be discarded by _dill.save_module() + main, modmap = _stash_modules(main) with _open(filename, 'wb', seekable=True) as file: pickler = Pickler(file, protocol, **kwds) pickler._main = main #FIXME: dill.settings are disabled @@ -350,11 +322,14 @@ def dump_module( pickler._first_pass = True if main is not original_main: pickler._original_main = original_main + _fix_module_namespace(main, original_main) if refonfail: pickler._refonfail = True # False by default pickler._file_seek = file.seek pickler._file_truncate = file.truncate - pickler._id_to_name = {id(v): k for k, v in main.__dict__.items()} + if refimported: + # Cache modmap for refonfail. 
+ pickler._modmap = modmap pickler.dump(main) return @@ -991,7 +966,7 @@ def not_interactive_var(obj: NamedObject) -> bool: # Internal exports for backward compatibility with dill v0.3.5.1 for name in ( - '_lookup_module', '_module_map', '_restore_modules', '_stash_modules', + '_restore_modules', '_stash_modules', 'dump_session', 'load_session' # backward compatibility functions ): setattr(_dill, name, globals()[name]) diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index 3cc14c7d..deba0ec4 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -93,6 +93,7 @@ def weekdays(self): return [day_name[i] for i in self.iterweekdays()] cal = CalendarSubclass() selfref = __main__ +self_dict = __main__.__dict__ # Setup global namespace for session saving tests. class TestNamespace: @@ -122,7 +123,7 @@ def _clean_up_cache(module): def _test_objects(main, globals_copy, refimported): try: main_dict = __main__.__dict__ - global Person, person, Calendar, CalendarSubclass, cal, selfref + global Person, person, Calendar, CalendarSubclass, cal, selfref, self_dict for obj in ('json', 'url', 'local_mod', 'sax', 'dom'): assert globals()[obj].__name__ == globals_copy[obj].__name__ @@ -143,6 +144,7 @@ def _test_objects(main, globals_copy, refimported): assert cal.weekdays() == globals_copy['cal'].weekdays() assert selfref is __main__ + assert self_dict is __main__.__dict__ except AssertionError as error: error.args = (_error_line(obj, refimported),) @@ -198,7 +200,7 @@ def test_runtime_module(): runtime = ModuleType(modname) runtime.x = 42 - mod = dill.session._stash_modules(runtime, runtime) + mod, _ = dill.session._stash_modules(runtime) if mod is not runtime: print("There are objects to save by referenece that shouldn't be:", mod.__dill_imported, mod.__dill_imported_as, mod.__dill_imported_top_level, @@ -226,6 +228,47 @@ def test_runtime_module(): assert runtime.x == 42 assert runtime not in sys.modules.values() +def test_lookup_module(): + 
assert not dill._dill._is_builtin_module(local_mod) and local_mod.__package__ == '' + + def lookup(mod, name, obj, lookup_by_name=True): + from dill._dill import _lookup_module, _module_map + return _lookup_module(_module_map(mod), name, obj, lookup_by_name) + + name = '__test_obj' + obj = object() + setattr(dill, name, obj) + assert lookup(dill, name, obj) == (None, None, None) + + # 4th level: non-installed module + setattr(local_mod, name, obj) + sys.modules[local_mod.__name__] = sys.modules.pop(local_mod.__name__) # put at the end + assert lookup(dill, name, obj) == (local_mod.__name__, name, False) # not installed + try: + import pox + # 3rd level: installed third-party module + setattr(pox, name, obj) + sys.modules['pox'] = sys.modules.pop('pox') + assert lookup(dill, name, obj) == ('pox', name, True) + except ModuleNotFoundError: + pass + # 2nd level: module of same package + setattr(dill.session, name, obj) + sys.modules['dill.session'] = sys.modules.pop('dill.session') + assert lookup(dill, name, obj) == ('dill.session', name, True) + # 1st level: stdlib module + setattr(os, name, obj) + sys.modules['os'] = sys.modules.pop('os') + assert lookup(dill, name, obj) == ('os', name, True) + + # Lookup by id. 
+ name2 = name + '2' + setattr(dill, name2, obj) + assert lookup(dill, name2, obj) == ('os', name, True) + assert lookup(dill, name2, obj, lookup_by_name=False) == (None, None, None) + setattr(local_mod, name2, obj) + assert lookup(dill, name2, obj) == (local_mod.__name__, name2, False) + def test_refimported_imported_as(): import collections import concurrent.futures @@ -250,6 +293,51 @@ def test_refimported_imported_as(): ('dill', 'executor', 'thread_exec'), } +def test_refonfail_unpickleable(): + global local_mod + import keyword as builtin_mod + from dill._dill import _global_string + dill.session.settings['refonfail'] = True + name = '__test_obj' + obj = memoryview(b'') + assert dill._dill._is_builtin_module(builtin_mod) + assert not dill._dill._is_builtin_module(local_mod) + # assert not dill.pickles(obj) + try: + dill.dumps(obj) + except dill._dill.UNPICKLEABLE_ERRORS: + pass + else: + raise Exception("test object should be unpickleable") + + def dump_with_ref(mod, other_mod): + setattr(other_mod, name, obj) + buf = BytesIO() + dill.dump_module(buf, mod) + return buf.getvalue() + + # "user" modules + _local_mod = local_mod + del local_mod # remove from __main__'s namespace + try: + dump_with_ref(__main__, __main__) + except dill.PicklingError: + pass # success + else: + raise Exception("saving with a reference to the module itself should fail for '__main__'") + assert _global_string(_local_mod.__name__, name) in dump_with_ref(__main__, _local_mod) + assert _global_string('os', name) in dump_with_ref(__main__, os) + local_mod = _local_mod + del _local_mod, __main__.__test_obj, local_mod.__test_obj, os.__test_obj + + # "builtin" or "installed" modules + assert _global_string(builtin_mod.__name__, name) in dump_with_ref(builtin_mod, builtin_mod) + assert _global_string(builtin_mod.__name__, name) in dump_with_ref(builtin_mod, local_mod) + assert _global_string('os', name) in dump_with_ref(builtin_mod, os) + del builtin_mod.__test_obj, local_mod.__test_obj, 
os.__test_obj + + dill.reset_settings() + def test_load_module_asdict(): with TestNamespace(): session_buffer = BytesIO() @@ -303,6 +391,8 @@ def namespace_matches(keep_history, should_keep_vars): test_session_main(refimported=True) test_session_other() test_runtime_module() + test_lookup_module() test_refimported_imported_as() + test_refonfail_unpickleable() test_load_module_asdict() test_ipython_filter() From d5fd37ee549d8ac9b1a51a2c15380c0763a68859 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sun, 21 Aug 2022 22:49:27 -0300 Subject: [PATCH 070/109] small optimization for opcode strings --- dill/_dill.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 07f48e91..313150c3 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -39,7 +39,7 @@ #XXX: get types from .objtypes ? import builtins as __builtin__ from pickle import _Pickler as StockPickler, Unpickler as StockUnpickler -from pickle import DICT, EMPTY_DICT, GLOBAL, MARK, SETITEM +from pickle import DICT, EMPTY_DICT, GLOBAL, MARK, POP, SETITEM from _thread import LockType from _thread import RLock as RLockType #from io import IOBase @@ -1162,7 +1162,7 @@ def _save_with_postproc(pickler, reduction, is_pickler_dill=None, obj=Getattr.NO else: pickler.save_reduce(*reduction) # pop None created by calling preprocessing step off stack - pickler.write(bytes('0', 'UTF-8')) + pickler.write(POP) #@register(CodeType) #def save_code(pickler, obj): @@ -1340,11 +1340,11 @@ def save_module_dict(pickler, obj): and obj is pickler._main.__dict__ and not (pickler._session and pickler._first_pass)): logger.trace(pickler, "D1: %s", _repr_dict(obj), obj=obj) - pickler.write(bytes('c__builtin__\n__main__\n', 'UTF-8')) + pickler.write(GLOBAL + b'__builtin__\n__main__\n') logger.trace(pickler, "# D1") elif not is_pickler_dill and obj is _main_module.__dict__: logger.trace(pickler, "D3: %s", _repr_dict(obj), obj=obj) - pickler.write(bytes('c__main__\n__dict__\n', 
'UTF-8')) #XXX: works in general? + pickler.write(GLOBAL + b'__main__\n__dict__\n') #XXX: works in general? logger.trace(pickler, "# D3") elif (is_pickler_dill and pickler._session @@ -1360,7 +1360,7 @@ def save_module_dict(pickler, obj): and type(obj['__name__']) is str and obj is getattr(_import_module(obj['__name__'], safe=True), '__dict__', None)): logger.trace(pickler, "D4: %s", _repr_dict(obj), obj=obj) - pickler.write(bytes('c%s\n__dict__\n' % obj['__name__'], 'UTF-8')) + pickler.write(_global_string(obj['__name__'], '__dict__')) logger.trace(pickler, "# D4") else: logger.trace(pickler, "D2: %s", _repr_dict(obj), obj=obj) @@ -1654,7 +1654,7 @@ def save_cell(pickler, obj): # The result of this function call will be None pickler.save_reduce(_shims._delattr, (obj, 'cell_contents')) # pop None created by calling _delattr off stack - pickler.write(bytes('0', 'UTF-8')) + pickler.write(POP) logger.trace(pickler, "# Ce3") return if is_dill(pickler, child=True): @@ -1902,7 +1902,7 @@ def save_type(pickler, obj, postproc_list=None): elif obj is type(None): logger.trace(pickler, "T7: %s", obj) #XXX: pickler.save_reduce(type, (None,), obj=obj) - pickler.write(bytes('c__builtin__\nNoneType\n', 'UTF-8')) + pickler.write(GLOBAL + b'__builtin__\nNoneType\n') logger.trace(pickler, "# T7") elif obj is NotImplementedType: logger.trace(pickler, "T7: %s", obj) @@ -2080,7 +2080,7 @@ def save_function(pickler, obj): # Change the value of the cell pickler.save_reduce(*possible_postproc) # pop None created by calling preprocessing step off stack - pickler.write(bytes('0', 'UTF-8')) + pickler.write(POP) logger.trace(pickler, "# F1") else: From 58e2af9091d93d8f89dd2d419ed7037d2648a7c7 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 22 Aug 2022 09:17:59 -0300 Subject: [PATCH 071/109] minor --- dill/_dill.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 313150c3..b84c71bd 100644 --- a/dill/_dill.py +++ b/dill/_dill.py 
@@ -163,12 +163,12 @@ def get_file_type(*args, **kwargs): try: IS_IPYTHON = __IPYTHON__ # is True ExitType = None # IPython.core.autocall.ExitAutocall - ipython_singletons = ('exit', 'quit', 'get_ipython') + IPYTHON_SINGLETONS = ('exit', 'quit', 'get_ipython') except NameError: IS_IPYTHON = False try: ExitType = type(exit) # apparently 'exit' can be removed except NameError: ExitType = None - ipython_singletons = () + IPYTHON_SINGLETONS = () import inspect import typing @@ -1862,7 +1862,7 @@ def save_module(pickler, obj): main_dict = obj.__dict__.copy() for item in ('__builtins__', '__loader__'): main_dict.pop(item, None) - for item in ipython_singletons: + for item in IPYTHON_SINGLETONS: if getattr(item, '__module__', '').startswith('IPython'): main_dict.pop(item, None) if is_session_main: From 0e7e7a8e27e6905ad0ce84bd0d40f51f52763661 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 22 Aug 2022 09:38:29 -0300 Subject: [PATCH 072/109] _getopt: no more settings with composed names like 'dump_module.refonfail' --- dill/_dill.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index b84c71bd..e3af98be 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -232,7 +232,6 @@ def dump(obj, file, protocol=None, byref=None, fmode=None, recurse=None, **kwds) """ from .settings import settings protocol = int(_getopt(settings, 'protocol', protocol)) - kwds = kwds.copy() kwds.update(byref=byref, fmode=fmode, recurse=recurse) Pickler(file, protocol, **kwds).dump(obj) return @@ -320,14 +319,25 @@ class UnpicklingWarning(PickleWarning, UnpicklingError): pass def _getopt(settings, key, arg=None, *, kwds=None): + """Get option from 'kwds' or named 'arg', falling back to settings. 
+ + Examples: + + # With an explicit named argument: + protocol = int(_getopt(settings, 'protocol', protocol)) + + # With a named argument in **kwds: + self._byref = _getopt(settings, 'byref', kwds=kwds) + + """ + # Sanity check, it's a bug in calling code if False. + assert kwds is None or arg is None if kwds is not None: arg = kwds.pop(key, None) if arg is not None: return arg - while '.' in key: - prefix, _, key = key.partition('.') - settings = settings[prefix] - return settings[key] + else: + return settings[key] ### Extend the Picklers class Pickler(StockPickler): From 3dce5b873a59f0ddc3e3a2f89576514ad5ff21a0 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 22 Aug 2022 09:52:43 -0300 Subject: [PATCH 073/109] use StockPickler instead of super() by now --- dill/_dill.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index e3af98be..c34dd1ad 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -359,7 +359,7 @@ def __init__(self, file, *args, **kwds): self._strictio = False #_getopt(settings, 'strictio', kwds=kwds) self._postproc = OrderedDict() self._file_tell = getattr(file, 'tell', None) # for logger and refonfail - super().__init__(file, *args, **kwds) + StockPickler.__init__(self, file, *args, **kwds) def save(self, obj, save_persistent_id=True): # This method overrides StockPickler.save() and is called for every @@ -420,7 +420,7 @@ def save_numpy_array(pickler, obj): raise PicklingError(msg) if not self._refonfail: - super().save(obj, save_persistent_id) + StockPickler.save(self, obj, save_persistent_id) return ## Save with 'refonfail' ## @@ -431,7 +431,7 @@ def save_numpy_array(pickler, obj): position = self._file_tell() memo_size = len(self.memo) try: - super().save(obj, save_persistent_id) + StockPickler.save(obj, save_persistent_id) except UNPICKLEABLE_ERRORS as error_stack: message = ( "# X: fallback to save as global: <%s object at %#012x>" From dc9c758f2e2e96cf195c9841f6734903292902dc Mon Sep
17 00:00:00 2001 From: Leonardo Gama Date: Mon, 22 Aug 2022 10:19:04 -0300 Subject: [PATCH 074/109] minor fix --- dill/_dill.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dill/_dill.py b/dill/_dill.py index c34dd1ad..3540b91a 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -1364,7 +1364,7 @@ def save_module_dict(pickler, obj): # we only care about session the first pass thru pickler.first_pass = False _save_module_dict(pickler, obj) - logger.trace(pickler, "# D3") + logger.trace(pickler, "# D5") elif ('__name__' in obj and obj is not _main_module.__dict__ and type(obj['__name__']) is str From 8cf5949b5ef6927f35cc249da11931ad0db70062 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 22 Aug 2022 13:41:45 -0300 Subject: [PATCH 075/109] TraceAdapter: check trace() arguments and complement documentation --- dill/_dill.py | 2 +- dill/logger.py | 29 +++++++++++++++++++++++------ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 3540b91a..0d4535e4 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -431,7 +431,7 @@ def save_numpy_array(pickler, obj): position = self._file_tell() memo_size = len(self.memo) try: - StockPickler.save(obj, save_persistent_id) + StockPickler.save(self, obj, save_persistent_id) except UNPICKLEABLE_ERRORS as error_stack: message = ( "# X: fallback to save as global: <%s object at %#012x>" diff --git a/dill/logger.py b/dill/logger.py index d27787d8..dab0ae34 100644 --- a/dill/logger.py +++ b/dill/logger.py @@ -106,13 +106,24 @@ class TraceAdapter(logging.LoggerAdapter): creates extra values to be added in the LogRecord from it, then calls 'info()'. - Usage of logger with 'trace()' method: + Examples: - >>> from dill.logger import adapter as logger #NOTE: not dill.logger.logger - >>> ... - >>> def save_atype(pickler, obj): - >>> logger.trace(pickler, "Message with %s and %r etc. placeholders", 'text', obj) - >>> ... 
+ In the first call to `trace()`, before pickling an object, it must be passed + to `trace()` as the last positional argument or as the keyword argument + `obj`. Note how, in the second example, the object is not passed as a + positional argument, and therefore won't be substituted in the message: + + >>> from dill.logger import adapter as logger #NOTE: not dill.logger.logger + >>> ... + >>> def save_atype(pickler, obj): + >>> logger.trace(pickler, "X: Message with %s and %r placeholders", 'text', obj) + >>> ... + >>> logger.trace(pickler, "# X") + >>> def save_weakproxy(pickler, obj) + >>> trace_message = "W: This works even with a broken weakproxy: %r" % obj + >>> logger.trace(pickler, trace_message, obj=obj) + >>> ... + >>> logger.trace(pickler, "# W") """ def __init__(self, logger): self.logger = logger @@ -143,6 +154,12 @@ def trace(self, pickler, msg, *args, obj=None, **kwargs): extra = kwargs.get('extra', {}) pushed_obj = msg.startswith('#') if not pushed_obj: + if obj is None and (not args or type(args[-1]) is str): + raise TypeError( + "the pickled object must be passed as the last positional " + "argument (being substituted in the message) or as the " + "'obj' keyword argument." + ) if obj is None: obj = args[-1] pickler._trace_stack.append(id(obj)) From 41bf44a6dc469a205fd28901f1a56fda3045999e Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 22 Aug 2022 14:02:35 -0300 Subject: [PATCH 076/109] moved config file stuff to a new PR --- dill/__init__.py | 2 +- dill/session.py | 6 -- dill/settings.py | 178 +----------------------------------- dill/tests/test_session.py | 3 +- dill/tests/test_settings.py | 175 ----------------------------------- 5 files changed, 6 insertions(+), 358 deletions(-) delete mode 100644 dill/tests/test_settings.py diff --git a/dill/__init__.py b/dill/__init__.py index 3dd3ab02..3571f54e 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -37,7 +37,7 @@ from . 
import detect, logger, session, source, temp # get global settings -from .settings import settings, read_settings, reset_settings +from .settings import settings # make sure "trace" is turned off logger.trace(False) diff --git a/dill/session.py b/dill/session.py index 895eaeb7..c793fff8 100644 --- a/dill/session.py +++ b/dill/session.py @@ -887,12 +887,6 @@ def get_filters(self, rule_type: RuleType) -> FilterSet: 'filters': ModuleFilters(rules=()), } -# For read_settings(): -from .settings import DEFAULT_SETTINGS -DEFAULT_SETTINGS[__name__] = settings.copy() -del DEFAULT_SETTINGS[__name__]['filters'] -del DEFAULT_SETTINGS - ## Session filter factories ## diff --git a/dill/settings.py b/dill/settings.py index 0d51262b..1a5bde79 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -10,9 +10,9 @@ global settings for Pickler """ -__all__ = ['settings', 'read_settings', 'reset_settings'] +__all__ = ['settings'] -from pickle import DEFAULT_PROTOCOL, HIGHEST_PROTOCOL +from pickle import DEFAULT_PROTOCOL settings = { #'main' : None, @@ -24,176 +24,4 @@ 'ignore' : False, } -### Config file reader (INI format) ### - -DEFAULT_SETTINGS = { - 'dill': settings.copy(), - 'dill.session': None, # set in dill.session -} -FMODES = dict(HANDLE_FMODE=0, CONTENTS_FMODE=1, FILE_FMODE=2) -STANDARD_PROTOCOLS = dict(DEFAULT_PROTOCOL=DEFAULT_PROTOCOL, HIGHEST_PROTOCOL=HIGHEST_PROTOCOL) - -del DEFAULT_PROTOCOL, HIGHEST_PROTOCOL - -def _split_option(option, strip_quotes=False): - """Split option text by commas *or* newlines.""" - import re - if not option: return [] # empty value - option = option.strip(",\n") # strip leading/trailing commas and empty lines - option = option.splitlines() if '\n' in option else re.split(r',\s+', option) - return [x.strip("\"'") for x in option] if strip_quotes else option - -def _read_filters(section, module_filters): - from logging.config import _resolve - from dill.session import EXCLUDE, INCLUDE - for rule_type in (EXCLUDE, INCLUDE): - rule = 
rule_type.name.lower() - if not any(option.lower().startswith(rule) for option in section): - # If no filters, let it fall back to parent module's or default filters. - delattr(module_filters, rule) - continue - for option in (rule, '%s.names' % rule, '%s.regexes' % rule): - if option in section: - for filter in _split_option(section[option], strip_quotes=True): - module_filters.add(filter, rule_type=rule_type) - if '%s.types' % rule in section: - for name in _split_option(section['%s.types' % rule]): - if '.' in name: - filter = _resolve(name) - if not isinstance(filter, type): - raise TypeError("filter is not a type: %r" % filter) - else: - filter = 'type:%s' % name - module_filters.add(filter, rule_type=rule_type) - if '%s.funcs' % rule in section: - for code in section['%s.funcs' % rule].strip('\n').splitlines(): - code = code.strip() - globals_ = {} - if not code.startswith('lambda'): - name = code.partition('(')[0] - _resolve(name) # import required modules - top_level = name.partition('.')[0] - globals_[top_level] = __import__(top_level) - filter = eval(code, globals_) - if not callable(filter): - raise TypeError("filter is not callable: %r" % filter) - module_filters.add(filter, rule_type=rule_type) - -def read_settings(filename) -> None: - """Read dill settings from an INI file. - - Update the ``dill.settings`` dictionary with the contents of the INI file - ``filename``. Accepted file sections: - - - `dill`: general :py:mod:`dill` settings - - `dill.module`: settings for :py:func:`dill.dump_module` - - `filters`: default exclude/include filters for :py:func:`dill.dump_module` - - `filters.`: module-specific filters for - :py:func:`dill.dump_module`, where `` is the complete module - path in the form `module[.submodule...]` - - Accepted option values for general settings: - - - boolean options (case insensitive): False, True - - `protocol`: DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, 0, 1, 2, 3, ... 
- - `fmode`: HANDLE_FMODE, 0, CONTENTS_FMODE, 1, FILE_FMODE, 2 - - Example of a settings file: - - .. code-block:: pacmanconf - - [dill] - # General settings - ## Stored in dill.settings. - protocol = HIGHEST_PROTOCOL - byref = yes - - [dill.session] - # Settings for dill.dump_module() - ## Stored in dill.session.settings. - refonfail = no - - [filters] - # Default exclude/include filters for dill.dump_module() - ## Stored in dill.settings['dump_module']['filters']. - exclude.names = some_var, SomeClass - exclude.regexes = '_.+' - exclude.types = function, ModuleType, io.BytesIO - exclude.funcs = - lambda obj: type(obj.value) == int - dill.session.size_filter('10 KB') - include = _keep_this_var, '__.+__' - - [filters.some.module] - # Filter rules specific to the module 'some.module' - ## Reuse regex filters defined in the previous section. - ## Option 'include' is unset, will fall back to default 'include' filters. - exclude = ${filters:exclude.regexes} - #include = - - [filters.another.module] - # Filter rules specific to the module 'another.module' - ## Empty filter sets disable filtering for this module. - exclude = - include = - - Parameters: - filename: a path-like object or a readable stream. - - Tip: - The parser uses default syntax with extended interpolation enabled. - For details about the accepted INI format, see :py:mod:`configparser`. - """ - import configparser - from dill.session import ModuleFilters, settings as session_settings - - cp = configparser.ConfigParser( - dict_type=dict, # internal, in place of OrderedDict - empty_lines_in_values=False, # one value per line - interpolation=configparser.ExtendedInterpolation(), - ) - cp.read_dict(DEFAULT_SETTINGS) - if hasattr(filename, 'readline'): - cp.read_file(filename) - else: - cp.read(filename) - - # General settings. 
- section = cp['dill'] - new_settings = {k: section.getboolean(k) - for k, v in DEFAULT_SETTINGS['dill'].items() if type(v) is bool} - for option, named_opts in [('fmode', FMODES), ('protocol', STANDARD_PROTOCOLS)]: - try: - new_settings[option] = section.getint(option) - except ValueError: - new_settings[option] = named_opts[section.get(option)] - - # Session settings (for dump_module). - section = cp['dill.session'] - new_session_settings = {k: section.getboolean(k) for k in DEFAULT_SETTINGS['dill.session']} - filters = new_session_settings['filters'] = ModuleFilters(rules=()) - if 'filters' in cp: - # Default filters. - _read_filters(cp['filters'], filters) - for module, section in cp.items(): - if not module.startswith('filters.'): - continue - module = module.partition('.')[-1] - assert all(x.isidentifier() for x in module.split('.')) - filters[module] = () # instantiate ModuleFilters and FilterSets - _read_filters(section, filters[module]) - - # Update settings dictionary. - settings.clear() - settings.update(new_settings) - session_settings.clear() - session_settings.update(new_session_settings) - -def reset_settings() -> None: - "Reset all the dill settings to its default values." 
- from dill.session import ModuleFilters, settings as session_settings - settings.clear() - settings.update(DEFAULT_SETTINGS['dill']) - session_settings.clear() - session_settings.update(DEFAULT_SETTINGS['dill.session']) - session_settings['filters'] = ModuleFilters(rules=()) +del DEFAULT_PROTOCOL diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index deba0ec4..a77ee3c7 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -297,6 +297,7 @@ def test_refonfail_unpickleable(): global local_mod import keyword as builtin_mod from dill._dill import _global_string + refonfail_default = dill.session.settings['refonfail'] dill.session.settings['refonfail'] = True name = '__test_obj' obj = memoryview(b'') @@ -336,7 +337,7 @@ def dump_with_ref(mod, other_mod): assert _global_string('os', name) in dump_with_ref(builtin_mod, os) del builtin_mod.__test_obj, local_mod.__test_obj, os.__test_obj - dill.reset_settings() + dill.session.settings['refonfail'] = refonfail_default def test_load_module_asdict(): with TestNamespace(): diff --git a/dill/tests/test_settings.py b/dill/tests/test_settings.py deleted file mode 100644 index c84966f2..00000000 --- a/dill/tests/test_settings.py +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/bin/env python - -# Author: Leonardo Gama (@leogama) -# Copyright (c) 2022 The Uncertainty Quantification Foundation. -# License: 3-clause BSD. 
The full license text is available at: -# - https://github.com/uqfoundation/dill/blob/master/LICENSE - -import io -import re -import sys -import textwrap -import warnings -from pickletools import optimize -from types import ModuleType - -import dill -from dill.session import ModuleFilters, settings as session_settings -from dill.settings import DEFAULT_SETTINGS - -regex = r' +\[dill].+(?=\n +Parameters:$)' -config_demo = re.search(regex, dill.read_settings.__doc__, re.DOTALL | re.MULTILINE).group() -config_demo = textwrap.dedent(config_demo) - -def test_read_settings(): - dill.read_settings(io.StringIO(config_demo)) - - # dill general settings - dill_default = DEFAULT_SETTINGS['dill'] - assert dill.settings['recurse'] is dill_default['recurse'] # unchanged - assert dill.settings['byref'] is (not dill_default['byref']) # here and below: changed - assert dill.settings['protocol'] != dill_default['protocol'] - assert dill.settings['protocol'] == dill.HIGHEST_PROTOCOL # value passed as text - - # session settings (excluding filters) - session_default = DEFAULT_SETTINGS['dill.session'] - assert session_settings['refimported'] is session_default['refimported'] # unchanged - assert session_settings['refonfail'] is (not session_default['refonfail']) # changed - - # session default filters - filters = session_settings['filters'] - assert type(filters) is dill.session.ModuleFilters - assert filters._module == 'DEFAULT' - assert len(filters.exclude) == 8 and len(filters.include) == 2 - assert filters.exclude.regexes == {re.compile(r'_.+')} - assert io.BytesIO in filters.exclude.types - for filter in filters.exclude.funcs: # it's a set, we don't know the order - if isinstance(filter, dill._utils.size_filter): - assert filter.limit == 10000 - else: - obj1 = dill.session.NamedObject(('bool', True)) - obj2 = dill.session.NamedObject(('int', 1)) - assert filter(obj1) is False - assert filter(obj2) is True - ## include: different types of filters in the same entry. 
- assert len(filters.include.names) == len(filters.include.regexes) == 1 - - # module specific filters - assert filters['some.module']._module == 'some.module' - assert filters['some.module'].exclude.regexes == filters.exclude.regexes - assert not hasattr(filters['some.module'], 'include') # not set, fall back to parent - ## 'some': parent placeholder - assert filters['some']._module == 'some' - assert not hasattr(filters['some'], 'exclude') and not hasattr(filters['some'], 'include') - ## 'another.module': empty filters, overwrite default filters - assert len(filters['another.module'].exclude) == len(filters['another.module'].include) == 0 - -def test_reset_settings(): - dill.settings['byref'] = 'anything' - session_settings['refimported'] = 'something else' - session_settings['filters'].add('a_name') - dill.reset_settings() - assert dill.settings == DEFAULT_SETTINGS['dill'] - settings_copy = session_settings.copy() - del settings_copy['filters'] - assert settings_copy == DEFAULT_SETTINGS['dill.session'] - assert session_settings['filters'] == ModuleFilters(rules=()) - -class Test: - def __init__(self): - pass - -def test_settings(): - # byref and recurse - for option in ('byref', 'recurse'): - dill.reset_settings() - NON_DEFAULT = not dill.settings[option] - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - pickle1 = dill.dumps(Test) # default - pickle2 = dill.dumps(Test, **{option: NON_DEFAULT}) - dill.settings[option] = NON_DEFAULT - try: - assert pickle1 != pickle2 - assert dill.dumps(Test) == pickle2 - except AssertionError as error: - error.args = ("while testing option %r" % option,) - raise - - # ignore - dill.reset_settings() - NON_DEFAULT = not dill.settings['ignore'] - obj = Test() - copy1 = dill.copy(obj) # default - copy2 = dill.copy(obj, ignore=NON_DEFAULT) - dill.settings['ignore'] = NON_DEFAULT - copy3 = dill.copy(obj) - default_res = type(copy1) is Test - non_default_res = type(copy2) is Test - assert default_res is not 
non_default_res - assert (type(copy3) is Test) is non_default_res - - # protocol - # Only protocol zero doesn't have an opcode for empty tuple. - dill.reset_settings() - EMPTY_TUPLE_0 = b'(t.' - assert dill.dumps(()) != EMPTY_TUPLE_0 - dill.settings['protocol'] = 0 - assert dill.dumps(()) == EMPTY_TUPLE_0 - - # fmode - dill.reset_settings() - dill.settings['protocol'] = 0 - for fmode in (dill.HANDLE_FMODE, dill.CONTENTS_FMODE): - dill.settings['fmode'] = fmode - dump = optimize(dill.dumps(sys.stdin)) # remove momeize opcodes - assert dump.endswith(str(fmode).encode() + b'\nV\ntR.') - - # session.refimported - dill.reset_settings() - module = ModuleType('__test__') - module.BUILTIN_CONSTANTS = dill.session.BUILTIN_CONSTANTS - NON_DEFAULT = not session_settings['refimported'] - ## default - buf = io.BytesIO() - dill.dump_module(buf, module) # refimported=DEFAULT - buf.seek(0) - copy1 = dill.load_module(buf) - ## non-default - buf = io.BytesIO() - dill.dump_module(buf, module, refimported=NON_DEFAULT) - buf.seek(0) - copy2 = dill.load_module(buf) - ## non-default (settings) - session_settings['refimported'] = NON_DEFAULT - buf = io.BytesIO() - dill.dump_module(buf, module) - buf.seek(0) - copy3 = dill.load_module(buf) - ## tuple was saved by reference? 
- default_res = copy1.BUILTIN_CONSTANTS is dill.session.BUILTIN_CONSTANTS - non_default_res = copy2.BUILTIN_CONSTANTS is dill.session.BUILTIN_CONSTANTS - test_res = copy3.BUILTIN_CONSTANTS is dill.session.BUILTIN_CONSTANTS - assert default_res is not non_default_res - assert test_res is non_default_res - - # session.refonfail - dill.reset_settings() - assert session_settings['refonfail'] is True - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - - dill.dump_module(io.BytesIO(), sys) # should work - session_settings['refonfail'] = False - try: - dill.dump_module(io.BytesIO(), sys) - except Exception: - pass - else: - raise("saving 'sys' without 'refonfail' should have failed") - -if __name__ == '__main__': - if not dill._dill.IS_PYPY: - test_read_settings() - test_reset_settings() - test_settings() From 4ad152712c3f5069415a22f3ce2f5ae371b67a49 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 22 Aug 2022 14:14:04 -0300 Subject: [PATCH 077/109] restore settings.py from master --- dill/settings.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dill/settings.py b/dill/settings.py index 1a5bde79..b105d2e8 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -1,7 +1,6 @@ #!/usr/bin/env python # # Author: Mike McKerns (mmckerns @caltech and @uqfoundation) -# Author: Leonardo Gama (@leogama) # Copyright (c) 2008-2016 California Institute of Technology. # Copyright (c) 2016-2022 The Uncertainty Quantification Foundation. # License: 3-clause BSD. 
The full license text is available at: @@ -10,8 +9,6 @@ global settings for Pickler """ -__all__ = ['settings'] - from pickle import DEFAULT_PROTOCOL settings = { @@ -25,3 +22,4 @@ } del DEFAULT_PROTOCOL + From 035360b6828563fa1f27e087d7fe7f47256a0fb6 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 22 Aug 2022 16:59:46 -0300 Subject: [PATCH 078/109] update documentation of session settings --- dill/session.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/dill/session.py b/dill/session.py index c793fff8..67d7523a 100644 --- a/dill/session.py +++ b/dill/session.py @@ -190,9 +190,8 @@ def dump_module( Only a subset of the module's variables may be saved if exclusion/inclusion filters are specified. Filters apply to every variable name or value and determine if they should be saved or not. They can be set in - ``dill.settings['dump_module']['filters']`` or passed directly to the - ``exclude`` and ``include`` parameters. See :py:class:`ModuleFilters` for - details. + ``dill.session.settings['filters']`` or passed directly to the ``exclude`` + and ``include`` parameters. See :py:class:`ModuleFilters` for details. Parameters: filename: a path-like object or a writable stream. @@ -733,10 +732,10 @@ class ModuleFilters(FilterRules): exclusion/inclusion filters for specific modules and submodules. See the base class documentation to learn more about how to create and use filters. 
- This is the type of ``dill.settings['dump_module']['filters']``: + This is the type of ``dill.session.settings['filters']``: >>> import dill - >>> filters = dill.settings['dump_module']['filters'] + >>> filters = dill.session.settings['filters'] >>> filters From 2a3788fb6f9af919343454ee0bc836f8e0c473b3 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 22 Aug 2022 19:02:11 -0300 Subject: [PATCH 079/109] is_pickled_module(): distinguish between modules saved with dump() and dump_module() --- dill/_dill.py | 14 ++++++++------ dill/session.py | 9 ++++++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 0d4535e4..430dbdd4 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -447,7 +447,8 @@ def save_numpy_array(pickler, obj): if self._session and obj is self._main: if not _is_builtin_module(self._main): raise - self.save_reduce(_import_module, (obj.__name__,), obj=obj) + # Save an empty dict as state to distinguish this from modules saved with dump(). + self.save_reduce(_import_module, (obj.__name__,), obj=obj, state={}) logger.trace(self, message, obj=obj) warnings.warn( "module %r saved by reference due to the unpickleable " @@ -1796,6 +1797,10 @@ def wrapper(referent): return value return wrapper +@_weak_cache(defaults={None: False}) +def _is_imported_module(module): + return getattr(module, '__loader__', None) is not None or module in sys.modules.values() + PYTHONPATH_PREFIXES = {getattr(sys, attr) for attr in ( 'base_prefix', 'prefix', 'base_exec_prefix', 'exec_prefix', 'real_prefix', # for old virtualenv versions @@ -1805,13 +1810,13 @@ def wrapper(referent): if OLD310: STDLIB_PREFIX = os.path.dirname(os.path.realpath(os.__file__)) -@_weak_cache(defaults={None: True}) +@_weak_cache(defaults={None: True}) #XXX: shouldn't return False for None? 
def _is_builtin_module(module): if module.__name__ in ('__main__', '__mp_main__'): return False mod_path = getattr(module, '__file__', None) if not mod_path: - return True + return _is_imported_module(module) # If a module file name starts with prefix, it should be a builtin # module, so should always be pickled as a reference. mod_path = os.path.realpath(mod_path) @@ -1834,9 +1839,6 @@ def _is_stdlib_module(module): else: return first_level in sys.stdlib_module_names -def _is_imported_module(module): - return getattr(module, '__loader__', None) is not None or module in sys.modules.values() - def _module_package(module): package = getattr(module, '__package__', None) return package.partition('.')[0] if package else None diff --git a/dill/session.py b/dill/session.py index 67d7523a..5a4a53ea 100644 --- a/dill/session.py +++ b/dill/session.py @@ -357,8 +357,8 @@ def _identify_module(file, main=None): module_name = arg if not ( next(opcodes)[0] in ('TUPLE1', 'TUPLE') and - next(opcodes)[0] == 'REDUCE' #and - #next(opcodes)[0] in ('EMPTY_DICT', 'DICT') + next(opcodes)[0] == 'REDUCE' and + next(opcodes)[0] in ('EMPTY_DICT', 'DICT') ): raise ValueError return module_name @@ -372,7 +372,10 @@ def _identify_module(file, main=None): raise UnpicklingError("unable to identify module") from error def is_pickled_module(filename, importable: bool = True) -> bool: - """Check if a file is a pickle file readable by :py:func:`load_module`. + """Check if a file can be loaded with :py:func:`load_module`. + + Check if the file is a pickle file generated with :py:func:`dump_module`, + and thus can be loaded with :py:func:`load_module`. Parameters: filename: a path-like object or a readable stream. 
From 106e28376364b0f458aff80f47b67fde8df1132c Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 22 Aug 2022 20:01:32 -0300 Subject: [PATCH 080/109] tests for is_pickled_module() --- dill/_dill.py | 2 +- dill/tests/test_session.py | 79 +++++++++++++++++++++++++++++--------- 2 files changed, 61 insertions(+), 20 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 430dbdd4..9efdb624 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -445,7 +445,7 @@ def save_numpy_array(pickler, obj): self.memo.popitem() # LIFO order is guaranteed since 3.7 # Handle modules specially. if self._session and obj is self._main: - if not _is_builtin_module(self._main): + if self._main is _main_module or not _is_imported_module(self._main): raise # Save an empty dict as state to distinguish this from modules saved with dump(). self.save_reduce(_import_module, (obj.__name__,), obj=obj, state={}) diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index a77ee3c7..f65cfe2b 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -196,12 +196,12 @@ def test_session_other(): assert module.selfref is module def test_runtime_module(): - modname = '__runtime__' - runtime = ModuleType(modname) - runtime.x = 42 + modname = 'runtime' + runtime_mod = ModuleType(modname) + runtime_mod.x = 42 - mod, _ = dill.session._stash_modules(runtime) - if mod is not runtime: + mod, _ = dill.session._stash_modules(runtime_mod) + if mod is not runtime_mod: print("There are objects to save by referenece that shouldn't be:", mod.__dill_imported, mod.__dill_imported_as, mod.__dill_imported_top_level, file=sys.stderr) @@ -210,23 +210,23 @@ def test_runtime_module(): # without imported objects in the namespace. It's a contrived example because # even dill can't be in it. This should work after fixing #462. 
session_buffer = BytesIO() - dill.dump_module(session_buffer, module=runtime, refimported=True) + dill.dump_module(session_buffer, module=runtime_mod, refimported=True) session_dump = session_buffer.getvalue() # Pass a new runtime created module with the same name. - runtime = ModuleType(modname) # empty - return_val = dill.load_module(BytesIO(session_dump), module=runtime) + runtime_mod = ModuleType(modname) # empty + return_val = dill.load_module(BytesIO(session_dump), module=runtime_mod) assert return_val is None - assert runtime.__name__ == modname - assert runtime.x == 42 - assert runtime not in sys.modules.values() + assert runtime_mod.__name__ == modname + assert runtime_mod.x == 42 + assert runtime_mod not in sys.modules.values() # Pass nothing as main. load_module() must create it. session_buffer.seek(0) - runtime = dill.load_module(BytesIO(session_dump)) - assert runtime.__name__ == modname - assert runtime.x == 42 - assert runtime not in sys.modules.values() + runtime_mod = dill.load_module(BytesIO(session_dump)) + assert runtime_mod.__name__ == modname + assert runtime_mod.x == 42 + assert runtime_mod not in sys.modules.values() def test_lookup_module(): assert not dill._dill._is_builtin_module(local_mod) and local_mod.__package__ == '' @@ -235,7 +235,7 @@ def lookup(mod, name, obj, lookup_by_name=True): from dill._dill import _lookup_module, _module_map return _lookup_module(_module_map(mod), name, obj, lookup_by_name) - name = '__test_obj' + name = '__unpickleable' obj = object() setattr(dill, name, obj) assert lookup(dill, name, obj) == (None, None, None) @@ -299,7 +299,7 @@ def test_refonfail_unpickleable(): from dill._dill import _global_string refonfail_default = dill.session.settings['refonfail'] dill.session.settings['refonfail'] = True - name = '__test_obj' + name = '__unpickleable' obj = memoryview(b'') assert dill._dill._is_builtin_module(builtin_mod) assert not dill._dill._is_builtin_module(local_mod) @@ -329,13 +329,13 @@ def 
dump_with_ref(mod, other_mod): assert _global_string(_local_mod.__name__, name) in dump_with_ref(__main__, _local_mod) assert _global_string('os', name) in dump_with_ref(__main__, os) local_mod = _local_mod - del _local_mod, __main__.__test_obj, local_mod.__test_obj, os.__test_obj + del _local_mod, __main__.__unpickleable, local_mod.__unpickleable, os.__unpickleable # "builtin" or "installed" modules assert _global_string(builtin_mod.__name__, name) in dump_with_ref(builtin_mod, builtin_mod) assert _global_string(builtin_mod.__name__, name) in dump_with_ref(builtin_mod, local_mod) assert _global_string('os', name) in dump_with_ref(builtin_mod, os) - del builtin_mod.__test_obj, local_mod.__test_obj, os.__test_obj + del builtin_mod.__unpickleable, local_mod.__unpickleable, os.__unpickleable dill.session.settings['refonfail'] = refonfail_default @@ -387,6 +387,46 @@ def namespace_matches(keep_history, should_keep_vars): assert namespace_matches(keep_history='both', should_keep_vars={'_i1', '_1'}) assert namespace_matches(keep_history='none', should_keep_vars=set()) +def test_is_pickled_module(): + import tempfile + import warnings + + # Module saved with dump(). + pickle_file = tempfile.NamedTemporaryFile(mode='wb') + dill.dump(os, pickle_file) + pickle_file.flush() + assert not dill.is_pickled_module(pickle_file.name) + assert not dill.is_pickled_module(pickle_file.name, importable=False) + pickle_file.close() + + # Importable module saved with dump_module(). + pickle_file = tempfile.NamedTemporaryFile(mode='wb') + dill.dump_module(pickle_file, local_mod) + pickle_file.flush() + assert dill.is_pickled_module(pickle_file.name) + assert not dill.is_pickled_module(pickle_file.name, importable=False) + pickle_file.close() + + # Module-type object saved with dump_module(). 
+ pickle_file = tempfile.NamedTemporaryFile(mode='wb') + dill.dump_module(pickle_file, ModuleType('runtime')) + pickle_file.flush() + assert not dill.is_pickled_module(pickle_file.name) + assert dill.is_pickled_module(pickle_file.name, importable=False) + pickle_file.close() + + # Importable module saved by reference due to unpickleable object. + pickle_file = tempfile.NamedTemporaryFile(mode='wb') + local_mod.__unpickleable = memoryview(b'') + warnings.filterwarnings('ignore') + dill.dump_module(pickle_file, local_mod) + warnings.resetwarnings() + del local_mod.__unpickleable + pickle_file.flush() + assert dill.is_pickled_module(pickle_file.name) + assert not dill.is_pickled_module(pickle_file.name, importable=False) + pickle_file.close() + if __name__ == '__main__': test_session_main(refimported=False) test_session_main(refimported=True) @@ -397,3 +437,4 @@ def namespace_matches(keep_history, should_keep_vars): test_refonfail_unpickleable() test_load_module_asdict() test_ipython_filter() + test_is_pickled_module() From b05a0e0d00ce7459fb1373e715f23020d17ff15c Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 22 Aug 2022 22:20:38 -0300 Subject: [PATCH 081/109] removed outdated comments --- dill/_dill.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 9efdb624..683ba6d7 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -366,11 +366,10 @@ def save(self, obj, save_persistent_id=True): # object pickled. When 'refonfail' is True, it tries to save the object # by reference if pickling it fails with a common pickling error, as # defined by the constant UNPICKLEABLE_ERRORS. If that also fails, then - # the exception is risen and, if this was called indirectly from another - # Pickler.save() call, the parent objects will try to be saved by - # reference recursively, until it succeeds or the exception propagates - # beyond the topmost save() call. 
The extra 'name' argument is passed - # to StockPickler.save_global(). + # the exception is raised and, if this method was called indirectly from + # another Pickler.save() call, the parent objects will try to be saved + # by reference recursively, until it succeeds or the exception + # propagates beyond the topmost save() call. # numpy hack obj_type = type(obj) @@ -1312,11 +1311,8 @@ def _lookup_module(modmap, name, obj, lookup_by_id=True) -> typing.Tuple[str, st def _global_string(modname, name): return GLOBAL + bytes('%s\n%s\n' % (modname, name), 'UTF-8') -def _save_module_dict(pickler, obj): - # If an object doesn't have a '__name__' attribute, pass the object's name - # in the module's namespace to save(), so that it can be used with - # save_global() to increase the chances of finding the object for saving - # it by reference in the event of a failed serialization. +def _save_module_dict(pickler, main_dict): + """Save a module's dictionary, saving unpickleable variables by referece.""" main = getattr(pickler, '_original_main', pickler._main) modmap = getattr(pickler, '_modmap', None) # cached from _stash_modules() is_builtin = _is_builtin_module(main) @@ -1341,7 +1337,7 @@ def _save_module_dict(pickler, obj): pickler.write(SETITEM) def _repr_dict(obj): - """make a short string representation of a dictionary""" + """Make a short string representation of a dictionary.""" return "<%s object at %#012x>" % (type(obj).__name__, id(obj)) @register(dict) From d1b9156adbe96fa542eb58f1076af80987c3da6f Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 22 Aug 2022 22:23:59 -0300 Subject: [PATCH 082/109] fixup --- dill/_dill.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dill/_dill.py b/dill/_dill.py index 683ba6d7..bdfe9e7f 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -1317,7 +1317,7 @@ def _save_module_dict(pickler, main_dict): modmap = getattr(pickler, '_modmap', None) # cached from _stash_modules() is_builtin = _is_builtin_module(main) 
pickler.write(MARK + DICT) # don't need to memoize - for name, value in obj.items(): + for name, value in main_dict.items(): pickler.save(name) try: pickler.save(value) From 234d96cd9f46df7a28cecc6b06fea73e69644282 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 22 Aug 2022 22:26:26 -0300 Subject: [PATCH 083/109] minor --- dill/session.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dill/session.py b/dill/session.py index 5a4a53ea..2d099142 100644 --- a/dill/session.py +++ b/dill/session.py @@ -16,7 +16,8 @@ Contrary of using :py:func:`dill.dump` and :py:func:`dill.load` to save and load a module object, :py:func:`dill.dump_module` always tries to pickle the module -by value (including built-in modules). Also, options like +by value (including built-in modules). Modules saved with :py:func:`dill.dump` +can't be loaded with :py:func:`load_module`. Also, options like ``dill.settings['byref']`` and ``dill.settings['recurse']`` don't affect its behavior. From 982339625792b03b96978b0d4d994a3125b7e795 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 22 Aug 2022 23:00:10 -0300 Subject: [PATCH 084/109] minor --- dill/_dill.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dill/_dill.py b/dill/_dill.py index bdfe9e7f..fa153198 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -2030,7 +2030,7 @@ def save_function(pickler, obj): # If the globals is the __dict__ from the module being saved as a # session, substitute it by the dictionary being actually saved. if _original_main is not None and globs_copy is _original_main.__dict__: - globs_copy = getattr(pickler, '_main', _original_main).__dict__ + globs_copy = pickler._main.__dict__ globs = globs_copy # If the globals is a module __dict__, do not save it in the pickle. 
elif globs_copy is not None and obj.__module__ is not None and \ From 2f189f51a63a7d129f4c33a6180923b6b2faf04c Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 23 Aug 2022 09:15:04 -0300 Subject: [PATCH 085/109] docs: hide __weakref__ --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index e732a27a..6557ce48 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -75,6 +75,7 @@ 'exclude-members': ( #NOTE: this is a single string concatenation '__dict__,' # implementation detail (may be verbose) '__slots__,' # implementation detail + '__weakref__,' # implementation detail '__module__,' # implementation detail '__annotations__,' # redundant with signature documentation '__dataclass_fields__,' # dataclass automatic attribute, redundant From 23e9593cb0e70a281072cc665036d28a47d63b33 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 23 Aug 2022 18:23:10 -0300 Subject: [PATCH 086/109] remove duplicate __init__ documentation --- docs/source/conf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 6557ce48..ebb91f57 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -77,13 +77,14 @@ '__slots__,' # implementation detail '__weakref__,' # implementation detail '__module__,' # implementation detail + '_abc_impl,' # implementation detail of abstract classes + '__init__,' # repeated in class docstring by "autoclass_content=both" '__annotations__,' # redundant with signature documentation '__dataclass_fields__,' # dataclass automatic attribute, redundant - '_abc_impl,' # implementation detail ) } autodoc_typehints = 'description' -napoleon_include_init_with_doc = True +autodoc_typehints_format = 'short' napoleon_include_private_with_doc = False napoleon_include_special_with_doc = True napoleon_use_ivar = True From fb247d86f594de7cc3d535d7cee436843ac5d4ce Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Wed, 24 Aug 
2022 19:43:07 -0300 Subject: [PATCH 087/109] remove added variables after _stash_modules() and _filter_vars() --- dill/_utils.py | 4 ++++ dill/session.py | 22 ++++++++++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/dill/_utils.py b/dill/_utils.py index 4aed5c5e..29f267fb 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -136,6 +136,8 @@ def _open(file, mode, *, peekable=False, seekable=False): class NamedObject: """Simple container for a variable's name and value used by filter functions.""" __slots__ = 'name', 'value' + name: str + value: Any def __init__(self, name_value: Tuple[str, Any]): self.name, self.value = name_value def __eq__(self, other: Any) -> bool: @@ -147,6 +149,8 @@ def __eq__(self, other: Any) -> bool: raise TypeError("'==' not supported between instances of 'NamedObject' and %r" % type(other).__name__) return self.value is other.value and self.name == other.name + def __repr__(self): + return "NamedObject(%r, %r)" % (self.name, self.value) FilterFunction = Callable[[NamedObject], bool] Filter = Union[str, Pattern[str], int, type, FilterFunction] diff --git a/dill/session.py b/dill/session.py index 2d099142..c21b6ff2 100644 --- a/dill/session.py +++ b/dill/session.py @@ -36,7 +36,8 @@ __all__ = [ 'dump_module', 'load_module', 'load_module_asdict', 'is_pickled_module', - 'ModuleFilters', 'FilterRules', 'FilterSet', 'size_filter', 'ipython_filter', + 'ModuleFilters', 'NamedObject', 'FilterRules', 'FilterSet', 'size_filter', + 'ipython_filter', 'dump_session', 'load_session' # backward compatibility ] @@ -116,6 +117,7 @@ def _stash_modules(main_module): newmod.__dill_imported = imported newmod.__dill_imported_as = imported_as newmod.__dill_imported_top_level = imported_top_level + _discard_added_variables(newmod, main_module.__dict__) return newmod, modmap else: return main_module, modmap @@ -151,8 +153,18 @@ def _filter_vars(main_module, exclude, include, base_rules): newmod = ModuleType(main_module.__name__) 
newmod.__dict__.update(namespace) + _discard_added_variables(newmod, namespace) return newmod +def _discard_added_variables(main, original_namespace): + # Some empty attributes like __doc__ may have been added by ModuleType(). + added_names = set(main.__dict__) + added_names.discard('__name__') # required + added_names.difference_update(original_namespace) + added_names.difference_update('__dill_imported%s' % s for s in ('', '_as', '_top_level')) + for name in added_names: + delattr(main, name) + def _fix_module_namespace(main, original_main): # Self-references. for name, obj in main.__dict__.items(): @@ -160,12 +172,6 @@ def _fix_module_namespace(main, original_main): setattr(main, name, main) elif obj is original_main.__dict__: setattr(main, name, main.__dict__) - # Some empty attributes like __doc__ may have been added by ModuleType(). - added_names = set(main.__dict__) - added_names.difference_update(original_main.__dict__) - added_names.difference_update('__dill_imported%s' % s for s in ('', '_as', '_top_level')) - for name in added_names: - delattr(main, name) # Trick _is_imported_module(), forcing main to be saved as an imported module. 
if getattr(main, '__loader__', None) is None and _is_imported_module(original_main): main.__loader__ = True # will be discarded by _dill.save_module() @@ -710,7 +716,7 @@ def load_module_asdict( main_name = _identify_module(file) original_main = sys.modules.get(main_name) main = ModuleType(main_name) - del main.__doc__, main.__package__, main.__spec__ + del main.__doc__, main.__loader__, main.__package__, main.__spec__ if update: if original_main is None: original_main = _import_module(main_name) From f5804f2edfb51d81a20ea08781e7869926f211db Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 25 Aug 2022 16:40:24 -0300 Subject: [PATCH 088/109] fix load_module_asdict() and _filter_var() corner cases --- dill/session.py | 73 +++++++++++++++++++------------------- dill/tests/test_session.py | 2 +- 2 files changed, 37 insertions(+), 38 deletions(-) diff --git a/dill/session.py b/dill/session.py index c21b6ff2..1361b85c 100644 --- a/dill/session.py +++ b/dill/session.py @@ -133,9 +133,9 @@ def _restore_modules(unpickler, main_module): def _filter_vars(main_module, exclude, include, base_rules): """apply exclude/include filters from arguments *and* settings""" rules = FilterRules() - mod_filters = base_rules.get(main_module.__name__, base_rules) - rules.exclude |= mod_filters.get_filters(EXCLUDE) - rules.include |= mod_filters.get_filters(INCLUDE) + mod_rules = base_rules.get_rules(main_module.__name__) + rules.exclude |= mod_rules.get_filters(EXCLUDE) + rules.include |= mod_rules.get_filters(INCLUDE) if exclude is not None: rules.update([(EXCLUDE, exclude)]) if include is not None: @@ -656,8 +656,6 @@ def load_session(filename=str(TEMPDIR/'session.pkl'), main=None, **kwds): def load_module_asdict( filename = str(TEMPDIR/'session.pkl'), - *, - update: bool = False, **kwds ) -> dict: """ @@ -667,13 +665,11 @@ def load_module_asdict( lambda filename: vars(dill.load_module(filename)).copy() - however, does not alter the original module. 
Also, the path of - the loaded module is stored in the ``__session__`` attribute. + however, it does not alter the original module. Also, the path of + the loaded file is stored with the key ``'__session__'``. Parameters: filename: a path-like object or a readable stream - update: if `True`, initialize the dictionary with the current state - of the module prior to loading the state stored at filename. **kwds: extra keyword arguments passed to :py:class:`Unpickler()` Raises: @@ -683,11 +679,8 @@ def load_module_asdict( A copy of the restored module's dictionary. Note: - If ``update`` is True, the corresponding module may first be imported - into the current namespace before the saved state is loaded from - filename to the dictionary. Note that any module that is imported into - the current namespace as a side-effect of using ``update`` will not be - modified by loading the saved module in filename to a dictionary. + Even if not changed, the module refered in the file is always loaded + before its saved state is restored from `filename` to the dictionary. 
Example: >>> import dill @@ -707,33 +700,37 @@ def load_module_asdict( False >>> main['anum'] == anum # changed after the session was saved False - >>> new_var in main # would be True if the option 'update' was set - False + >>> new_var in main # it was initialized with the current state of __main__ + True """ if 'module' in kwds: raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") + with _open(filename, 'rb', peekable=True) as file: main_name = _identify_module(file) - original_main = sys.modules.get(main_name) - main = ModuleType(main_name) - del main.__doc__, main.__loader__, main.__package__, main.__spec__ - if update: - if original_main is None: - original_main = _import_module(main_name) - main.__dict__.update(original_main.__dict__) - else: - main.__builtins__ = __builtin__ + main = _import_module(main_name) + main_copy = ModuleType(main_name) + main_copy.__dict__.clear() + main_copy.__dict__.update(main.__dict__) + + parent_name = main_name.rpartition('.')[0] + if parent_name: + parent = sys.modules[parent_name] try: - sys.modules[main_name] = main + sys.modules[main_name] = main_copy + if parent_name and getattr(parent, main_name, None) is main: + setattr(parent, main_name, main_copy) load_module(file, **kwds) finally: - if original_main is None: - del sys.modules[main_name] - else: - sys.modules[main_name] = original_main - main.__session__ = str(filename) - return main.__dict__ + sys.modules[main_name] = main + if parent_name and getattr(parent, main_name, None) is main_copy: + setattr(parent, main_name, main) + if isinstance(getattr(filename, 'name', None), str): + main_copy.__session__ = filename.name + else: + main_copy.__session__ = str(filename) + return main_copy.__dict__ class ModuleFilters(FilterRules): """Stores default filtering rules for modules. 
@@ -871,11 +868,13 @@ def keys(self) -> List[str]: keys += mod_filters.keys() keys.sort() return keys - def get(self, name: str, default: Optional[ModuleFilters] = None) -> ModuleFilters: - try: - return self[name] - except AttributeError: - return default + def get_rules(self, name: str) -> ModuleFilters: + while name: + try: + return self[name] + except AttributeError: + name = name.rpartition('.')[0] + return self def get_filters(self, rule_type: RuleType) -> FilterSet: """Get exclude/include filters. If not set, fall back to parent module's or default filters.""" if not isinstance(rule_type, RuleType): diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index f65cfe2b..00634e81 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -359,7 +359,7 @@ def test_load_module_asdict(): assert main_vars['names'] == names assert main_vars['names'] is not names assert main_vars['x'] != x - assert 'y' not in main_vars + assert 'y' in main_vars assert 'empty' in main_vars def test_ipython_filter(): From 7c87a6737ea3bd535d1ef46fe8a86e3663483289 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 25 Aug 2022 17:53:57 -0300 Subject: [PATCH 089/109] add parameter 'identify' to is_pickled_module() --- dill/session.py | 25 +++++++++++++++++-------- dill/tests/test_session.py | 2 ++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/dill/session.py b/dill/session.py index 1361b85c..2ff4fed2 100644 --- a/dill/session.py +++ b/dill/session.py @@ -378,21 +378,25 @@ def _identify_module(file, main=None): return None raise UnpicklingError("unable to identify module") from error -def is_pickled_module(filename, importable: bool = True) -> bool: - """Check if a file can be loaded with :py:func:`load_module`. +def is_pickled_module( + filename, importable: bool = True, identify: bool = False +) -> Union[bool, str]: + """Check if a file can be loaded with :func:`load_module`. 
- Check if the file is a pickle file generated with :py:func:`dump_module`, - and thus can be loaded with :py:func:`load_module`. + Check if the file is a pickle file generated with :func:`dump_module`, + and thus can be loaded with :func:`load_module`. Parameters: filename: a path-like object or a readable stream. importable: expected kind of the file's saved module. Use `True` for importable modules (the default) or `False` for module-type objects. + identify: if `True`, return the module name if the test succeeds. Returns: `True` if the pickle file at ``filename`` was generated with - :py:func:`dump_module` **AND** the module whose state is saved in it is + :func:`dump_module` **AND** the module whose state is saved in it is of the kind specified by the ``importable`` argument. `False` otherwise. + If `identify` is set, return the name of the module instead of `True`. Examples: Create three types of pickle files: @@ -414,6 +418,8 @@ def is_pickled_module(filename, importable: bool = True) -> bool: False >>> dill.is_pickled_module('module_object.pkl', importable=False) True + >>> dill.is_pickled_module('module_object.pkl', importable=False, identify=True) + 'example' >>> dill.is_pickled_module('common_object.pkl') # always return False False >>> dill.is_pickled_module('common_object.pkl', importable=False) @@ -424,9 +430,12 @@ def is_pickled_module(filename, importable: bool = True) -> bool: pickle_main = _identify_module(file) except UnpicklingError: return False - else: - is_runtime_mod = pickle_main.startswith('__runtime__.') - return importable ^ is_runtime_mod + is_runtime_mod = pickle_main.startswith('__runtime__.') + res = importable ^ is_runtime_mod + if res and identify: + return pickle_main.partition('.')[-1] if is_runtime_mod else pickle_main + else: + return res def load_module( filename = str(TEMPDIR/'session.pkl'), diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index 00634e81..5021da63 100644 --- a/dill/tests/test_session.py 
+++ b/dill/tests/test_session.py @@ -405,6 +405,7 @@ def test_is_pickled_module(): pickle_file.flush() assert dill.is_pickled_module(pickle_file.name) assert not dill.is_pickled_module(pickle_file.name, importable=False) + assert dill.is_pickled_module(pickle_file.name, identify=True) == local_mod.__name__ pickle_file.close() # Module-type object saved with dump_module(). @@ -413,6 +414,7 @@ def test_is_pickled_module(): pickle_file.flush() assert not dill.is_pickled_module(pickle_file.name) assert dill.is_pickled_module(pickle_file.name, importable=False) + assert dill.is_pickled_module(pickle_file.name, importable=False, identify=True) == 'runtime' pickle_file.close() # Importable module saved by reference due to unpickleable object. From 6456c035c1795a084e279be23f601ad8072e3364 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 25 Aug 2022 18:41:46 -0300 Subject: [PATCH 090/109] improve documentation and code annotation --- dill/_dill.py | 23 +++++- dill/_utils.py | 45 ++++++----- dill/logger.py | 57 ++++++++------ dill/session.py | 205 +++++++++++++++++++++++++++++------------------- 4 files changed, 203 insertions(+), 127 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index fa153198..a7b77ede 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -8,6 +8,13 @@ """ dill: a utility for serialization of python objects +The main API of the package are the functions :func:`dump` and +:func:`dumps` for serialization ("pickling"), and :func:`load` +and :func:`loads` for deserialization ("unpickling"). The +functions :func:`~dill.session.dump_module` and +:func:`~dill.session.load_module` can be used to save and restore +the intepreter session. + Based on code written by Oren Tirosh and Armin Ronacher. Extended to a (near) full set of the builtin types (in types module), and coded to the pickle interface, by . @@ -15,6 +22,9 @@ Test against "all" python types (Std. Lib. CH 1-15 @ 2.7) by mmckerns. Test against CH16+ Std. Lib. ... TBD. 
""" + +from __future__ import annotations + __all__ = [ 'dump','dumps','load','loads','copy', 'Pickler','Unpickler','register','pickle','pickles','check', @@ -342,7 +352,14 @@ def _getopt(settings, key, arg=None, *, kwds=None): ### Extend the Picklers class Pickler(StockPickler): """python's Pickler extended to interpreter sessions""" - dispatch = MetaCatchingDict(StockPickler.dispatch.copy()) + dispatch: typing.Dict[type, typing.Callable[[Pickler, typing.Any], None]] \ + = MetaCatchingDict(StockPickler.dispatch.copy()) + """The dispatch table, a dictionary of serializing functions used + by Pickler to save objects of specific types. Use :func:`pickle` + or :func:`register` to associate types to custom functions. + + :meta hide-value: + """ _refimported = False _refonfail = False # True in session.settings _session = False @@ -520,12 +537,12 @@ def dispatch_table(): pickle_dispatch_copy = StockPickler.dispatch.copy() def pickle(t, func): - """expose dispatch table for user-created extensions""" + """expose :attr:`~Pickler.dispatch` table for user-created extensions""" Pickler.dispatch[t] = func return def register(t): - """register type to Pickler's dispatch table """ + """decorator to register types to Pickler's :attr:`~Pickler.dispatch` table""" def proxy(func): Pickler.dispatch[t] = func return func diff --git a/dill/_utils.py b/dill/_utils.py index 29f267fb..6f451e79 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -166,7 +166,7 @@ def _iter(filters): @dataclass class FilterSet(abc.MutableSet): - """A superset of exclude/include filter sets.""" + """A superset of exclusion/inclusion filter sets.""" names: Set[str] = field(default_factory=set) regexes: Set[Pattern[str]] = field(default_factory=set) ids: Set[int] = field(default_factory=set) @@ -289,10 +289,10 @@ def __delete__(self, obj): raise AttributeError(self.name) from None class FilterRules: - """Exclude and include rules for filtering a namespace. 
+ """Exclusion and inclusion rules for filtering a namespace. - Namespace filtering rules can be of two types, ``EXCLUDE`` and ``INCLUDE`` - rules, and of five "flavors": + Namespace filtering rules can be of two types, :const:`EXCLUDE` and + :const:`INCLUDE` rules, and of five "flavors": - `name`: match a variable name exactly; - `regex`: match a variable name by regular expression; @@ -301,7 +301,7 @@ class FilterRules: - `func`: callable filter, match a variable name and/or value by an arbitrary logic. - A `name` filter is specified by a simple string, e.g. 'some_var'. If its + A `name` filter is specified by a simple string, e.g. ``'some_var'``. If its value is not a valid Python identifier, except for the special `type` case below, it is treated as a regular expression instead. @@ -309,7 +309,7 @@ class FilterRules: expression, e.g. ``r'\w+_\d+'``, or by a :py:class:`re.Pattern` object. An `id` filter is specified by an ``int`` that corresponds to the id of an - object. For example, to exclude a specific object ``obj`` that may me + object. For example, to exclude a specific object ``obj`` that may be assigned to multiple variables, just use ``id(obj)`` as an `id` filter. A `type` filter is specified by a type-object, e.g. ``list`` or @@ -318,20 +318,23 @@ class FilterRules: e.g. ``"type:function"`` or ``"type: FunctionType"``. These include all the types defined in the module :py:mod:`types` and many more. - A `func` filter can be any callable that accepts a single argument and - returns a boolean value, being it ``True`` if the object should be excluded - (or included) or ``False`` if it should *not* be excluded (or included). 
- The single argument is an object with two attributes: ``name`` is the + Finally, a `func` filter can be any callable that accepts a single argument and + returns a boolean value, being it `True` if the object should be excluded + (or included, depending on how the filter is used) or `False` if it should + *not* be excluded (or included). + + The single argument passed to `func` filters is an instance of + :py:class:`NamedObject`, an object with two attributes: ``name`` is the variable's name in the namespace and ``value`` is the object that it refers to. Below are some examples of `func` filters. - Exclude objects that were renamed after definition: + A strict type filter, exclude ``int`` but not ``bool`` (an ``int`` subclass): - >>> renamed_filter = lambda obj: obj.name != getattr(obj.value, '__name__', obj.name) + >>> int_filter = lambda obj: type(obj) == int - Strict type filter, exclude ``int`` but not ``bool`` (an ``int`` subclass): + Exclude objects that were renamed after definition: - >>> int_filter = lambda obj: type(obj) == int + >>> renamed_filter = lambda obj: obj.name != getattr(obj.value, '__name__', obj.name) Filters may be added interactively after creating an empty ``FilterRules`` object: @@ -350,12 +353,12 @@ class FilterRules: ... (INCLUDE, r'__\w+__'), ... ]) - The order that the exclude and include filters are added is irrelevant - because **exclude filters are always applied first**. Therefore, generally - the rules work as a blocklist, with include filters acting as exceptions to - the exclusion rules. However, **if there are only include filters, the - rules work as an allowlist** instead, and only the variables matched by the - include filters are kept. + The order that the exclusion and inclusion filters are added is irrelevant + because **exclusion filters are always applied first**. Therefore, + generally the rules work as a blocklist, with inclusion rules acting as + exceptions to the exclusion rules. 
However, **if there are only inclusion + filters, the rules work as an allowlist** instead, and only the variables + matched by the inclusion filters are kept. """ __slots__ = '_exclude', '_include', '__weakref__' exclude = _FilterSetDescriptor() @@ -478,7 +481,7 @@ def apply_filters(self, namespace: Dict[str, Any]) -> Dict[str, Any]: return namespace if len(exclude_objs) == len(namespace): warnings.warn( - "the exclude/include rules applied have excluded all %d items" % len(all_objs), + "the exclusion/inclusion rules applied have excluded all %d items" % len(all_objs), _dill.PicklingWarning, stacklevel=2 ) diff --git a/dill/logger.py b/dill/logger.py index dab0ae34..3f7a13ff 100644 --- a/dill/logger.py +++ b/dill/logger.py @@ -18,29 +18,32 @@ The trace shows a tree structure depicting the depth of each object serialized *with dill save functions*, but not the ones that use save functions from -'pickle._Pickler.dispatch'. If the information is available, it also displays +``pickle._Pickler.dispatch``. If the information is available, it also displays the size in bytes that the object contributed to the pickle stream (including its child objects). Sample trace output: - >>> import dill, dill.tests - >>> dill.detect.trace(True) - >>> dill.dump_session(main=dill.tests) - ┬ M1: - ├┬ F2: + >>> import dill + >>> import keyword + >>> with dill.detect.trace(): + ... 
dill.dump_module(module=keyword) + ┬ M1: + ├┬ F2: │└ # F2 [32 B] - ├┬ D2: + ├┬ D5: │├┬ T4: ││└ # T4 [35 B] - │├┬ D2: + │├┬ D2: ││├┬ T4: │││└ # T4 [50 B] - ││├┬ D2: - │││└ # D2 [84 B] - ││└ # D2 [413 B] - │└ # D2 [763 B] - └ # M1 [813 B] + ││├┬ D2: + │││└ # D2 [47 B] + ││└ # D2 [280 B] + │└ # D5 [1 KiB] + └ # M1 [1 KiB] """ +from __future__ import annotations + __all__ = ['adapter', 'logger', 'trace'] import codecs @@ -50,7 +53,7 @@ import math import os from functools import partial -from typing import TextIO, Union +from typing import Optional, TextIO, Union import dill from ._utils import _format_bytes_size @@ -235,7 +238,9 @@ def format(self, record): stderr_handler = logging._StderrHandler() adapter.addHandler(stderr_handler) -def trace(arg: Union[bool, TextIO, str, os.PathLike] = None, *, mode: str = 'a') -> None: +def trace( + arg: Union[bool, TextIO, str, os.PathLike] = None, *, mode: str = 'a' + ) -> Optional[TraceManager]: """print a trace through the stack when pickling; useful for debugging With a single boolean argument, enable or disable the tracing. @@ -248,10 +253,10 @@ def trace(arg: Union[bool, TextIO, str, os.PathLike] = None, *, mode: str = 'a') Alternatively, ``trace()`` can be used as a context manager. With no arguments, it just takes care of restoring the tracing state on exit. - Either a file handle, or a file name and (optionally) a file mode may be - specitfied to redirect the tracing output in the ``with`` block context. A - log function is yielded by the manager so the user can write extra - information to the file. + Either a file handle, or a file name and a file mode (optional) may be + specified to redirect the tracing output in the ``with`` block. A ``log()`` + function is yielded by the manager so the user can write extra information + to the file. 
Example usage: @@ -270,13 +275,17 @@ def trace(arg: Union[bool, TextIO, str, os.PathLike] = None, *, mode: str = 'a') >>> log("> squared = %r", squared) >>> dumps(squared) - Arguments: - arg: a boolean value, or an optional file-like or path-like object for the context manager - mode: mode string for ``open()`` if a file name is passed as the first argument + Parameters: + arg: a boolean value, or an optional file-like or path-like object for + the context manager + mode: mode string for ``open()`` if a file name is passed as the first + argument """ - if not isinstance(arg, bool): + if isinstance(arg, bool): + logger.setLevel(logging.INFO if arg else logging.WARNING) + return + else: return TraceManager(file=arg, mode=mode) - logger.setLevel(logging.INFO if arg else logging.WARNING) class TraceManager(contextlib.AbstractContextManager): """context manager version of trace(); can redirect the trace to a file""" diff --git a/dill/session.py b/dill/session.py index 2ff4fed2..cc6aae59 100644 --- a/dill/session.py +++ b/dill/session.py @@ -9,27 +9,29 @@ """ Pickle and restore the intepreter session or a module's state. -The functions :py:func:`dump_module`, :py:func:`load_module` and -:py:func:`load_module_asdict` are capable of saving and restoring, as long as +The functions :func:`dump_module`, :func:`load_module` and +:func:`load_module_asdict` are capable of saving and restoring, as long as objects are pickleable, the complete state of a module. For imported modules -that are pickled, `dill` assumes that they are importable when unpickling. - -Contrary of using :py:func:`dill.dump` and :py:func:`dill.load` to save and load -a module object, :py:func:`dill.dump_module` always tries to pickle the module -by value (including built-in modules). Modules saved with :py:func:`dill.dump` -can't be loaded with :py:func:`load_module`. Also, options like -``dill.settings['byref']`` and ``dill.settings['recurse']`` don't affect its -behavior. 
- -However, if a module contains references to objects originating from other -modules, that would prevent it from pickling or drastically increase its disk -size, they can be saved by reference instead of by value, using the option -``refimported``. - -With :py:func:`dump_module`, namespace filters may be used to restrict the list -of pickled variables to a subset of those in the module, based on their names or -values. Also, using :py:func:`load_module_asdict` allows one to load the -variables from different saved states of the same module into dictionaries. +that are pickled, `dill` requires them to be importable at unpickling. +
+Options like ``dill.settings['byref']`` and ``dill.settings['recurse']`` don't
+affect the behavior of :func:`dump_module`. However, if a module has variables
+referring to objects from other modules that would prevent it from pickling or
+drastically increase its disk size, using the option ``refimported`` forces them
+to be saved by reference instead of by value.
+
+Also with :func:`dump_module`, namespace filters may be used to restrict the
+list of pickled variables to a subset of those in the module, based on their
+names and values.
+
+In turn, :func:`load_module_asdict` allows one to load the variables from
+different saved states of the same module into dictionaries.
+
+Note:
+    Contrary to using :func:`dill.dump` and :func:`dill.load` to save and load
+    a module object, :func:`dill.dump_module` always tries to pickle the module
+    by value (including built-in modules). Modules saved with :func:`dill.dump`
+    can't be loaded with :func:`dill.load_module`.
 """
 
 from __future__ import annotations
@@ -57,7 +59,7 @@
 from ._utils import FilterRules, FilterSet, _open, size_filter, EXCLUDE, INCLUDE
 
 # Type hints.
-from typing import Iterable, Optional, Union +from typing import Any, Dict, Iterable, Optional, Union from ._utils import Filter, FilterFunction, NamedObject, Rule, RuleType import pathlib @@ -187,42 +189,52 @@ def dump_module( base_rules: Optional[ModuleFilters] = None, **kwds ) -> None: - R"""Pickle the current state of :py:mod:`__main__` or another module to a file. + """Pickle the current state of :mod:`__main__` or another module to a file. - Save the contents of :py:mod:`__main__` (e.g. from an interactive + Save the contents of :mod:`__main__` (e.g. from an interactive interpreter session), an imported module, or a module-type object (e.g. - built with :py:class:`~types.ModuleType`), to a file. The pickled - module can then be restored with the function :py:func:`load_module`. + built with :class:`~types.ModuleType`), to a file. The pickled + module can then be restored with the function :func:`load_module`. Only a subset of the module's variables may be saved if exclusion/inclusion - filters are specified. Filters apply to every variable name or value and - determine if they should be saved or not. They can be set in + filters are specified. Filters are applied to every pair of variable's name + and value to determine if they should be saved or not. They can be set in ``dill.session.settings['filters']`` or passed directly to the ``exclude`` - and ``include`` parameters. See :py:class:`ModuleFilters` for details. + and ``include`` parameters. + + See :class:`FilterRules` and :class:`ModuleFilters` for details. See + also the bundled "filter factories": :class:`size_filter` and + :func:`ipython_filter`. Parameters: filename: a path-like object or a writable stream. module: a module object or the name of an importable module. If `None` - (the default), :py:mod:`__main__` is saved. + (the default), :mod:`__main__` is saved. refimported: if `True`, all objects identified as having been imported into the module's namespace are saved by reference. 
*Note:* this is - similar but independent from ``dill.settings[`byref`]``, as + similar but independent from ``dill.settings['byref']``, as ``refimported`` refers to virtually all imported objects, while ``byref`` only affects select objects. refonfail: if `True` (the default), objects that fail to pickle by value will try to be saved by reference. If this also fails, saving their parent objects by reference will be attempted recursively. In the worst case scenario, the module itself may be saved by reference, - with a warning. Note: this option disables framing for pickle - protocol >= 4. Turning it off may improve unpickling speed, but may - cause a module to fail pickling. - exclude: here be dragons - include: here be dragons - base_rules: here be dragons - **kwds: extra keyword arguments passed to :py:class:`Pickler()`. + with a warning. *Note:* this has the side effect of disabling framing + for pickle protocol ≥ 4. Turning this option off may improve + unpickling speed, but may cause a module to fail pickling. + exclude: one or more variable `exclusion` filters (see + :class:`FilterRules`). + include: one or more variable `inclusion` filters. + base_rules: if passed, overwrites ``settings['filters']``. + **kwds: extra keyword arguments passed to :class:`Pickler()`. Raises: - :py:exc:`PicklingError`: if pickling fails. + :exc:`PicklingError`: if pickling fails. + :exc:`PicklingWarning`: if the module itself ends being saved by + reference due to unpickleable objects in its namespace. + + Default values for keyword-only arguments can be set in + `dill.session.settings`. 
Examples: @@ -247,8 +259,7 @@ def dump_module( >>> foo.values = [1,2,3] >>> import math >>> foo.sin = math.sin - >>> dill.dump_module('foo_session.pkl', module=foo, refimported=True) - FIXME: here be dragons + >>> dill.dump_module('foo_session.pkl', module=foo) - Save the state of a module with unpickleable objects: @@ -273,7 +284,23 @@ def dump_module( [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] >>> os = dill.load_module('os_session.pkl') >>> print(os.altsep.join('path')) - p\a\t\h + p\\a\\t\\h + + - Use `refimported` to save imported objects by reference: + + >>> import dill + >>> from html.entities import html5 + >>> type(html5), len(html5) + (dict, 2231) + >>> import io + >>> buf = io.BytesIO() + >>> dill.dump_module(buf) # saves __main__, with html5 saved by value + >>> len(buf.getvalue()) # pickle size in bytes + 71665 + >>> buf = io.BytesIO() + >>> dill.dump_module(buf, refimported=True) # html5 saved by reference + >>> len(buf.getvalue()) + 438 - Save current session but exclude some variables: @@ -442,39 +469,38 @@ def load_module( module: Optional[Union[ModuleType, str]] = None, **kwds ) -> Optional[ModuleType]: - """Update the selected module (default is :py:mod:`__main__`) with - the state saved at ``filename``. + """Update the selected module with the state saved at ``filename``. - Restore a module to the state saved with :py:func:`dump_module`. The - saved module can be :py:mod:`__main__` (e.g. an interpreter session), + Restore a module to the state saved with :func:`dump_module`. The + saved module can be :mod:`__main__` (e.g. an interpreter session), an imported module, or a module-type object (e.g. created with - :py:class:`~types.ModuleType`). + :class:`~types.ModuleType`). - When restoring the state of a non-importable module-type object, the + When restoring the state of a non-importable, module-type object, the current instance of this module may be passed as the argument ``module``. 
- Otherwise, a new instance is created with :py:class:`~types.ModuleType` + Otherwise, a new instance is created with :class:`~types.ModuleType` and returned. Parameters: filename: a path-like object or a readable stream. module: a module object or the name of an importable module; - the module name and kind (i.e. imported or non-imported) must + the module's name and kind (i.e. imported or non-imported) must match the name and kind of the module stored at ``filename``. - **kwds: extra keyword arguments passed to :py:class:`Unpickler()`. + **kwds: extra keyword arguments passed to :class:`Unpickler()`. Raises: - :py:exc:`UnpicklingError`: if unpickling fails. - :py:exc:`ValueError`: if the argument ``module`` and module saved - at ``filename`` are incompatible. + :exc:`UnpicklingError`: if unpickling fails. + :exc:`ValueError`: if the argument ``module`` and the module + saved at ``filename`` are incompatible. Returns: - A module object, if the saved module is not :py:mod:`__main__` or + A module object, if the saved module is not :mod:`__main__` and a module instance wasn't provided with the argument ``module``. Passing an argument to ``module`` forces `dill` to verify that the module being loaded is compatible with the argument value. Additionally, if the - argument is a module (instead of a module name), it supresses the return - value. Each case and behavior is exemplified below: + argument is a module instance (instead of a module name), it suppresses the + return value. Each case and behavior is exemplified below: 1. `module`: ``None`` --- This call loads a previously saved state of the module ``math`` and returns it (the module object) at the end: @@ -587,10 +613,6 @@ def load_module( *Changed in version 0.3.6:* Function ``load_session()`` was renamed to ``load_module()``. Parameter ``main`` was renamed to ``module``. - - See also: - :py:func:`load_module_asdict` to load the contents of module saved - with :py:func:`dump_module` into a dictionary. 
""" if 'main' in kwds: warnings.warn( @@ -666,7 +688,7 @@ def load_session(filename=str(TEMPDIR/'session.pkl'), main=None, **kwds): def load_module_asdict( filename = str(TEMPDIR/'session.pkl'), **kwds -) -> dict: +) -> Dict[str, Any]: """ Load the contents of a saved module into a dictionary. @@ -679,10 +701,10 @@ def load_module_asdict( Parameters: filename: a path-like object or a readable stream - **kwds: extra keyword arguments passed to :py:class:`Unpickler()` + **kwds: extra keyword arguments passed to :class:`Unpickler()` Raises: - :py:exc:`UnpicklingError`: if unpickling fails + :exc:`UnpicklingError`: if unpickling fails Returns: A copy of the restored module's dictionary. @@ -744,7 +766,7 @@ def load_module_asdict( class ModuleFilters(FilterRules): """Stores default filtering rules for modules. - :py:class:`FilterRules` subclass with a tree-like structure that may hold + :class:`FilterRules` subclass with a tree-like structure that may hold exclusion/inclusion filters for specific modules and submodules. See the base class documentation to learn more about how to create and use filters. @@ -758,11 +780,12 @@ class ModuleFilters(FilterRules): Exclusion and inclusion filters for global variables can be added using the ``add()`` methods of the ``exclude`` and ``include`` attributes, or of the ``ModuleFilters`` object itself. 
In the latter case, the filter is added to - its ``exclude`` :py:class:`FilterSet` by default: + its ``exclude`` :class:`FilterSet` by default: >>> filters.add('some_var') # exclude a variable named 'some_var' >>> filters.exclude.add('_.*') # exclude any variable with a name prefixed by '_' >>> filters.include.add('_keep_this') # an exception to the rule above + >>> filters @@ -784,21 +807,40 @@ class ModuleFilters(FilterRules): To create filters specific for a module and its submodules, use the following syntax to add a child node to the default ``ModuleFilters``: + >>> import dill >>> from dill.session import EXCLUDE, INCLUDE + >>> filters = dill.session.settings['filters'] + >>> # set empty rules for module 'foo': + >>> # (these will override any existing default rules) >>> filters['foo'] = [] - >>> filters['foo'] # override default with empty rules for module 'foo' + >>> filters['foo'] - >>> filters['bar.baz'] = [(EXCLUDE, r'\w+\d+'), (INCLUDE, 'ERROR404')] - >>> filters['bar.baz'] # specific rules for the submodule 'bar.baz' + >>> # add a name (exclusion) filter: + >>> # (this filter will also apply to any submodule of 'foo') + >>> filters['foo'].add('ignore_this') + >>> filters['foo'] + + + Create a filter for a submodule: + + >>> filters['bar.baz'] = [ + ... (EXCLUDE, r'\w+\d+'), + ... (INCLUDE, ['ERROR403', 'ERROR404']) + ... 
] + >>> # set specific rules for the submodule 'bar.baz': + >>> filters['bar.baz'] - >>> filters['bar'] # but the default rules would apply for the module 'bar' + include=FilterSet(names={'ERROR403', 'ERROR404'})> + >>> # note that the default rules still apply to the module 'bar' + >>> filters['bar'] Module-specific filter rules may be accessed using different syntaxes: - >>> filters['bar.baz'] is filters['bar']['baz'] is filters.bar.baz + >>> filters['bar.baz'] is filters['bar']['baz'] + True + >>> filters.bar.baz is filters['bar']['baz'] True Note, however, that using the attribute syntax to directly set rules for @@ -910,9 +952,9 @@ def get_filters(self, rule_type: RuleType) -> FilterSet: def ipython_filter(*, keep_history: str = 'input') -> FilterFunction: """Filter factory to exclude IPython hidden variables. - When saving the session with :py:func:`dump_module` in an IPython - interpreter, hidden variables, i.e. variables listed by ``dir()`` but - not listed by the ``%who`` magic command, are saved unless they are excluded + When saving the session with :func:`dump_module` from an IPython + interpreter, hidden variables (i.e. variables listed by ``dir()`` but + not listed by the ``%who`` magic command) are saved unless they are excluded by filters. This function generates a filter that will exclude these hidden variables from the list of saved variables, with the optional exception of command history variables. @@ -921,15 +963,20 @@ def ipython_filter(*, keep_history: str = 'input') -> FilterFunction: keep_history: whether to keep (i.e. not exclude) the input and output history of the IPython interactive session. Accepted values: - - `'input'`: the input history contained in the hidden variables + - `"input"`: the input history contained in the hidden variables ``In``, ``_ih``, ``_i``, ``_i1``, ``_i2``, etc. will be saved. 
- - `'output'`, the output history contained in the hidden variables + - `"output"`, the output history contained in the hidden variables ``Out``, ``_oh``, ``_``, ``_1``, ``_2``, etc. will be saved. - - `'both'`: both the input and output history will be saved. - - `'none'`: all the hidden history variables will be excluded. + - `"both"`: both the input and output history will be saved. + - `"none"`: all the hidden history variables will be excluded. Returns: - An exclude filter function to be used with :py:func:`dump_module`. + A variable exclusion filter function to be used with :func:`dump_module`. + + Important: + A filter of this kind should be created just before the call to + :func:`dump_module` where it's used, as it doesn't update the list of + hidden variables after its creation for performance reasons. Example: From 6de1c516bcbc3362b2ce496a1f96ed2c9b6f674f Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 25 Aug 2022 23:00:14 -0300 Subject: [PATCH 091/109] show variable name in trace; fix IPYTHON_SINGLETONS bug --- dill/_dill.py | 4 ++-- dill/logger.py | 12 ++++++++---- dill/session.py | 2 ++ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index a7b77ede..70cad702 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -1888,8 +1888,8 @@ def save_module(pickler, obj): for item in ('__builtins__', '__loader__'): main_dict.pop(item, None) for item in IPYTHON_SINGLETONS: - if getattr(item, '__module__', '').startswith('IPython'): - main_dict.pop(item, None) + if getattr(main_dict.get(item), '__module__', '').startswith('IPython'): + del main_dict[item] if is_session_main: pickler._main_dict_copy = main_dict pickler.save_reduce(_import_module, (mod_name,), obj=obj, state=main_dict) diff --git a/dill/logger.py b/dill/logger.py index 3f7a13ff..b2c9b6e1 100644 --- a/dill/logger.py +++ b/dill/logger.py @@ -52,6 +52,7 @@ import logging import math import os +from contextlib import suppress from functools import partial 
from typing import Optional, TextIO, Union @@ -167,7 +168,7 @@ def trace(self, pickler, msg, *args, obj=None, **kwargs): obj = args[-1] pickler._trace_stack.append(id(obj)) size = None - try: + with suppress(AttributeError, TypeError): # Streams are not required to be tellable. size = pickler._file_tell() frame = pickler.framer.current_frame @@ -176,11 +177,12 @@ def trace(self, pickler, msg, *args, obj=None, **kwargs): except AttributeError: # PyPy may use a BytesBuilder as frame size += len(frame) - except (AttributeError, TypeError): - pass if size is not None: if not pushed_obj: pickler._size_stack.append(size) + if len(pickler._size_stack) == 3: # module > dict > variable + with suppress(AttributeError, KeyError): + extra['varname'] = pickler._id_to_name.pop(id(obj)) else: size -= pickler._size_stack.pop() extra['size'] = size @@ -227,7 +229,9 @@ def format(self, record): if not self.is_utf8: prefix = prefix.translate(ASCII_MAP) + "-" fields['prefix'] = prefix + " " - if hasattr(record, 'size'): + if hasattr(record, 'varname'): + fields['suffix'] = " as %r" % record.varname + elif hasattr(record, 'size'): fields['suffix'] = " [%d %s]" % _format_bytes_size(record.size) vars(record).update(fields) return super().format(record) diff --git a/dill/session.py b/dill/session.py index cc6aae59..39518d31 100644 --- a/dill/session.py +++ b/dill/session.py @@ -363,6 +363,8 @@ def dump_module( if refimported: # Cache modmap for refonfail. 
pickler._modmap = modmap + if logger.isEnabledFor(logging.INFO): + pickler._id_to_name = {id(v): k for k, v in main.__dict__.items()} pickler.dump(main) return From f9ba8d4ea0795195409fbd078e54bce729314411 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 25 Aug 2022 23:03:32 -0300 Subject: [PATCH 092/109] help me Travis --- dill/tests/test_session.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index 5021da63..952d8486 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -171,7 +171,8 @@ def test_session_main(refimported): # Test session loading in the same session. session_buffer = BytesIO() - dill.dump_module(session_buffer, refimported=refimported) + with dill.detect.trace(): + dill.dump_module(session_buffer, refimported=refimported) session_buffer.seek(0) dill.load_module(session_buffer, module='__main__') ns.backup['_test_objects'](__main__, ns.backup, refimported) From c7176cd2a3f5aced9c441ada0727553925c46588 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 25 Aug 2022 23:39:38 -0300 Subject: [PATCH 093/109] don't execute __main__ dump/load test with COVERAGE --- dill/tests/test_session.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index 952d8486..50a21d62 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -171,8 +171,7 @@ def test_session_main(refimported): # Test session loading in the same session. 
session_buffer = BytesIO() - with dill.detect.trace(): - dill.dump_module(session_buffer, refimported=refimported) + dill.dump_module(session_buffer, refimported=refimported) session_buffer.seek(0) dill.load_module(session_buffer, module='__main__') ns.backup['_test_objects'](__main__, ns.backup, refimported) @@ -431,8 +430,9 @@ def test_is_pickled_module(): pickle_file.close() if __name__ == '__main__': - test_session_main(refimported=False) - test_session_main(refimported=True) + if os.getenv('COVERAGE') != 'true': + test_session_main(refimported=False) + test_session_main(refimported=True) test_session_other() test_runtime_module() test_lookup_module() From e26647b5a825e9d359d83182c20af9b20cec7586 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Fri, 26 Aug 2022 13:09:06 -0300 Subject: [PATCH 094/109] coverage, some tests, remove unreachable branch, "fix last fix" --- dill/_dill.py | 14 +++++--------- dill/session.py | 18 +++++++++--------- dill/tests/test_session.py | 34 +++++++++++++++++++++++++++++++--- 3 files changed, 45 insertions(+), 21 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 70cad702..efb68738 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -172,8 +172,8 @@ def get_file_type(*args, **kwargs): from multiprocessing.reduction import _reduce_socket as reduce_socket try: IS_IPYTHON = __IPYTHON__ # is True - ExitType = None # IPython.core.autocall.ExitAutocall - IPYTHON_SINGLETONS = ('exit', 'quit', 'get_ipython') + ExitType = None # IPython.core.autocall.ExitAutocall #pragma: no cover + IPYTHON_SINGLETONS = ('exit', 'quit', 'get_ipython') #pragma: no cover except NameError: IS_IPYTHON = False try: ExitType = type(exit) # apparently 'exit' can be removed @@ -459,7 +459,7 @@ def save_numpy_array(pickler, obj): # Roll back memo. for _ in range(len(self.memo) - memo_size): self.memo.popitem() # LIFO order is guaranteed since 3.7 - # Handle modules specially. + # Handle session main. 
if self._session and obj is self._main: if self._main is _main_module or not _is_imported_module(self._main): raise @@ -472,10 +472,6 @@ def save_numpy_array(pickler, obj): PicklingWarning, stacklevel=5, ) - elif (isinstance(obj, ModuleType) - and (_is_builtin_module(obj) or obj is sys.modules['dill'])): - self.save_reduce(_import_module, (obj.__name__,), obj=obj) - logger.trace(self, message, obj=obj) # Try to save object by reference. elif hasattr(obj, '__name__') or hasattr(obj, '__qualname__'): try: @@ -1366,7 +1362,7 @@ def save_module_dict(pickler, obj): logger.trace(pickler, "D1: %s", _repr_dict(obj), obj=obj) pickler.write(GLOBAL + b'__builtin__\n__main__\n') logger.trace(pickler, "# D1") - elif not is_pickler_dill and obj is _main_module.__dict__: + elif not is_pickler_dill and obj is _main_module.__dict__: #pragma: no cover logger.trace(pickler, "D3: %s", _repr_dict(obj), obj=obj) pickler.write(GLOBAL + b'__main__\n__dict__\n') #XXX: works in general? logger.trace(pickler, "# D3") @@ -1887,7 +1883,7 @@ def save_module(pickler, obj): main_dict = obj.__dict__.copy() for item in ('__builtins__', '__loader__'): main_dict.pop(item, None) - for item in IPYTHON_SINGLETONS: + for item in IPYTHON_SINGLETONS: #pragma: no cover if getattr(main_dict.get(item), '__module__', '').startswith('IPython'): del main_dict[item] if is_session_main: diff --git a/dill/session.py b/dill/session.py index 39518d31..a9a15804 100644 --- a/dill/session.py +++ b/dill/session.py @@ -332,7 +332,7 @@ def dump_module( refimported = _getopt(settings, 'refimported', refimported) refonfail = _getopt(settings, 'refonfail', refonfail) base_rules = _getopt(settings, 'filters', base_rules) - if not isinstance(base_rules, ModuleFilters): + if not isinstance(base_rules, ModuleFilters): #pragma: no cover base_rules = ModuleFilters(base_rules) main = module @@ -740,22 +740,22 @@ def load_module_asdict( raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") with 
_open(filename, 'rb', peekable=True) as file: - main_name = _identify_module(file) - main = _import_module(main_name) - main_copy = ModuleType(main_name) + main_qualname = _identify_module(file) + main = _import_module(main_qualname) + main_copy = ModuleType(main_qualname) main_copy.__dict__.clear() main_copy.__dict__.update(main.__dict__) - parent_name = main_name.rpartition('.')[0] + parent_name, _, main_name = main_qualname.rpartition('.') if parent_name: parent = sys.modules[parent_name] try: - sys.modules[main_name] = main_copy + sys.modules[main_qualname] = main_copy if parent_name and getattr(parent, main_name, None) is main: setattr(parent, main_name, main_copy) load_module(file, **kwds) finally: - sys.modules[main_name] = main + sys.modules[main_qualname] = main if parent_name and getattr(parent, main_name, None) is main_copy: setattr(parent, main_name, main) @@ -987,12 +987,12 @@ def ipython_filter(*, keep_history: str = 'input') -> FilterFunction: >>> dill.dump_module(exclude=ipython_filter(keep_history='none')) """ HISTORY_OPTIONS = {'input', 'output', 'both', 'none'} - if keep_history not in HISTORY_OPTIONS: + if keep_history not in HISTORY_OPTIONS: #pragma: no cover raise ValueError( "invalid 'keep_history' argument: %r (must be one of %r)" % (keep_history, HISTORY_OPTIONS) ) - if not _dill.IS_IPYTHON: + if not _dill.IS_IPYTHON: #pragma: no cover # Return no-op filter if not in IPython. 
return (lambda x: False) diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index 50a21d62..cd33d815 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -269,29 +269,41 @@ def lookup(mod, name, obj, lookup_by_name=True): setattr(local_mod, name2, obj) assert lookup(dill, name2, obj) == (local_mod.__name__, name2, False) -def test_refimported_imported_as(): +def test_refimported(): import collections import concurrent.futures import types import typing mod = sys.modules['__test__'] = ModuleType('__test__') + mod.session = dill.session dill.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) mod.Dict = collections.UserDict # select by type mod.AsyncCM = typing.AsyncContextManager # select by __module__ mod.thread_exec = dill.executor # select by __module__ with regex + mod.local_mod = local_mod session_buffer = BytesIO() dill.dump_module(session_buffer, mod, refimported=True) session_buffer.seek(0) mod = dill.load(session_buffer) - del sys.modules['__test__'] + assert mod.__dill_imported == [('dill', 'session')] assert set(mod.__dill_imported_as) == { ('collections', 'UserDict', 'Dict'), ('typing', 'AsyncContextManager', 'AsyncCM'), ('dill', 'executor', 'thread_exec'), } + assert mod.__dill_imported_top_level == [(local_mod.__name__, 'local_mod')] + + session_buffer.seek(0) + dill.load_module(session_buffer, mod) + del sys.modules['__test__'] + assert mod.session is dill.session + assert mod.Dict is collections.UserDict + assert mod.AsyncCM is typing.AsyncContextManager + assert mod.thread_exec is dill.executor + assert mod.local_mod is local_mod def test_refonfail_unpickleable(): global local_mod @@ -362,6 +374,22 @@ def test_load_module_asdict(): assert 'y' in main_vars assert 'empty' in main_vars + # Test a submodule. 
+ import html + from html import entities + entitydefs = entities.entitydefs + + session_buffer = BytesIO() + dill.dump_module(session_buffer, entities) + session_buffer.seek(0) + entities_vars = dill.load_module_asdict(session_buffer) + + assert entities is html.entities # restored + assert entities is sys.modules['html.entities'] # restored + assert entitydefs is entities.entitydefs # unchanged + assert entitydefs is not entities_vars['entitydefs'] # saved by value + assert entitydefs == entities_vars['entitydefs'] + def test_ipython_filter(): from itertools import filterfalse from types import SimpleNamespace @@ -436,7 +464,7 @@ def test_is_pickled_module(): test_session_other() test_runtime_module() test_lookup_module() - test_refimported_imported_as() + test_refimported() test_refonfail_unpickleable() test_load_module_asdict() test_ipython_filter() From 0e796a7f9bb53b4cc59480b4fae82727c430289a Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Fri, 26 Aug 2022 13:33:24 -0300 Subject: [PATCH 095/109] fix test that failed on Travis --- dill/tests/test_session.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index cd33d815..d724d76b 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -276,7 +276,7 @@ def test_refimported(): import typing mod = sys.modules['__test__'] = ModuleType('__test__') - mod.session = dill.session + mod.builtin_module_names = sys.builtin_module_names dill.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) mod.Dict = collections.UserDict # select by type mod.AsyncCM = typing.AsyncContextManager # select by __module__ @@ -288,7 +288,7 @@ def test_refimported(): session_buffer.seek(0) mod = dill.load(session_buffer) - assert mod.__dill_imported == [('dill', 'session')] + assert mod.__dill_imported == [('sys', 'builtin_module_names')] assert set(mod.__dill_imported_as) == { ('collections', 'UserDict', 'Dict'), ('typing', 
'AsyncContextManager', 'AsyncCM'), @@ -299,7 +299,7 @@ def test_refimported(): session_buffer.seek(0) dill.load_module(session_buffer, mod) del sys.modules['__test__'] - assert mod.session is dill.session + assert mod.builtin_module_names is sys.builtin_module_names assert mod.Dict is collections.UserDict assert mod.AsyncCM is typing.AsyncContextManager assert mod.thread_exec is dill.executor From a091024c3211154d207faa7eca56a4372d47bc33 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Fri, 26 Aug 2022 17:37:22 -0300 Subject: [PATCH 096/109] tests for _open(); fix corner case --- dill/_dill.py | 2 +- dill/_utils.py | 12 +++- dill/tests/test_filtering.py | 24 ++++++++ dill/tests/test_session.py | 110 +++++++++++++---------------------- dill/tests/test_utils.py | 62 ++++++++++++++++++++ 5 files changed, 140 insertions(+), 70 deletions(-) create mode 100644 dill/tests/test_utils.py diff --git a/dill/_dill.py b/dill/_dill.py index efb68738..3a5fc341 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -453,7 +453,7 @@ def save_numpy_array(pickler, obj): "# X: fallback to save as global: <%s object at %#012x>" % (type(obj).__name__, id(obj)) ) - # Roll back the stream. + # Roll back the stream, stream.truncate(position) doesn't work for all types. self._file_seek(position) self._file_truncate() # Roll back memo. diff --git a/dill/_utils.py b/dill/_utils.py index 6f451e79..0d9d86e2 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -108,7 +108,17 @@ def _open(file, mode, *, peekable=False, seekable=False): # Wrap stream in a helper class if necessary. if peekable and not hasattr(file, 'peek'): # Try our best to return it as an object with a peek() method. 
- if hasattr(file, 'tell') and hasattr(file, 'seek'): + if hasattr(file, 'seekable'): + file_seekable = file.seekable() + elif hasattr(file, 'seek') and hasattr(file, 'tell'): + try: + file.seek(file.tell()) + file_seekable = True + except Exception: + file_seekable = False + else: + file_seekable = False + if file_seekable: file = _PeekableReader(file, closing=should_close) else: try: diff --git a/dill/tests/test_filtering.py b/dill/tests/test_filtering.py index 172a0de5..08d98708 100644 --- a/dill/tests/test_filtering.py +++ b/dill/tests/test_filtering.py @@ -89,6 +89,30 @@ def test_size_filter(): filter_size = [(EXCLUDE, size_filter(limit=5*small_size))] assert did_exclude(NS_copy, filter_size, excluded_subset={'large'}) +def test_ipython_filter(): + from itertools import filterfalse + from types import SimpleNamespace + _dill.IS_IPYTHON = True # trick ipython_filter + sys.modules['IPython'] = MockIPython = ModuleType('IPython') + + # Mimic the behavior of IPython namespaces at __main__. + user_ns_actual = {'user_var': 1, 'x': 2} + user_ns_hidden = {'x': 3, '_i1': '1 / 2', '_1': 0.5, 'hidden': 4} + user_ns = user_ns_hidden.copy() # user_ns == vars(__main__) + user_ns.update(user_ns_actual) + assert user_ns['x'] == user_ns_actual['x'] # user_ns.x masks user_ns_hidden.x + MockIPython.get_ipython = lambda: SimpleNamespace(user_ns=user_ns, user_ns_hidden=user_ns_hidden) + + # Test variations of keeping or dropping the interpreter history. 
+ user_vars = set(user_ns_actual) + def namespace_matches(keep_history, should_keep_vars): + rules = FilterRules([(EXCLUDE, ipython_filter(keep_history=keep_history))]) + return set(rules.apply_filters(user_ns)) == user_vars | should_keep_vars + assert namespace_matches(keep_history='input', should_keep_vars={'_i1'}) + assert namespace_matches(keep_history='output', should_keep_vars={'_1'}) + assert namespace_matches(keep_history='both', should_keep_vars={'_i1', '_1'}) + assert namespace_matches(keep_history='none', should_keep_vars=set()) + if __name__ == '__main__': test_basic_filtering() test_exclude_include() diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index d724d76b..ea3d3eeb 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -228,6 +228,45 @@ def test_runtime_module(): assert runtime_mod.x == 42 assert runtime_mod not in sys.modules.values() +def test_load_module_asdict(): + with TestNamespace(): + session_buffer = BytesIO() + dill.dump_module(session_buffer) + + global empty, names, x, y + x = y = 0 # change x and create y + del empty + globals_state = globals().copy() + + session_buffer.seek(0) + main_vars = dill.load_module_asdict(session_buffer) + + assert main_vars is not globals() + assert globals() == globals_state + + assert main_vars['__name__'] == '__main__' + assert main_vars['names'] == names + assert main_vars['names'] is not names + assert main_vars['x'] != x + assert 'y' in main_vars + assert 'empty' in main_vars + + # Test a submodule. 
+ import html + from html import entities + entitydefs = entities.entitydefs + + session_buffer = BytesIO() + dill.dump_module(session_buffer, entities) + session_buffer.seek(0) + entities_vars = dill.load_module_asdict(session_buffer) + + assert entities is html.entities # restored + assert entities is sys.modules['html.entities'] # restored + assert entitydefs is entities.entitydefs # unchanged + assert entitydefs is not entities_vars['entitydefs'] # saved by value + assert entitydefs == entities_vars['entitydefs'] + def test_lookup_module(): assert not dill._dill._is_builtin_module(local_mod) and local_mod.__package__ == '' @@ -305,7 +344,7 @@ def test_refimported(): assert mod.thread_exec is dill.executor assert mod.local_mod is local_mod -def test_refonfail_unpickleable(): +def test_unpickleable_var(): global local_mod import keyword as builtin_mod from dill._dill import _global_string @@ -351,70 +390,6 @@ def dump_with_ref(mod, other_mod): dill.session.settings['refonfail'] = refonfail_default -def test_load_module_asdict(): - with TestNamespace(): - session_buffer = BytesIO() - dill.dump_module(session_buffer) - - global empty, names, x, y - x = y = 0 # change x and create y - del empty - globals_state = globals().copy() - - session_buffer.seek(0) - main_vars = dill.load_module_asdict(session_buffer) - - assert main_vars is not globals() - assert globals() == globals_state - - assert main_vars['__name__'] == '__main__' - assert main_vars['names'] == names - assert main_vars['names'] is not names - assert main_vars['x'] != x - assert 'y' in main_vars - assert 'empty' in main_vars - - # Test a submodule. 
- import html - from html import entities - entitydefs = entities.entitydefs - - session_buffer = BytesIO() - dill.dump_module(session_buffer, entities) - session_buffer.seek(0) - entities_vars = dill.load_module_asdict(session_buffer) - - assert entities is html.entities # restored - assert entities is sys.modules['html.entities'] # restored - assert entitydefs is entities.entitydefs # unchanged - assert entitydefs is not entities_vars['entitydefs'] # saved by value - assert entitydefs == entities_vars['entitydefs'] - -def test_ipython_filter(): - from itertools import filterfalse - from types import SimpleNamespace - from dill._utils import FilterRules - dill._dill.IS_IPYTHON = True # trick ipython_filter - sys.modules['IPython'] = MockIPython = ModuleType('IPython') - - # Mimic the behavior of IPython namespaces at __main__. - user_ns_actual = {'user_var': 1, 'x': 2} - user_ns_hidden = {'x': 3, '_i1': '1 / 2', '_1': 0.5, 'hidden': 4} - user_ns = user_ns_hidden.copy() # user_ns == vars(__main__) - user_ns.update(user_ns_actual) - assert user_ns['x'] == user_ns_actual['x'] # user_ns.x masks user_ns_hidden.x - MockIPython.get_ipython = lambda: SimpleNamespace(user_ns=user_ns, user_ns_hidden=user_ns_hidden) - - # Test variations of keeping or dropping the interpreter history. 
- user_vars = set(user_ns_actual) - def namespace_matches(keep_history, should_keep_vars): - rules = FilterRules([(EXCLUDE, ipython_filter(keep_history=keep_history))]) - return set(rules.apply_filters(user_ns)) == user_vars | should_keep_vars - assert namespace_matches(keep_history='input', should_keep_vars={'_i1'}) - assert namespace_matches(keep_history='output', should_keep_vars={'_1'}) - assert namespace_matches(keep_history='both', should_keep_vars={'_i1', '_1'}) - assert namespace_matches(keep_history='none', should_keep_vars=set()) - def test_is_pickled_module(): import tempfile import warnings @@ -463,9 +438,8 @@ def test_is_pickled_module(): test_session_main(refimported=True) test_session_other() test_runtime_module() + test_load_module_asdict() test_lookup_module() test_refimported() - test_refonfail_unpickleable() - test_load_module_asdict() - test_ipython_filter() + test_unpickleable_var() test_is_pickled_module() diff --git a/dill/tests/test_utils.py b/dill/tests/test_utils.py new file mode 100644 index 00000000..b46d033b --- /dev/null +++ b/dill/tests/test_utils.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python + +# Author: Leonardo Gama (@leogama) +# Copyright (c) 2022 The Uncertainty Quantification Foundation. +# License: 3-clause BSD. 
The full license text is available at: +# - https://github.com/uqfoundation/dill/blob/master/LICENSE + +"""test general utilities in _utils.py (for filters, see test_filtering.py)""" + +import io +import os +import sys + +from dill import _utils + +def test_open(): + file_unpeekable = open(__file__, 'rb', buffering=0) + assert not hasattr(file_unpeekable, 'peek') + + content = file_unpeekable.read() + peeked_chars = content[:10] + first_line = content[:100].partition(b'\n')[0] + b'\n' + file_unpeekable.seek(0) + + # Test _PeekableReader for seekable stream + with _utils._open(file_unpeekable, 'r', peekable=True) as file: + assert file.peek(10)[:10] == peeked_chars + assert file.readline() == first_line + assert isinstance(file, _utils._PeekableReader) + assert not file_unpeekable.closed + file_unpeekable.close() + + _pipe_r, _pipe_w = os.pipe() + pipe_r, pipe_w = io.FileIO(_pipe_r, closefd=False), io.FileIO(_pipe_w, mode='w') + assert not hasattr(pipe_r, 'peek') + assert not pipe_r.seekable() + assert not pipe_w.seekable() + + # Test io.BufferedReader for unseekable stream + with _utils._open(pipe_r, 'r', peekable=True) as file: + pipe_w.write(content[:100]) + assert file.peek(10)[:10] == peeked_chars + assert file.readline() == first_line + assert isinstance(file, io.BufferedReader) + assert not pipe_r.closed + + # Test _Seekable writer for unseekable stream + with _utils._open(pipe_w, 'w', seekable=True) as file: + # pipe_r is closed here for some reason... 
+ file.write(content) + file.flush() + file.seek(0) + file.truncate() + file.write(b'a line of text\n') + assert not pipe_w.closed + pipe_r = io.FileIO(_pipe_r) + assert pipe_r.readline() == b'a line of text\n' + pipe_r.close() + pipe_w.close() + +if __name__ == '__main__': + test_open() From 785f419cf8555882af7258502c5589c56059ade9 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Fri, 26 Aug 2022 17:54:46 -0300 Subject: [PATCH 097/109] tests for _format_bytes_size() --- dill/tests/test_utils.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/dill/tests/test_utils.py b/dill/tests/test_utils.py index b46d033b..4a247f72 100644 --- a/dill/tests/test_utils.py +++ b/dill/tests/test_utils.py @@ -31,7 +31,8 @@ def test_open(): file_unpeekable.close() _pipe_r, _pipe_w = os.pipe() - pipe_r, pipe_w = io.FileIO(_pipe_r, closefd=False), io.FileIO(_pipe_w, mode='w') + pipe_r = io.FileIO(_pipe_r, closefd=False) + pipe_w = io.FileIO(_pipe_w, mode='w') assert not hasattr(pipe_r, 'peek') assert not pipe_r.seekable() assert not pipe_w.seekable() @@ -44,7 +45,7 @@ def test_open(): assert isinstance(file, io.BufferedReader) assert not pipe_r.closed - # Test _Seekable writer for unseekable stream + # Test _SeekableWriter for unseekable stream with _utils._open(pipe_w, 'w', seekable=True) as file: # pipe_r is closed here for some reason... 
file.write(content) @@ -58,5 +59,14 @@ def test_open(): pipe_r.close() pipe_w.close() +def test_format_bytes(): + formatb = _utils._format_bytes_size + assert formatb(1000) == (1000, 'B') + assert formatb(1024) == (1, 'KiB') + assert formatb(1024 + 511) == (1, 'KiB') + assert formatb(1024 + 512) == (2, 'KiB') + assert formatb(10**9) == (954, 'MiB') + if __name__ == '__main__': test_open() + test_format_bytes() From a624f4406c36d583db71eb3abe0d07d308f7a37a Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 27 Aug 2022 14:01:04 -0300 Subject: [PATCH 098/109] tests for module-specific filters and fallbacks --- dill/tests/test_filtering.py | 112 ++++++++++++++++++++++++++++------- dill/tests/test_utils.py | 23 +++---- 2 files changed, 104 insertions(+), 31 deletions(-) diff --git a/dill/tests/test_filtering.py b/dill/tests/test_filtering.py index 08d98708..259b5fd6 100644 --- a/dill/tests/test_filtering.py +++ b/dill/tests/test_filtering.py @@ -6,12 +6,13 @@ # - https://github.com/uqfoundation/dill/blob/master/LICENSE import sys +from types import ModuleType +import dill from dill import _dill -from dill._utils import FilterRules, RuleType, size_filter - -EXCLUDE = RuleType.EXCLUDE -INCLUDE = RuleType.INCLUDE +from dill.session import ( + EXCLUDE, INCLUDE, FilterRules, RuleType, ipython_filter, size_filter, settings +) NS = { 'a': 1, @@ -72,22 +73,74 @@ def test_add_type(): NS_copy.update(F=test_basic_filtering, T=FilterRules, M=_dill) assert did_exclude(NS_copy, type_rules, excluded_subset={'F', 'T', 'M', 'Integer'}) -def test_size_filter(): - from sys import getsizeof - estimate = size_filter.estimate_size - - small = list(range(100)) - large = list(range(1000)) - reflarge = 10*[small] - small_size = getsizeof(small) + 100*getsizeof(0) - large_size = getsizeof(large) + 1000*getsizeof(0) - assert small_size <= estimate(small) < estimate(reflarge) < large_size <= estimate(large) - - NS_copy = NS.copy() # all base objects are small and should not be excluded - 
reflarge.append(reflarge) # recursive reference
-    NS_copy.update(small=small, large=large, reflarge=reflarge)
-    filter_size = [(EXCLUDE, size_filter(limit=5*small_size))]
-    assert did_exclude(NS_copy, filter_size, excluded_subset={'large'})
+def test_module_filters():
+    R"""Test filters specific for a module and fallback to parent module or default.
+
+    The settings['filters'] single-branched tree structure in these tests:
+
+    exclude:        {r'_.*[^_]'}        None              None
+                        /                /                 /
+            *-------------*  *-------------*  *-------------*  *~~~~~~~~~~~~~*
+    module: |    DEFAULT   |-|     foo*     |-|    foo.bar   |  | foo.bar.baz |
+            *-------------*  *-------------*  *-------------*  *~~~~~~~~~~~~~*
+                \                \                \      \_____   _____/
+    include:  {'_keep'}         None         {} (empty)        V
+                                                            missing
+    (*) 'foo' is a placeholder node
+    """
+    import io
+    foo = sys.modules['foo'] = ModuleType('foo')
+    foo.bar = sys.modules['foo.bar'] = ModuleType('foo.bar')
+    foo.bar.baz = sys.modules['foo.bar.baz'] = ModuleType('foo.bar.baz')
+    NS = {'_filter': 1, '_keep': 2}
+
+    def _dump_load_dict(module):
+        module.__dict__.update(NS)
+        buf = io.BytesIO()
+        dill.dump_module(buf, module)
+        for var in NS:
+            delattr(module, var)
+        buf.seek(0)
+        return dill.load_module_asdict(buf)
+
+    # Empty default filters
+    filters = settings['filters']
+    saved = _dump_load_dict(foo)
+    assert '_filter' in saved
+    assert '_keep' in saved
+
+    # Default filters
+    filters.exclude.add(r'_.*[^_]')
+    filters.include.add('_keep')
+    assert filters.get_rules('foo') is filters
+    saved = _dump_load_dict(foo)
+    assert '_filter' not in saved
+    assert '_keep' in saved
+
+    # Add filters to 'foo.bar' and placeholder node for 'foo'
+    filters['foo.bar'] = ()
+    del filters.foo.bar.exclude # remove empty exclude filters, fall back to default
+    assert not hasattr(filters.foo, 'exclude') and not hasattr(filters.foo, 'include')
+    assert not hasattr(filters.foo.bar, 'exclude') and hasattr(filters.foo.bar, 'include')
+
+    # foo: placeholder node falling back to default
+    assert 
filters.foo.get_filters(EXCLUDE) is filters.exclude + saved = _dump_load_dict(foo) + assert '_filter' not in saved + assert '_keep' in saved + + # foo.bar: without exclude rules, with (empty) include rules + assert filters.foo.bar.get_filters(EXCLUDE) is filters.exclude + assert filters.foo.bar.get_filters(INCLUDE) is filters.foo.bar.include + saved = _dump_load_dict(foo.bar) + assert '_filter' not in saved + assert '_keep' not in saved + + # foo.bar.baz: without specific filters, falling back to foo.bar + assert filters.get_rules('foo.bar.baz') is filters.foo.bar + saved = _dump_load_dict(foo.bar.baz) + assert '_filter' not in saved + assert '_keep' not in saved def test_ipython_filter(): from itertools import filterfalse @@ -113,9 +166,28 @@ def namespace_matches(keep_history, should_keep_vars): assert namespace_matches(keep_history='both', should_keep_vars={'_i1', '_1'}) assert namespace_matches(keep_history='none', should_keep_vars=set()) +def test_size_filter(): + from sys import getsizeof + estimate = size_filter.estimate_size + + small = list(range(100)) + large = list(range(1000)) + reflarge = 10*[small] + small_size = getsizeof(small) + 100*getsizeof(0) + large_size = getsizeof(large) + 1000*getsizeof(0) + assert small_size <= estimate(small) < estimate(reflarge) < large_size <= estimate(large) + + NS_copy = NS.copy() # all base objects are small and should not be excluded + reflarge.append(reflarge) # recursive reference + NS_copy.update(small=small, large=large, reflarge=reflarge) + filter_size = [(EXCLUDE, size_filter(limit=5*small_size))] + assert did_exclude(NS_copy, filter_size, excluded_subset={'large'}) + if __name__ == '__main__': test_basic_filtering() test_exclude_include() test_add_type() + test_module_filters() + test_ipython_filter() if not _dill.IS_PYPY: test_size_filter() diff --git a/dill/tests/test_utils.py b/dill/tests/test_utils.py index 4a247f72..32757773 100644 --- a/dill/tests/test_utils.py +++ b/dill/tests/test_utils.py @@ -13,6 
+13,14 @@ from dill import _utils +def test_format_bytes(): + formatb = _utils._format_bytes_size + assert formatb(1000) == (1000, 'B') + assert formatb(1024) == (1, 'KiB') + assert formatb(1024 + 511) == (1, 'KiB') + assert formatb(1024 + 512) == (2, 'KiB') + assert formatb(10**9) == (954, 'MiB') + def test_open(): file_unpeekable = open(__file__, 'rb', buffering=0) assert not hasattr(file_unpeekable, 'peek') @@ -24,9 +32,9 @@ def test_open(): # Test _PeekableReader for seekable stream with _utils._open(file_unpeekable, 'r', peekable=True) as file: + assert isinstance(file, _utils._PeekableReader) assert file.peek(10)[:10] == peeked_chars assert file.readline() == first_line - assert isinstance(file, _utils._PeekableReader) assert not file_unpeekable.closed file_unpeekable.close() @@ -39,15 +47,16 @@ def test_open(): # Test io.BufferedReader for unseekable stream with _utils._open(pipe_r, 'r', peekable=True) as file: + assert isinstance(file, io.BufferedReader) pipe_w.write(content[:100]) assert file.peek(10)[:10] == peeked_chars assert file.readline() == first_line - assert isinstance(file, io.BufferedReader) assert not pipe_r.closed # Test _SeekableWriter for unseekable stream with _utils._open(pipe_w, 'w', seekable=True) as file: # pipe_r is closed here for some reason... 
+        assert isinstance(file, _utils._SeekableWriter)
         file.write(content)
         file.flush()
         file.seek(0)
@@ -59,14 +68,6 @@ def test_open():
     pipe_r.close()
     pipe_w.close()
 
-def test_format_bytes():
-    formatb = _utils._format_bytes_size
-    assert formatb(1000) == (1000, 'B')
-    assert formatb(1024) == (1, 'KiB')
-    assert formatb(1024 + 511) == (1, 'KiB')
-    assert formatb(1024 + 512) == (2, 'KiB')
-    assert formatb(10**9) == (954, 'MiB')
-
 if __name__ == '__main__':
-    test_open()
     test_format_bytes()
+    test_open()

From a7fe25e988199568f52d67795b60dd853faee9d0 Mon Sep 17 00:00:00 2001
From: Leonardo Gama
Date: Sat, 27 Aug 2022 16:16:44 -0300
Subject: [PATCH 099/109] Renamed .logger to .logging and new TRACE log level

- Renamed dill.logger to dill.logging, so that it can be used in place of
  logging internally, and the name logger can be used for the actual logger;
- It exposes the logging levels (INFO, DEBUG, etc.) and getLogger(), which is
  all that a submodule needs in most cases;
- It sets a new logging level named TRACE to be used for the pickling trace
- Other info messages, like listing variables excluded from a module pickle by
  filters, can be shown without showing the trace
- Set the 'stacklevel' of some warnings to reduce clutter
---
 dill/__init__.py                              |  4 ++--
 dill/_dill.py                                 | 23 +++++++++++++++----
 dill/detect.py                                |  2 +-
 dill/{logger.py => logging.py}                | 21 ++++++++++++-----
 dill/session.py                               | 19 ++++++++++-----
 .../tests/{test_logger.py => test_logging.py} |  2 +-
 docs/source/dill.rst                          |  4 ++--
 7 files changed, 52 insertions(+), 23 deletions(-)
 rename dill/{logger.py => logging.py} (95%)
 rename dill/tests/{test_logger.py => test_logging.py} (97%)

diff --git a/dill/__init__.py b/dill/__init__.py
index 3571f54e..6a012270 100644
--- a/dill/__init__.py
+++ b/dill/__init__.py
@@ -34,13 +34,13 @@
     dump_module, load_module, load_module_asdict, is_pickled_module,
     dump_session, load_session # backward compatibility
 )
-from . import detect, logger, session, source, temp
+from . 
import detect, logging, session, source, temp # get global settings from .settings import settings # make sure "trace" is turned off -logger.trace(False) +logging.trace(False) objects = {} # local import of dill._objects diff --git a/dill/_dill.py b/dill/_dill.py index 3a5fc341..356b26fa 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -36,8 +36,9 @@ __module__ = 'dill' import warnings -from .logger import adapter as logger -from .logger import trace as _trace +from dill import logging +from .logging import adapter as logger +from .logging import trace as _trace import os import sys @@ -477,6 +478,7 @@ def save_numpy_array(pickler, obj): try: self.save_global(obj) logger.trace(self, message, obj=obj) + return True # for _saved_byref, ignored otherwise except PicklingError as error: # Roll back trace state. logger.roll_back(self, obj) @@ -1333,8 +1335,10 @@ def _save_module_dict(pickler, main_dict): for name, value in main_dict.items(): pickler.save(name) try: - pickler.save(value) + if pickler.save(value): + pickler._saved_byref.append(name) except UNPICKLEABLE_ERRORS as error_stack: + pickler._saved_byref.append(name) if modmap is None: modmap = _module_map(main) modname, objname, installed = _lookup_module(modmap, name, value) @@ -1964,9 +1968,18 @@ def save_type(pickler, obj, postproc_list=None): else: logger.trace(pickler, "T4: %s", obj) if incorrectly_named: - warnings.warn('Cannot locate reference to %r.' % (obj,), PicklingWarning) + warnings.warn( + "Cannot locate reference to %r." % (obj,), + PicklingWarning, + stacklevel=3, + ) if obj_recursive: - warnings.warn('Cannot pickle %r: %s.%s has recursive self-references that trigger a RecursionError.' % (obj, obj.__module__, obj_name), PicklingWarning) + warnings.warn( + "Cannot pickle %r: %s.%s has recursive self-references that " + "trigger a RecursionError." 
% (obj, obj.__module__, obj_name), + PicklingWarning, + stacklevel=3, + ) #print (obj.__dict__) #print ("%s\n%s" % (type(obj), obj.__name__)) #print ("%s\n%s" % (obj.__bases__, obj.__dict__)) diff --git a/dill/detect.py b/dill/detect.py index b6a6cb76..e6149d15 100644 --- a/dill/detect.py +++ b/dill/detect.py @@ -13,7 +13,7 @@ from inspect import ismethod, isfunction, istraceback, isframe, iscode from .pointers import parent, reference, at, parents, children -from .logger import trace +from .logging import trace __all__ = ['baditems','badobjects','badtypes','code','errors','freevars', 'getmodule','globalvars','nestedcode','nestedglobals','outermost', diff --git a/dill/logger.py b/dill/logging.py similarity index 95% rename from dill/logger.py rename to dill/logging.py index b2c9b6e1..428d6436 100644 --- a/dill/logger.py +++ b/dill/logging.py @@ -11,7 +11,9 @@ The 'logger' object is dill's top-level logger. The 'adapter' object wraps the logger and implements a 'trace()' method that -generates a detailed tree-style trace for the pickling call at log level INFO. +generates a detailed tree-style trace for the pickling call at log level +:const:`dill.logging.TRACE`, which has an intermediary value between +:const:`logging.INFO` and :const:`logging.DEGUB`. The 'trace()' function sets and resets dill's logger log level, enabling and disabling the pickling trace. @@ -44,7 +46,10 @@ from __future__ import annotations -__all__ = ['adapter', 'logger', 'trace'] +__all__ = [ + 'adapter', 'logger', 'trace', 'getLogger', + 'CRITICAL', 'ERROR', 'WARNING', 'INFO', 'TRACE', 'DEBUG', 'NOTSET', +] import codecs import contextlib @@ -53,12 +58,16 @@ import math import os from contextlib import suppress +from logging import getLogger, CRITICAL, ERROR, WARNING, INFO, DEBUG, NOTSET from functools import partial from typing import Optional, TextIO, Union import dill from ._utils import _format_bytes_size +# Intermediary logging level for tracing. 
+TRACE = (INFO + DEBUG) // 2 + # Tree drawing characters: Unicode to ASCII map. ASCII_MAP = str.maketrans({"│": "|", "├": "|", "┬": "+", "└": "`"}) @@ -144,7 +153,7 @@ def trace_setup(self, pickler): # Called by Pickler.dump(). if not dill._dill.is_dill(pickler, child=False): return - elif self.isEnabledFor(logging.INFO): + elif self.isEnabledFor(TRACE): pickler._trace_stack = [] pickler._size_stack = [] else: @@ -236,7 +245,7 @@ def format(self, record): vars(record).update(fields) return super().format(record) -logger = logging.getLogger('dill') +logger = getLogger('dill') logger.propagate = False adapter = TraceAdapter(logger) stderr_handler = logging._StderrHandler() @@ -286,7 +295,7 @@ def trace( argument """ if isinstance(arg, bool): - logger.setLevel(logging.INFO if arg else logging.WARNING) + logger.setLevel(TRACE if arg else WARNING) return else: return TraceManager(file=arg, mode=mode) @@ -308,7 +317,7 @@ def __enter__(self): adapter.removeHandler(stderr_handler) adapter.addHandler(self.handler) self.old_level = adapter.getEffectiveLevel() - adapter.setLevel(logging.INFO) + adapter.setLevel(TRACE) return adapter.info def __exit__(self, *exc_info): adapter.setLevel(self.old_level) diff --git a/dill/session.py b/dill/session.py index a9a15804..a9e5603d 100644 --- a/dill/session.py +++ b/dill/session.py @@ -43,14 +43,12 @@ 'dump_session', 'load_session' # backward compatibility ] -import logging -logger = logging.getLogger(__name__) - import re import sys import warnings -from dill import _dill, Pickler, Unpickler, UnpicklingError +from dill import _dill, logging +from dill import Pickler, Unpickler, UnpicklingError from ._dill import ( BuiltinMethodType, FunctionType, MethodType, ModuleType, TypeType, _getopt, _import_module, _is_builtin_module, _is_imported_module, @@ -58,6 +56,8 @@ ) from ._utils import FilterRules, FilterSet, _open, size_filter, EXCLUDE, INCLUDE +logger = logging.getLogger(__name__) + # Type hints. 
from typing import Any, Dict, Iterable, Optional, Union from ._utils import Filter, FilterFunction, NamedObject, Rule, RuleType @@ -151,7 +151,7 @@ def _filter_vars(main_module, exclude, include, base_rules): excluded = {name: type(value).__name__ for name, value in sorted(main_module.__dict__.items()) if name not in namespace} excluded = str(excluded).translate({ord(","): "\n ", ord("'"): None}) - logger.info("Objects excluded from dump_session():\n %s", excluded) + logger.info("[dump_module] Variables excluded by filtering:\n %s", excluded) newmod = ModuleType(main_module.__name__) newmod.__dict__.update(namespace) @@ -360,12 +360,19 @@ def dump_module( pickler._refonfail = True # False by default pickler._file_seek = file.seek pickler._file_truncate = file.truncate + pickler._saved_byref = [] if refimported: # Cache modmap for refonfail. pickler._modmap = modmap - if logger.isEnabledFor(logging.INFO): + if logger.isEnabledFor(logging.TRACE): pickler._id_to_name = {id(v): k for k, v in main.__dict__.items()} pickler.dump(main) + if pickler._saved_byref and logger.isEnabledFor(logging.INFO): + import textwrap + pickler._saved_byref.sort() + message = "[dump_module] Variables saved by reference (refonfail): " + message += str(pickler._saved_byref).replace("'", "")[1:-1] + logger.info("\n".join(textwrap.wrap(message, width=80))) return # Backward compatibility. 
diff --git a/dill/tests/test_logger.py b/dill/tests/test_logging.py similarity index 97% rename from dill/tests/test_logger.py rename to dill/tests/test_logging.py index b4e4881a..ed33e6c4 100644 --- a/dill/tests/test_logger.py +++ b/dill/tests/test_logging.py @@ -11,7 +11,7 @@ import dill from dill import detect -from dill.logger import stderr_handler, adapter as logger +from dill.logging import stderr_handler, adapter as logger try: from StringIO import StringIO diff --git a/docs/source/dill.rst b/docs/source/dill.rst index e18607db..0708e6f3 100644 --- a/docs/source/dill.rst +++ b/docs/source/dill.rst @@ -13,10 +13,10 @@ detect module .. automodule:: dill.detect .. :exclude-members: +ismethod, isfunction, istraceback, isframe, iscode, parent, reference, at, parents, children -logger module +logging module ------------- -.. automodule:: dill.logger +.. automodule:: dill.logging :exclude-members: +trace objtypes module From 04c837810ea197f0fd98a5b55f4976b736b98aa6 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 27 Aug 2022 17:00:23 -0300 Subject: [PATCH 100/109] allow trace() to set a custom log level for showing INFO messages --- dill/logging.py | 21 ++++++++++++++------- dill/session.py | 5 +++++ docs/source/dill.rst | 4 ++-- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/dill/logging.py b/dill/logging.py index 428d6436..92386e0c 100644 --- a/dill/logging.py +++ b/dill/logging.py @@ -68,6 +68,11 @@ # Intermediary logging level for tracing. TRACE = (INFO + DEBUG) // 2 +_nameOrBoolToLevel = logging._nameToLevel.copy() +_nameOrBoolToLevel['TRACE'] = TRACE +_nameOrBoolToLevel[False] = WARNING +_nameOrBoolToLevel[True] = TRACE + # Tree drawing characters: Unicode to ASCII map. 
ASCII_MAP = str.maketrans({"│": "|", "├": "|", "┬": "+", "└": "`"}) @@ -252,11 +257,12 @@ def format(self, record): adapter.addHandler(stderr_handler) def trace( - arg: Union[bool, TextIO, str, os.PathLike] = None, *, mode: str = 'a' + arg: Union[bool, str, TextIO, os.PathLike] = None, *, mode: str = 'a' ) -> Optional[TraceManager]: """print a trace through the stack when pickling; useful for debugging - With a single boolean argument, enable or disable the tracing. + With a single boolean argument, enable or disable the tracing. Or, with a + logging level name (not ``int``), set the logging level of the dill logger. Example usage: @@ -289,13 +295,14 @@ def trace( >>> dumps(squared) Parameters: - arg: a boolean value, or an optional file-like or path-like object for - the context manager + arg: a boolean value, the name of a logging level (including "TRACE") + or an optional file-like or path-like object for the context manager mode: mode string for ``open()`` if a file name is passed as the first - argument + argument """ - if isinstance(arg, bool): - logger.setLevel(TRACE if arg else WARNING) + level = _nameOrBoolToLevel.get(arg) if isinstance(arg, (bool, str)) else None + if level is not None: + logger.setLevel(level) return else: return TraceManager(file=arg, mode=mode) diff --git a/dill/session.py b/dill/session.py index a9e5603d..37694fdd 100644 --- a/dill/session.py +++ b/dill/session.py @@ -27,6 +27,11 @@ In turn, :func:`load_module_asdict` allows one to load the variables from different saved states of the same module into dictionaries. +Using :func:`dill.detect.trace` enables the complete pickling trace of a +module. Alternatively, ``dill.detect.trace('INFO')`` enables only the messages +about variables excluded by filtering or unpickleable variables saved by +reference in the pickled module's namespace. 
+ Note: Contrary of using :func:`dill.dump` and :func:`dill.load` to save and load a module object, :func:`dill.dump_module` always tries to pickle the module diff --git a/docs/source/dill.rst b/docs/source/dill.rst index 0708e6f3..67839523 100644 --- a/docs/source/dill.rst +++ b/docs/source/dill.rst @@ -14,7 +14,7 @@ detect module .. :exclude-members: +ismethod, isfunction, istraceback, isframe, iscode, parent, reference, at, parents, children logging module -------------- +-------------- .. automodule:: dill.logging :exclude-members: +trace @@ -32,7 +32,7 @@ pointers module .. :exclude-members: + session module ---------------- +-------------- .. automodule:: dill.session :exclude-members: +dump_session, load_session From 6d706536195696367ebfbd993b8109bb9f1a8376 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 27 Aug 2022 17:04:56 -0300 Subject: [PATCH 101/109] complement warning about modules saved by reference --- dill/_dill.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dill/_dill.py b/dill/_dill.py index 356b26fa..a060cce2 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -469,7 +469,9 @@ def save_numpy_array(pickler, obj): logger.trace(self, message, obj=obj) warnings.warn( "module %r saved by reference due to the unpickleable " - "variable %r" % (self._main.__name__, error_stack.name), + "variable %r. No changes to the module were saved. " + "Unpickleable variables can be ignored with filters." 
+ % (self._main.__name__, error_stack.name), PicklingWarning, stacklevel=5, ) From 52642796c9dbb0ea5dc6edb056372560f5a5f059 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 27 Aug 2022 22:19:12 -0300 Subject: [PATCH 102/109] refonfail/refimported: exceptions to saving as reference to other modules' objects --- dill/_dill.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index a060cce2..6771ffc7 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -1266,6 +1266,7 @@ def _module_map(main_module): by_name=defaultdict(list), by_id=defaultdict(list), top_level={}, # top-level modules + module = main_module.__name__, package = _module_package(main_module), ) for modname, module in sys.modules.items(): @@ -1297,6 +1298,13 @@ def _lookup_module(modmap, name, obj, lookup_by_id=True) -> typing.Tuple[str, st and a boolean flag, which is `True` if the module falls under categories (1) to (3) from the hierarchy, or `False` if it's in category (4). """ + not_found = None, None, None + # Don't look for objects likely related to the module itself. + obj_module = getattr(obj, '__module__', type(obj).__module__) + if obj_module == modmap.module: + return not_found + obj_package = _module_package(_import_module(obj_module, safe=True)) + for map, by_id in [(modmap.by_name, False), (modmap.by_id, True)]: if by_id and not lookup_by_id: break @@ -1304,12 +1312,18 @@ def _lookup_module(modmap, name, obj, lookup_by_id=True) -> typing.Tuple[str, st key = id(obj) if by_id else name for other, modname in map[key]: if by_id or other is obj: - other_module = sys.modules[modname] other_name = other if by_id else name + other_module = sys.modules[modname] + other_package = _module_package(other_module) + # Don't return a reference to a module of another package + # if the object is likely from the same package. 
+ if (modmap.package and obj_package == modmap.package + and other_package != modmap.package): + continue # Prefer modules imported earlier (first found). if _is_stdlib_module(other_module): return modname, other_name, True - elif modmap.package and modmap.package == _module_package(other_module): + elif modmap.package and modmap.package == other_package: if _2nd_choice: continue _2nd_choice = modname, other_name, True elif not _2nd_choice: @@ -1323,7 +1337,7 @@ def _lookup_module(modmap, name, obj, lookup_by_id=True) -> typing.Tuple[str, st found = _2nd_choice or _3rd_choice or _4th_choice if found: return found - return None, None, None + return not_found def _global_string(modname, name): return GLOBAL + bytes('%s\n%s\n' % (modname, name), 'UTF-8') @@ -1854,6 +1868,7 @@ def _is_stdlib_module(module): else: return first_level in sys.stdlib_module_names +@_weak_cache(defaults={None: None}) def _module_package(module): package = getattr(module, '__package__', None) return package.partition('.')[0] if package else None From 989ebe431f9e89461dc2e1fb0c0f771cf6f6a1b0 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 30 Aug 2022 20:16:57 -0300 Subject: [PATCH 103/109] refimported: use original_main for building modmap; exclude small integers and empty strings --- dill/session.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dill/session.py b/dill/session.py index 37694fdd..d6d5a27b 100644 --- a/dill/session.py +++ b/dill/session.py @@ -97,6 +97,8 @@ def _stash_modules(main_module): for name, obj in main_module.__dict__.items(): # Avoid incorrectly matching a singleton value in another package (e.g. __doc__ == None). 
if (any(obj is constant for constant in BUILTIN_CONSTANTS) # must compare by identity + or type(obj) is str and obj == '' # for cases like: __package__ == '' + or type(obj) is int and -128 <= obj <= 256 # small values or CPython-internalized or isinstance(obj, ModuleType) and _is_builtin_module(obj) # always saved by ref or obj is main_module or obj is main_module.__dict__): original[name] = obj @@ -350,7 +352,7 @@ def dump_module( original_main = main main = _filter_vars(main, exclude, include, base_rules) if refimported: - main, modmap = _stash_modules(main) + main, modmap = _stash_modules(original_main) with _open(filename, 'wb', seekable=True) as file: pickler = Pickler(file, protocol, **kwds) pickler._main = main #FIXME: dill.settings are disabled From 6650a99f710abba0ee6b19e6a5802832c20f1d6f Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 30 Aug 2022 20:17:37 -0300 Subject: [PATCH 104/109] dump_module: also list variables saved by reference with 'refimported' --- dill/_dill.py | 6 ++++-- dill/session.py | 29 ++++++++++++++++++++--------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 6771ffc7..62c38659 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -1352,16 +1352,18 @@ def _save_module_dict(pickler, main_dict): pickler.save(name) try: if pickler.save(value): - pickler._saved_byref.append(name) + global_name = getattr(value, '__qualname__', value.__name__) + pickler._saved_byref.append((name, value.__module__, global_name)) except UNPICKLEABLE_ERRORS as error_stack: - pickler._saved_byref.append(name) if modmap is None: modmap = _module_map(main) modname, objname, installed = _lookup_module(modmap, name, value) if modname and (installed or not is_builtin): pickler.write(_global_string(modname, objname)) + pickler._saved_byref.append((name, modname, objname)) elif is_builtin: pickler.write(_global_string(main.__name__, name)) + pickler._saved_byref.append((name, main.__name__, name)) else: error = 
PicklingError("can't save variable %r as global" % name) error.name = name diff --git a/dill/session.py b/dill/session.py index d6d5a27b..074c7418 100644 --- a/dill/session.py +++ b/dill/session.py @@ -29,8 +29,9 @@ Using :func:`dill.detect.trace` enables the complete pickling trace of a module. Alternatively, ``dill.detect.trace('INFO')`` enables only the messages -about variables excluded by filtering or unpickleable variables saved by -reference in the pickled module's namespace. +about variables excluded by filtering or variables saved by reference (by +effect of the `refimported` or the `refonfail` option) in the pickled module's +namespace. Note: Contrary of using :func:`dill.dump` and :func:`dill.load` to save and load @@ -48,6 +49,7 @@ 'dump_session', 'load_session' # backward compatibility ] +import pprint import re import sys import warnings @@ -127,6 +129,14 @@ def _stash_modules(main_module): newmod.__dill_imported_as = imported_as newmod.__dill_imported_top_level = imported_top_level _discard_added_variables(newmod, main_module.__dict__) + + if logger.isEnabledFor(logging.INFO): + refimported = [(name, "%s.%s" % (mod, name)) for mod, name in imported] + refimported += [(name, "%s.%s" % (mod, objname)) for mod, objname, name in imported_as] + refimported += [(name, mod) for mod, name in imported_top_level] + message = "[dump_module] Variables saved by reference (refimported):\n" + logger.info(message + _format_log_dict(dict(refimported))) + return newmod, modmap else: return main_module, modmap @@ -139,6 +149,9 @@ def _restore_modules(unpickler, main_module): for modname, name in main_module.__dict__.pop('__dill_imported_top_level', ()): main_module.__dict__[name] = _import_module(modname) +def _format_log_dict(dict): + return pprint.pformat(dict, compact=True, sort_dicts=True).replace("'", "") + def _filter_vars(main_module, exclude, include, base_rules): """apply exclude/include filters from arguments *and* settings""" rules = FilterRules() @@ -157,8 
+170,8 @@ def _filter_vars(main_module, exclude, include, base_rules): if logger.isEnabledFor(logging.INFO): excluded = {name: type(value).__name__ for name, value in sorted(main_module.__dict__.items()) if name not in namespace} - excluded = str(excluded).translate({ord(","): "\n ", ord("'"): None}) - logger.info("[dump_module] Variables excluded by filtering:\n %s", excluded) + message = "[dump_module] Variables excluded by filtering:\n" + logger.info(message + _format_log_dict(excluded)) newmod = ModuleType(main_module.__name__) newmod.__dict__.update(namespace) @@ -375,11 +388,9 @@ def dump_module( pickler._id_to_name = {id(v): k for k, v in main.__dict__.items()} pickler.dump(main) if pickler._saved_byref and logger.isEnabledFor(logging.INFO): - import textwrap - pickler._saved_byref.sort() - message = "[dump_module] Variables saved by reference (refonfail): " - message += str(pickler._saved_byref).replace("'", "")[1:-1] - logger.info("\n".join(textwrap.wrap(message, width=80))) + saved_byref = {var: "%s.%s" % (mod, obj) for var, mod, obj in pickler._saved_byref} + message = "[dump_module] Variables saved by reference (refonfail):\n" + logger.info(message + _format_log_dict(saved_byref)) return # Backward compatibility. From c802c5d72e4ec134971ccdfe611a4a3b64fe44ae Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 30 Aug 2022 23:18:14 -0300 Subject: [PATCH 105/109] minor --- dill/_dill.py | 44 +++++++++++++++++++++++--------------------- dill/session.py | 4 ++-- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 62c38659..787cb7ac 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -50,7 +50,7 @@ #XXX: get types from .objtypes ? 
import builtins as __builtin__ from pickle import _Pickler as StockPickler, Unpickler as StockUnpickler -from pickle import DICT, EMPTY_DICT, GLOBAL, MARK, POP, SETITEM +from pickle import DICT, GLOBAL, MARK, POP, SETITEM from _thread import LockType from _thread import RLock as RLockType #from io import IOBase @@ -171,10 +171,10 @@ def get_file_type(*args, **kwargs): from socket import socket as SocketType #FIXME: additionally calls ForkingPickler.register several times from multiprocessing.reduction import _reduce_socket as reduce_socket -try: +try: #pragma: no cover IS_IPYTHON = __IPYTHON__ # is True - ExitType = None # IPython.core.autocall.ExitAutocall #pragma: no cover - IPYTHON_SINGLETONS = ('exit', 'quit', 'get_ipython') #pragma: no cover + ExitType = None # IPython.core.autocall.ExitAutocall + IPYTHON_SINGLETONS = ('exit', 'quit', 'get_ipython') except NameError: IS_IPYTHON = False try: ExitType = type(exit) # apparently 'exit' can be removed @@ -330,16 +330,15 @@ class UnpicklingWarning(PickleWarning, UnpicklingError): pass def _getopt(settings, key, arg=None, *, kwds=None): - """Get option from 'kwds' or named 'arg', falling back to settings. + """Get option from named argument 'arg' or 'kwds', falling back to settings. Examples: - # With an explict named argument: + # With an explicitly named argument: protocol = int(_getopt(settings, 'protocol', protocol)) # With a named argument in **kwds: self._byref = _getopt(settings, 'byref', kwds=kwds) - """ # Sanity check, it's a bug in calling code if False. 
assert kwds is None or arg is None @@ -361,11 +360,12 @@ class Pickler(StockPickler): :meta hide-value: """ + from .settings import settings + # Flags set by dump_module() is dill.session: _refimported = False - _refonfail = False # True in session.settings + _refonfail = False _session = False _first_pass = False - from .settings import settings def __init__(self, file, *args, **kwds): settings = Pickler.settings @@ -442,7 +442,8 @@ def save_numpy_array(pickler, obj): ## Save with 'refonfail' ## - # Disable framing (right after the framer.init_framing() call at dump()). + # Disable framing. This must be set right after the + # framer.init_framing() call in StockPickler.dump(). self.framer.current_frame = None # Store initial state. position = self._file_tell() @@ -450,11 +451,11 @@ try: StockPickler.save(self, obj, save_persistent_id) except UNPICKLEABLE_ERRORS as error_stack: - message = ( + trace_message = ( "# X: fallback to save as global: <%s object at %#012x>" % (type(obj).__name__, id(obj)) ) - # Roll back the stream, stream.truncate(position) doesn't work for all types. + # Roll back the stream. Note: truncate(position) doesn't always work. self._file_seek(position) self._file_truncate() # Roll back memo. @@ -464,9 +465,9 @@ if self._session and obj is self._main: if self._main is _main_module or not _is_imported_module(self._main): raise - # Save an empty dict as state to distinguish this from modules saved with dump(). + # Save an empty dict as state to distinguish from modules saved with dump(). self.save_reduce(_import_module, (obj.__name__,), obj=obj, state={}) - logger.trace(self, message, obj=obj) + logger.trace(self, trace_message, obj=obj) warnings.warn( "module %r saved by reference due to the unpickleable " "variable %r. No changes to the module were saved. 
" @@ -479,7 +480,7 @@ def save_numpy_array(pickler, obj): elif hasattr(obj, '__name__') or hasattr(obj, '__qualname__'): try: self.save_global(obj) - logger.trace(self, message, obj=obj) + logger.trace(self, trace_message, obj=obj) return True # for _saved_byref, ignored otherwise except PicklingError as error: # Roll back trace state. @@ -1263,9 +1264,9 @@ def _module_map(main_module): from collections import defaultdict from types import SimpleNamespace modmap = SimpleNamespace( - by_name=defaultdict(list), - by_id=defaultdict(list), - top_level={}, # top-level modules + by_name = defaultdict(list), + by_id = defaultdict(list), + top_level = {}, # top-level modules module = main_module.__name__, package = _module_package(main_module), ) @@ -1289,7 +1290,7 @@ def _lookup_module(modmap, name, obj, lookup_by_id=True) -> typing.Tuple[str, st hierarchy: 1. Standard Library modules - 2. modules of the same package as the module being saved (if it's part of a module) + 2. modules of the same top-level package as the module being saved (if it's part of a package) 3. installed modules in general 4. non-installed modules @@ -1316,11 +1317,11 @@ def _lookup_module(modmap, name, obj, lookup_by_id=True) -> typing.Tuple[str, st other_module = sys.modules[modname] other_package = _module_package(other_module) # Don't return a reference to a module of another package - # if the object is likely from the same package. + # if the object is likely from the same top-level package. if (modmap.package and obj_package == modmap.package and other_package != modmap.package): continue - # Prefer modules imported earlier (first found). + # Prefer modules imported earlier (the first found). 
if _is_stdlib_module(other_module): return modname, other_name, True elif modmap.package and modmap.package == other_package: @@ -1872,6 +1873,7 @@ def _is_stdlib_module(module): @_weak_cache(defaults={None: None}) def _module_package(module): + """get the top-level package of a module, if any""" package = getattr(module, '__package__', None) return package.partition('.')[0] if package else None diff --git a/dill/session.py b/dill/session.py index 074c7418..465ac5fe 100644 --- a/dill/session.py +++ b/dill/session.py @@ -99,8 +99,8 @@ def _stash_modules(main_module): for name, obj in main_module.__dict__.items(): # Avoid incorrectly matching a singleton value in another package (e.g. __doc__ == None). if (any(obj is constant for constant in BUILTIN_CONSTANTS) # must compare by identity - or type(obj) is str and obj == '' # for cases like: __package__ == '' - or type(obj) is int and -128 <= obj <= 256 # small values or CPython-internalized + or type(obj) is str and obj == '' # internalized, for cases like: __package__ == '' + or type(obj) is int and -128 <= obj <= 256 # possibly cached by compiler/interpreter or isinstance(obj, ModuleType) and _is_builtin_module(obj) # always saved by ref or obj is main_module or obj is main_module.__dict__): original[name] = obj From 56beb89db0c68557d9cdd3fd2294033c869b892c Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Wed, 31 Aug 2022 20:42:31 -0300 Subject: [PATCH 106/109] test dump and load of StdLib modules; fix missing flag test --- dill/session.py | 2 +- dill/tests/test_session.py | 9 +- dill/tests/test_stdlib_modules.py | 132 ++++++++++++++++++++++++++++++ 3 files changed, 138 insertions(+), 5 deletions(-) create mode 100644 dill/tests/test_stdlib_modules.py diff --git a/dill/session.py b/dill/session.py index 465ac5fe..9b52393c 100644 --- a/dill/session.py +++ b/dill/session.py @@ -387,7 +387,7 @@ def dump_module( if logger.isEnabledFor(logging.TRACE): pickler._id_to_name = {id(v): k for k, v in main.__dict__.items()} 
pickler.dump(main) - if pickler._saved_byref and logger.isEnabledFor(logging.INFO): + if refonfail and pickler._saved_byref and logger.isEnabledFor(logging.INFO): saved_byref = {var: "%s.%s" % (mod, obj) for var, mod, obj in pickler._saved_byref} message = "[dump_module] Variables saved by reference (refonfail):\n" logger.info(message + _format_log_dict(saved_byref)) diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index ea3d3eeb..de3e97af 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -14,6 +14,7 @@ from types import ModuleType import dill +from dill import _dill from dill.session import ipython_filter, EXCLUDE, INCLUDE session_file = os.path.join(os.path.dirname(__file__), 'session-refimported-%s.pkl') @@ -268,7 +269,7 @@ def test_load_module_asdict(): assert entitydefs == entities_vars['entitydefs'] def test_lookup_module(): - assert not dill._dill._is_builtin_module(local_mod) and local_mod.__package__ == '' + assert not _dill._is_builtin_module(local_mod) and local_mod.__package__ == '' def lookup(mod, name, obj, lookup_by_name=True): from dill._dill import _lookup_module, _module_map @@ -352,12 +353,12 @@ def test_unpickleable_var(): dill.session.settings['refonfail'] = True name = '__unpickleable' obj = memoryview(b'') - assert dill._dill._is_builtin_module(builtin_mod) - assert not dill._dill._is_builtin_module(local_mod) + assert _dill._is_builtin_module(builtin_mod) + assert not _dill._is_builtin_module(local_mod) # assert not dill.pickles(obj) try: dill.dumps(obj) - except dill._dill.UNPICKLEABLE_ERRORS: + except _dill.UNPICKLEABLE_ERRORS: pass else: raise Exception("test object should be unpickleable") diff --git a/dill/tests/test_stdlib_modules.py b/dill/tests/test_stdlib_modules.py new file mode 100644 index 00000000..e49760a2 --- /dev/null +++ b/dill/tests/test_stdlib_modules.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python + +# Author: Leonardo Gama (@leogama) +# Copyright (c) 2022 The Uncertainty 
Quantification Foundation. +# License: 3-clause BSD. The full license text is available at: +# - https://github.com/uqfoundation/dill/blob/master/LICENSE + +import io +import itertools +import logging +import multiprocessing +import os +import sys +import warnings + +import dill + +if not dill._dill.OLD310: + STDLIB_MODULES = list(sys.stdlib_module_names) + STDLIB_MODULES += [ + # From https://docs.python.org/3.11/library/ + 'collections.abc', 'concurrent.futures', 'curses.ascii', 'curses.panel', 'curses.textpad', + 'html.entities', 'html.parser', 'http.client', 'http.cookiejar', 'http.cookies', 'http.server', + 'importlib.metadata', 'importlib.resources', 'importlib.resources.abc', 'logging.config', + 'logging.handlers', 'multiprocessing.shared_memory', 'os.path', 'test.support', + 'test.support.bytecode_helper', 'test.support.import_helper', 'test.support.os_helper', + 'test.support.script_helper', 'test.support.socket_helper', 'test.support.threading_helper', + 'test.support.warnings_helper', 'tkinter.colorchooser', 'tkinter.dnd', 'tkinter.font', + 'tkinter.messagebox', 'tkinter.scrolledtext', 'tkinter.tix', 'tkinter.ttk', 'unittest.mock', + 'urllib.error', 'urllib.parse', 'urllib.request', 'urllib.response', 'urllib.robotparser', + 'xml.dom', 'xml.dom.minidom', 'xml.dom.pulldom', 'xml.etree.ElementTree', 'xml.parsers.expat', + 'xml.sax', 'xml.sax.handler', 'xml.sax.saxutils', 'xml.sax.xmlreader', 'xmlrpc.client', + 'xmlrpc.server', + ] + STDLIB_MODULES.sort() +else: + STDLIB_MODULES = [ + # From https://docs.python.org/3.9/library/ + '__future__', '_thread', 'abc', 'aifc', 'argparse', 'array', 'ast', 'asynchat', 'asyncio', + 'asyncore', 'atexit', 'audioop', 'base64', 'bdb', 'binascii', 'binhex', 'bisect', 'builtins', + 'bz2', 'calendar', 'cgi', 'cgitb', 'chunk', 'cmath', 'cmd', 'code', 'codecs', 'codeop', + 'collections', 'collections.abc', 'colorsys', 'compileall', 'concurrent', 'concurrent.futures', + 'configparser', 'contextlib', 'contextvars', 'copy', 
'copyreg', 'crypt', 'csv', 'ctypes', + 'curses', 'curses.ascii', 'curses.panel', 'curses.textpad', 'dataclasses', 'datetime', 'dbm', + 'decimal', 'difflib', 'dis', 'distutils', 'doctest', 'email', 'ensurepip', 'enum', 'errno', + 'faulthandler', 'fcntl', 'filecmp', 'fileinput', 'fnmatch', 'formatter', 'fractions', 'ftplib', + 'functools', 'gc', 'getopt', 'getpass', 'gettext', 'glob', 'graphlib', 'grp', 'gzip', 'hashlib', + 'heapq', 'hmac', 'html', 'html.entities', 'html.parser', 'http', 'http.client', + 'http.cookiejar', 'http.cookies', 'http.server', 'imaplib', 'imghdr', 'imp', 'importlib', + 'importlib.metadata', 'inspect', 'io', 'ipaddress', 'itertools', 'json', 'keyword', 'linecache', + 'locale', 'logging', 'logging.config', 'logging.handlers', 'lzma', 'mailbox', 'mailcap', + 'marshal', 'math', 'mimetypes', 'mmap', 'modulefinder', 'msilib', 'msvcrt', 'multiprocessing', + 'multiprocessing.shared_memory', 'netrc', 'nis', 'nntplib', 'numbers', 'operator', 'optparse', + 'os', 'os.path', 'ossaudiodev', 'parser', 'pathlib', 'pdb', 'pickle', 'pickletools', 'pipes', + 'pkgutil', 'platform', 'plistlib', 'poplib', 'posix', 'pprint', 'pty', 'pwd', 'py_compile', + 'pyclbr', 'pydoc', 'queue', 'quopri', 'random', 're', 'readline', 'reprlib', 'resource', + 'rlcompleter', 'runpy', 'sched', 'secrets', 'select', 'selectors', 'shelve', 'shlex', 'shutil', + 'signal', 'site', 'site', 'smtpd', 'smtplib', 'sndhdr', 'socket', 'socketserver', 'spwd', + 'sqlite3', 'ssl', 'stat', 'statistics', 'string', 'stringprep', 'struct', 'subprocess', 'sunau', + 'symbol', 'symtable', 'sys', 'sysconfig', 'syslog', 'tabnanny', 'tarfile', 'telnetlib', + 'tempfile', 'termios', 'test', 'test.support', 'test.support.bytecode_helper', + 'test.support.script_helper', 'test.support.socket_helper', 'textwrap', 'threading', 'time', + 'timeit', 'tkinter', 'tkinter.colorchooser', 'tkinter.dnd', 'tkinter.font', + 'tkinter.messagebox', 'tkinter.scrolledtext', 'tkinter.tix', 'tkinter.ttk', 'token', 'tokenize', + 
'trace', 'traceback', 'tracemalloc', 'tty', 'turtle', 'types', 'typing', 'unicodedata', + 'unittest', 'unittest.mock', 'urllib', 'urllib.error', 'urllib.parse', 'urllib.request', + 'urllib.response', 'urllib.robotparser', 'uu', 'uuid', 'venv', 'warnings', 'wave', 'weakref', + 'webbrowser', 'winreg', 'winsound', 'wsgiref', 'xdrlib', 'xml.dom', 'xml.dom.minidom', + 'xml.dom.pulldom', 'xml.etree.ElementTree', 'xml.parsers.expat', 'xml.sax', 'xml.sax.handler', + 'xml.sax.saxutils', 'xml.sax.xmlreader', 'xmlrpc', 'xmlrpc.client', 'xmlrpc.server', 'zipapp', + 'zipfile', 'zipimport', 'zlib', 'zoneinfo', +] + +def _dump_load_module(module_name, refonfail): + try: + __import__(module_name) + except ImportError: + return None, None + success_load = None + buf = io.BytesIO() + try: + dill.dump_module(buf, module_name, refonfail=refonfail) + except Exception: + print("F", end="") + success_dump = False + return success_dump, success_load + print(":", end="") + success_dump = True + buf.seek(0) + try: + module = dill.load_module(buf) + except Exception: + success_load = False + return success_dump, success_load + success_load = True + return success_dump, success_load + +def test_stdlib_modules(): + modules = [x for x in STDLIB_MODULES if + not x.startswith('_') + and not x.startswith('test') + and x not in ('antigravity', 'this')] + + + print("\nTesting pickling and unpickling of Standard Library modules...") + message = "Success rate (%s_module, refonfail=%s): %.1f%% [%d/%d]" + with multiprocessing.Pool(maxtasksperchild=1) as pool: + for refonfail in (False, True): + args = zip(modules, itertools.repeat(refonfail)) + result = pool.starmap(_dump_load_module, args, chunksize=1) + dump_successes = sum(dumped for dumped, loaded in result if dumped is not None) + load_successes = sum(loaded for dumped, loaded in result if loaded is not None) + dump_failures = sum(not dumped for dumped, loaded in result if dumped is not None) + load_failures = sum(not loaded for dumped, loaded in 
result if loaded is not None) + dump_total = dump_successes + dump_failures + load_total = load_successes + load_failures + dump_percent = 100 * dump_successes / dump_total + load_percent = 100 * load_successes / load_total + print() + print(message % ("dump", refonfail, dump_percent, dump_successes, dump_total)) + print(message % ("load", refonfail, load_percent, load_successes, load_total)) + if refonfail: + failed_dump = [mod for mod, (dumped, _) in zip(modules, result) if dumped is False] + failed_load = [mod for mod, (_, loaded) in zip(modules, result) if loaded is False] + logging.info("dump_module() fails: %s", failed_dump) + logging.info("load_module() fails: %s", failed_load) + assert dump_percent > 95 + +if __name__ == '__main__': + logging.basicConfig(level=os.environ.get('PYTHONLOGLEVEL', 'WARNING')) + warnings.simplefilter('ignore') + test_stdlib_modules() From 71b4f7728e04bd767683553b202450dc51f5e679 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 1 Sep 2022 00:16:58 -0300 Subject: [PATCH 107/109] minor --- dill/tests/test_stdlib_modules.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/dill/tests/test_stdlib_modules.py b/dill/tests/test_stdlib_modules.py index e49760a2..15cb0767 100644 --- a/dill/tests/test_stdlib_modules.py +++ b/dill/tests/test_stdlib_modules.py @@ -116,15 +116,19 @@ def test_stdlib_modules(): load_total = load_successes + load_failures dump_percent = 100 * dump_successes / dump_total load_percent = 100 * load_successes / load_total - print() - print(message % ("dump", refonfail, dump_percent, dump_successes, dump_total)) - print(message % ("load", refonfail, load_percent, load_successes, load_total)) + if logging.getLogger().isEnabledFor(logging.INFO): print() + logging.info(message, "dump", refonfail, dump_percent, dump_successes, dump_total) + logging.info(message, "load", refonfail, load_percent, load_successes, load_total) if refonfail: failed_dump = [mod for mod, (dumped, _) in 
zip(modules, result) if dumped is False] failed_load = [mod for mod, (_, loaded) in zip(modules, result) if loaded is False] - logging.info("dump_module() fails: %s", failed_dump) - logging.info("load_module() fails: %s", failed_load) - assert dump_percent > 95 + if failed_dump: + logging.info("dump_module() FAILURES: %s", str(failed_dump).replace("'", "")[1:-1]) + if failed_load: + logging.info("load_module() FAILURES: %s", str(failed_load).replace("'", "")[1:-1]) + assert dump_percent > 99 + assert load_percent > 85 #FIXME: many important modules fail to unpickle + print() if __name__ == '__main__': logging.basicConfig(level=os.environ.get('PYTHONLOGLEVEL', 'WARNING')) From 4244d3b2e22a4aa3abff81f80cf45fa6e40934ef Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Wed, 14 Sep 2022 09:53:23 -0300 Subject: [PATCH 108/109] fix _stash_modules() arguments --- dill/_dill.py | 2 ++ dill/session.py | 11 ++++++++--- dill/tests/test_session.py | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 787cb7ac..3ad7ead0 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -39,6 +39,7 @@ from dill import logging from .logging import adapter as logger from .logging import trace as _trace +_logger = logging.getLogger(__name__) import os import sys @@ -1350,6 +1351,7 @@ def _save_module_dict(pickler, main_dict): is_builtin = _is_builtin_module(main) pickler.write(MARK + DICT) # don't need to memoize for name, value in main_dict.items(): + _logger.debug("Pickling %r (%s)", name, type(value).__name__) pickler.save(name) try: if pickler.save(value): diff --git a/dill/session.py b/dill/session.py index 9b52393c..9fc4ef56 100644 --- a/dill/session.py +++ b/dill/session.py @@ -87,9 +87,9 @@ BUILTIN_CONSTANTS = (None, False, True, NotImplemented) -def _stash_modules(main_module): +def _stash_modules(main_module, original_main): """pop imported variables to be saved by reference in the __dill_imported* attributes""" - modmap = 
_module_map(main_module) + modmap = _module_map(original_main) newmod = ModuleType(main_module.__name__) original = {} imported = [] @@ -136,6 +136,7 @@ def _stash_modules(main_module): refimported += [(name, mod) for mod, name in imported_top_level] message = "[dump_module] Variables saved by reference (refimported):\n" logger.info(message + _format_log_dict(dict(refimported))) + logger.debug("main namespace after _stash_modules(): %s", dir(newmod)) return newmod, modmap else: @@ -176,6 +177,7 @@ def _filter_vars(main_module, exclude, include, base_rules): newmod = ModuleType(main_module.__name__) newmod.__dict__.update(namespace) _discard_added_variables(newmod, namespace) + logger.debug("main namespace after _filter_vars(): %s", dir(newmod)) return newmod def _discard_added_variables(main, original_namespace): @@ -363,9 +365,12 @@ def dump_module( if not isinstance(main, ModuleType): raise TypeError("%r is not a module" % main) original_main = main + + logger.debug("original main namespace: %s", dir(main)) main = _filter_vars(main, exclude, include, base_rules) if refimported: - main, modmap = _stash_modules(original_main) + main, modmap = _stash_modules(main, original_main) + with _open(filename, 'wb', seekable=True) as file: pickler = Pickler(file, protocol, **kwds) pickler._main = main #FIXME: dill.settings are disabled diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index de3e97af..e5341b25 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -201,7 +201,7 @@ def test_runtime_module(): runtime_mod = ModuleType(modname) runtime_mod.x = 42 - mod, _ = dill.session._stash_modules(runtime_mod) + mod, _ = dill.session._stash_modules(runtime_mod, runtime_mod) if mod is not runtime_mod: print("There are objects to save by referenece that shouldn't be:", mod.__dill_imported, mod.__dill_imported_as, mod.__dill_imported_top_level, From 78f5e2d37d7c6d13caaf29c360d3c36bc70a42c5 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: 
Wed, 28 Sep 2022 11:34:01 -0300 Subject: [PATCH 109/109] tests: FilterSet methods; added extra filter type check --- dill/_utils.py | 9 +++--- dill/tests/test_filtering.py | 58 +++++++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 5 deletions(-) diff --git a/dill/_utils.py b/dill/_utils.py index 0d9d86e2..0aaf65a5 100644 --- a/dill/_utils.py +++ b/dill/_utils.py @@ -166,11 +166,12 @@ def __repr__(self): Filter = Union[str, Pattern[str], int, type, FilterFunction] Rule = Tuple[RuleType, Union[Filter, Iterable[Filter]]] -def _iter(filters): - if isinstance(filters, str): +def _iter(obj): + """return iterator of object if it's not a string""" + if isinstance(obj, (str, bytes)): return None try: - return iter(filters) + return iter(obj) except TypeError: return None @@ -199,7 +200,7 @@ def _match_type(self, filter: Filter) -> Tuple[filter, str]: else: filter = re.compile(filter) field = 'regexes' - elif filter_type is re.Pattern: + elif filter_type is re.Pattern and type(filter.pattern) is str: field = 'regexes' elif filter_type is int: field = 'ids' diff --git a/dill/tests/test_filtering.py b/dill/tests/test_filtering.py index 259b5fd6..3bcc0c9c 100644 --- a/dill/tests/test_filtering.py +++ b/dill/tests/test_filtering.py @@ -11,9 +11,64 @@ import dill from dill import _dill from dill.session import ( - EXCLUDE, INCLUDE, FilterRules, RuleType, ipython_filter, size_filter, settings + EXCLUDE, INCLUDE, FilterRules, FilterSet, RuleType, ipython_filter, size_filter, settings ) +def test_filterset(): + import re + + name = 'test' + regex1 = re.compile(r'\w+\d+') + regex2 = r'_\w+' + id_ = id(FilterSet) + type1 = FilterSet + type2 = 'type:List' + func = lambda obj: obj.name == 'Arthur' + + empty_filters = FilterSet() + assert bool(empty_filters) is False + assert len(empty_filters) == 0 + assert len([*empty_filters]) == 0 + + # also tests add() and __ior__() for non-FilterSet other + filters = FilterSet._from_iterable([name, regex1, regex2, id_, type1, 
type2, func]) + assert filters.names == {name} + assert filters.regexes == {regex1, re.compile(regex2)} + assert filters.ids == {id_} + assert filters.types == {type1, list} + assert filters.funcs == {func} + + assert bool(filters) is True + assert len(filters) == 7 + assert all(x in filters for x in [name, regex1, id_, type1, func]) + + try: + filters.add(re.compile(b'an 8-bit string regex')) + except ValueError: + pass + else: + raise AssertionError("adding invalid filter should raise error") + + filters_copy = filters.copy() + for field in FilterSet._fields: + original, copy = getattr(filters, field), getattr(filters_copy, field) + assert copy is not original + assert copy == original + + filters.remove(re.compile(regex2)) + assert filters.regexes == {regex1} + filters.discard(list) + filters.discard(list) # should not raise error + assert filters.types == {type1} + assert [*filters] == [name, regex1, id_, type1, func] + + # also tests __ior__() for FilterSet other + filters.update(filters_copy) + assert filters.types == {type1, list} + + filters.clear() + assert len(filters) == 0 + NS = { 'a': 1, 'aa': 2, @@ -184,6 +239,7 @@ def test_size_filter(): assert did_exclude(NS_copy, filter_size, excluded_subset={'large'}) if __name__ == '__main__': + test_filterset() test_basic_filtering() test_exclude_include() test_add_type()