From 8e7565472f00c60dea840dd53bf62eaa0c885d36 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Mon, 20 May 2019 19:14:30 +0200 Subject: [PATCH 01/70] MNT use the is_dynamic for both classes and functions --- cloudpickle/cloudpickle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index 563f1af55..dd4e15733 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -183,8 +183,8 @@ def _is_global(obj, name=None): # supported, as the standard pickle does not support it either. return False - # module has been added to sys.modules, but it can still be dynamic. if _is_dynamic(module): + # module has been added to sys.modules, but it can still be dynamic. return False try: From 51df1aa5968abcd873c161dc3db5cd832778029a Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 22 May 2019 15:36:31 +0200 Subject: [PATCH 02/70] FIX fix is_dynamic for some builtin packages in pypy Previously, this bug was not revealed because we tried Pickle.save_global(obj) (which would succeed in these cases) before calling is_dynamic(obj.__module__), but now the order is reversed --- cloudpickle/cloudpickle.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index dd4e15733..9747cc4ba 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -102,6 +102,11 @@ PY2 = False from importlib._bootstrap import _find_spec + if sys.implementation.name == 'pypy': + from importlib._bootstrap import _find_spec + else: + from _frozen_importlib import _find_spec + def _ensure_tracking(class_def): with _DYNAMIC_CLASS_TRACKER_LOCK: From b89b259c331be63150fbabca723895d048a8bc32 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 22 May 2019 15:51:41 +0200 Subject: [PATCH 03/70] FIX python2-3 compat --- cloudpickle/cloudpickle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index 9747cc4ba..cfa7e54cd 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -102,7 +102,7 @@ PY2 = False from importlib._bootstrap import _find_spec - if sys.implementation.name == 'pypy': + if platform.python_implementation() == 'PyPy': from importlib._bootstrap import _find_spec else: from _frozen_importlib import _find_spec From 161924b318df220db784fe68f1bfb67ca421475b Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 22 Feb 2019 10:01:26 +0100 Subject: [PATCH 04/70] ENH extend _pickle.Pickler --- cloudpickle/__init__.py | 2 +- cloudpickle/cloudpickle_fast.py | 752 ++++++++++++++++++++++++++++++++ tests/cloudpickle_test.py | 5 +- tests/testutils.py | 2 +- 4 files changed, 758 insertions(+), 3 deletions(-) create mode 100644 cloudpickle/cloudpickle_fast.py diff --git a/cloudpickle/__init__.py b/cloudpickle/__init__.py index 1af671683..21658795e 100644 --- a/cloudpickle/__init__.py +++ b/cloudpickle/__init__.py @@ -1,5 +1,5 @@ from __future__ import absolute_import -from cloudpickle.cloudpickle import * +from cloudpickle.cloudpickle_fast import * __version__ = '1.2.0.dev0' diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py new file mode 100644 index 000000000..c4ac50da8 --- /dev/null +++ b/cloudpickle/cloudpickle_fast.py @@ -0,0 +1,752 @@ +""" +New, fast version of the Cloudpickler. + +This new Cloudpickler class can now extend the fast C Pickler instead of the +previous pythonic Pickler. Because this functionality is only available for +python versions 3.8+, a lot of backward-compatibilty code is also removed. +""" +import abc +import dis +import io +import logging +import opcode +import _pickle +import pickle +import sys +import types +import weakref + +from _pickle import Pickler + +load, loads = _pickle.load, _pickle.loads + + +# XXX: Uncovered code in cloudpickle is currently removed, as they lack a +# specific use case justifying their presence. Functions/Methods removed: +# - _restore_attr +# - _get_module_builtins +# - print_exec +# - _modules_to_main +# - _gen_ellipsis +# - everyting after (if obj.__dict__) in save_global + +# cloudpickle is meant for inter process communication: we expect all +# communicating processes to run the same Python version hence we favor +# communication speed over compatibility: +DEFAULT_PROTOCOL = pickle.HIGHEST_PROTOCOL + + +# relevant opcodes, used to detect global variables manipulation +# XXX: I think STORE_GLOBAL can actually be removed. +STORE_GLOBAL = opcode.opmap["STORE_GLOBAL"] +DELETE_GLOBAL = opcode.opmap["DELETE_GLOBAL"] +LOAD_GLOBAL = opcode.opmap["LOAD_GLOBAL"] +GLOBAL_OPS = (STORE_GLOBAL, DELETE_GLOBAL, LOAD_GLOBAL) + + +# map a type to its name in the types module. +_BUILTIN_TYPE_NAMES = {} +for k, v in types.__dict__.items(): + if type(v) is type: + _BUILTIN_TYPE_NAMES[v] = k + + +# Shorthands similar to pickle.dump/pickle.dumps + + +def dump(obj, file, protocol=None): + """Serialize obj as bytes streamed into file + + protocol defaults to cloudpickle.DEFAULT_PROTOCOL which is an alias to + pickle.HIGHEST_PROTOCOL. This setting favors maximum communication speed + between processes running the same Python version. + + Set protocol=pickle.DEFAULT_PROTOCOL instead if you need to ensure + compatibility with older versions of Python. + """ + CloudPickler(file, protocol=protocol).dump(obj) + + +def dumps(obj, protocol=None): + """Serialize obj as a string of bytes allocated in memory + + protocol defaults to cloudpickle.DEFAULT_PROTOCOL which is an alias to + pickle.HIGHEST_PROTOCOL. This setting favors maximum communication speed + between processes running the same Python version. + + Set protocol=pickle.DEFAULT_PROTOCOL instead if you need to ensure + compatibility with older versions of Python. + """ + file = io.BytesIO() + try: + cp = CloudPickler(file, protocol=protocol) + cp.dump(obj) + return file.getvalue() + finally: + file.close() + + +# Utility functions introspecting objects to extract useful properties about +# them. + + +def islambda(func): + return getattr(func, "__name__") == "" + + +def _is_dynamic(module): + """ Check if the module is importable by name + + Notable exceptions include modules created dynamically using + types.ModuleType + """ + # Quick check: module that have __file__ attribute are not dynamic modules. + if hasattr(module, "__file__"): + return False + + # XXX: there used to be backwad compat code for python 2 here. + if hasattr(module, "__spec__"): + return module.__spec__ is None + + +def _find_loaded_submodules(globals, closure, co_names): + """ + Find submodules used by a function but not listed in its globals. + + In the example below: + + ``` + import xml.etree + import cloudpickle + + + def func(): + x = xml.etree.ElementTree + + + if __name__ == '__main__': + cloudpickle.dumps(func) + ``` + + the expression xml.etree.ElementTree generates a LOAD_GLOBAL for xml, but + simply LOAD_ATTR for etree and ElementTree - cloudpickle cannot detect + such submodules by bytecode inspection. There is actually no exact way of + detecting them, the method below is simply "good enough". For instance: + + import xml.etree + + def f(): + def g(): + return xml.etree + return g + + pickling f and trying to call f()() will raise a NameError + + """ + + referenced_submodules = {} + top_level_dependencies = list(globals.values()) + for cell in closure: + try: + top_level_dependencies.append(cell.cell_contents) + except ValueError: + continue + + # top_level_dependencies are variables that generated a LOAD_GlOBAL or a + # LOAD_DEREF opcode in code. + for x in top_level_dependencies: + if ( + isinstance(x, types.ModuleType) + and getattr(x, "__package__", None) is not None + ): + # check if the package has any currently loaded sub-imports + prefix = x.__name__ + "." + # A concurrent thread could mutate sys.modules, + # make sure we iterate over a copy to avoid exceptions + for name in list(sys.modules): + # Older versions of pytest will add a "None" module to + # sys.modules. + if name is not None and name.startswith(prefix): + # check whether the function can address the sub-module + tokens = set(name[len(prefix) :].split(".")) + if not tokens - set(co_names): + # ensure unpickler executes this import + referenced_submodules[name] = sys.modules[name] + return referenced_submodules + + +def extract_code_globals(code, globals): + """ + Find all globals names read or written to by codeblock co + """ + # XXX: there used to be a cache lookup based on the code object to get its + # corresponding global variable names. I removed it for the first version, + # I don't know if it is worth keeping it. + code_globals = {} + # PyPy "builtin-code" do not have this structure + if hasattr(code, "co_names"): + # first, find, potential submodules that are hard to identify + instructions = dis.get_instructions(code) + for ins in instructions: + varname = ins.argval + if ins.opcode in GLOBAL_OPS and varname in globals: + code_globals[varname] = globals[varname] + + # co.co_consts refers to any constant variable used by co. + # lines such as print("foo") or a = 1 will result in a new addition to + # the co_consts tuple ("foo" or 1). + # However, name resolution is done at run-time, so assignment of the + # form a = b will not yield a new item in co_consts (as the compiler + # has no idea what b is at declaration time). + + # Declaring a function inside another one using the "def ..." syntax + # generates a constant code object corresonding to the one of the + # nested function's. This code object is added into the co_consts + # attribute of the enclosing's function code. As the nested function + # may itself need global variables, we need to introspect its code, + # extract its globals, (look for code object in it's co_consts + # attribute..) and add the result to the global variables lists + if code.co_consts: + for c in code.co_consts: + if isinstance(c, types.CodeType): + code_globals.update(extract_code_globals(c, globals)) + + return code_globals + + +# COLLECTION OF OBJECTS __getnewargs__-like methods +# ------------------------------------------------- + + +def function_getnewargs(func, globals_ref): + code = func.__code__ + + # base_globals represents the future global namespace of func at + # unpickling time. Looking it up and storing it in globals_ref allow + # functions sharing the same globals at pickling time to also + # share them once unpickled, at one condition: since globals_ref is + # an attribute of a Cloudpickler instance, and that a new CloudPickler is + # created each time pickle.dump or pickle.dumps is called, functions + # also need to be saved within the same invokation of + # cloudpickle.dump/cloudpickle.dumps + # (for example: cloudpickle.dumps([f1, f2])). There + # is no such limitation when using Cloudpickler.dump, as long as the + # multiple invokations are bound to the same Cloudpickler. + base_globals = globals_ref.setdefault(id(func.__globals__), {}) + + # Do not bind the free variables before the function is created to avoid + # infinite recursion. + if func.__closure__ is None: + closure = None + else: + closure = tuple(types.CellType() for _ in range(len(code.co_freevars))) + + return code, base_globals, None, None, closure + + +# COLLECTION OF OBJECTS RECONSTRUCTORS +# ------------------------------------ + +# Builtin types are types defined in the python language source code, that are +# not defined in an importable python module (Lib/* for pure python module, +# Modules/* for C-implemented modules). The most wildely used ones (such as +# tuple, dict, list) are made accessible in any interpreter session by exposing +# them in the builtin namespace at startup time. + +# By construction, builtin types do not have a module. Trying to access their +# __module__ attribute will default to 'builtins', that only contains builtin +# types accessible at interpreter startup. Therefore, trying to pickle the +# other ones using classic module attribute lookup instructions will fail. + +# Fortunately, many non-accessible builtin-types are mirrored in the types +# module. For those types, we pickle the function builtin_type_reconstructor +# instead, that contains instruction to look them up via the types module. +def builtin_type_reconstructor(name): + """Return a builtin-type using attribute lookup from the types module""" + return getattr(types, name) + + +# XXX: what does "not working as desired" means? +# hack for __import__ not working as desired +def module_reconstructor(name): + __import__(name) + return sys.modules[name] + + +def dynamic_module_reconstructor(name, vars): + mod = types.ModuleType(name) + mod.__dict__.update(vars) + return mod + + +def file_reconstructor(retval): + return retval + + +# COLLECTION OF OBJECTS STATE GETTERS +# ----------------------------------- +def function_getstate(func): + # * Put func's dynamic attributes (stored in func.__dict__) in state. These + # attributes will be restored at unpickling time using + # f.__dict__.update(state) + # * Put func's members into slotstate. Such attributes will be restored at + # unpickling time by iterating over slotstate and calling setattr(func, + # slotname, slotvalue) + slotstate = { + "__name__": func.__name__, + "__qualname__": func.__qualname__, + "__annotations__": func.__annotations__, + "__kwdefaults__": func.__kwdefaults__, + "__defaults__": func.__defaults__, + "__module__": func.__module__, + "__doc__": func.__doc__, + "__closure__": func.__closure__, + } + + f_globals = extract_code_globals(func.__code__, func.__globals__) + + # extract submodules referenced by attribute lookup (no global opcode) + f_globals["__submodules__"] = _find_loaded_submodules( + f_globals, slotstate["__closure__"] or (), func.__code__.co_names + ) + slotstate["__globals__"] = f_globals + + state = func.__dict__ + return state, slotstate + + +# COLLECTIONS OF OBJECTS REDUCERS +# ------------------------------- +# A reducer is a function taking a single argument (obj), and that returns a +# tuple with all the necessary data to re-construct obj. Apart from a few +# exceptions (list, dicts, bytes, ints, etc.), a reducer is necessary to +# correclty pickle an object. +# While many built-in objects (Exceptions objects, instances of the "object" +# class, etc), are shipped with their own built-in reducer (invoked using +# obj.__reduce__), some do not. The following methods were created to "fill +# these holes". + +# XXX: no itemgetter/attrgetter reducer support implemented as the test seem to +# pass even without them + + +def builtin_type_reduce(obj): + return builtin_type_reconstructor, (_BUILTIN_TYPE_NAMES[obj],) + + +def code_reduce(obj): + """codeobject reducer""" + args = ( + obj.co_argcount, + obj.co_kwonlyargcount, + obj.co_nlocals, + obj.co_stacksize, + obj.co_flags, + obj.co_code, + obj.co_consts, + obj.co_names, + obj.co_varnames, + obj.co_filename, + obj.co_name, + obj.co_firstlineno, + obj.co_lnotab, + obj.co_freevars, + obj.co_cellvars, + ) + return types.CodeType, args + + +def cell_reduce(obj): + """Cell (containing values of a function's free variables) reducer""" + try: + obj.cell_contents + except ValueError: # cell is empty + return types.CellType, () + else: + return types.CellType, (obj.cell_contents,) + + +def classmethod_reduce(obj): + orig_func = obj.__func__ + return type(obj), (orig_func,) + + +def file_reduce(obj): + """Save a file""" + try: + import StringIO as pystringIO # we can't use cStringIO as it lacks the name attribute + except ImportError: + import io as pystringIO + + if not hasattr(obj, "name") or not hasattr(obj, "mode"): + raise pickle.PicklingError( + "Cannot pickle files that do not map to an actual file" + ) + if obj is sys.stdout: + return getattr, (sys, "stdout") + if obj is sys.stderr: + return getattr, (sys, "stderr") + if obj is sys.stdin: + raise pickle.PicklingError("Cannot pickle standard input") + if obj.closed: + raise pickle.PicklingError("Cannot pickle closed files") + if hasattr(obj, "isatty") and obj.isatty(): + raise pickle.PicklingError( + "Cannot pickle files that map to tty objects" + ) + if "r" not in obj.mode and "+" not in obj.mode: + raise pickle.PicklingError( + "Cannot pickle files that are not opened for reading: %s" % obj.mode + ) + + name = obj.name + + retval = pystringIO.StringIO() + + try: + # Read the whole file + curloc = obj.tell() + obj.seek(0) + contents = obj.read() + obj.seek(curloc) + except IOError: + raise pickle.PicklingError( + "Cannot pickle file %s as it cannot be read" % name + ) + retval.write(contents) + retval.seek(curloc) + + retval.name = name + return file_reconstructor, (retval,) + + +def mappingproxy_reduce(obj): + return types.MappingProxyType, (dict(obj),) + + +def memoryview_reduce(obj): + return bytes, (obj.tobytes(),) + + +def module_reduce(obj): + """Module reducer""" + if _is_dynamic(obj): + return dynamic_module_reconstructor, (obj.__name__, vars(obj)) + else: + return module_reconstructor, (obj.__name__,) + + +def method_reduce(obj): + return (types.MethodType, (obj.__func__, obj.__self__)) + + +def logger_reduce(obj): + return logging.getLogger, (obj.name,) + + +def root_logger_reduce(obj): + return logging.getLogger, () + + +def weakset_reduce(obj): + return weakref.WeakSet, (list(obj),) + + +def dynamic_function_reduce(func, globals_ref): + """Reduce a function that is not pickleable via attribute loookup. + """ + # XXX: should globals_ref be a global variable instead? The reason is + # purely cosmetic. There is no risk of references leaking, we would have to + # limit the growing of globals_ref, by making it a lru cache for example. + newargs = function_getnewargs(func, globals_ref) + state = function_getstate(func) + return types.FunctionType, newargs, state, None, None, function_setstate + + +def function_reduce(obj, globals_ref): + """Select the reducer depending on obj's dynamic nature + + This functions starts by replicating save_global: trying to retrieve obj + from an attribute lookup of a file-backed module. If this check fails, then + a custom reducer is called. + """ + # XXX: should we implement the cache of builtin-type constructors? + + name = obj.__name__ + try: + modname = pickle.whichmodule(obj, name) + except Exception: + modname = None + + try: + themodule = sys.modules[modname] + except KeyError: + # eval'd items such as namedtuple give invalid items for their function + # __module__ + modname = "__main__" + + if modname == "__main__": + # we do not want the incoming module attribute lookup to succeed for + # the __main__ module. + themodule = None + + try: + lookedup_by_name = getattr(themodule, name, None) + except Exception: + lookedup_by_name = None + + if lookedup_by_name is obj: # in this case, module is None + # if obj exists in a static module, let the builtin pickle saving + # routines save obj + return NotImplementedError + + # XXX: the special handling of builtin_function_or_method is removed as + # currently this hook is not called for such instances, as opposed to + # cloudpickle. + + # if func is lambda, def'ed at prompt, is in main, or is nested, then + # we'll pickle the actual function object rather than simply saving a + # reference (as is done in default pickler), via save_function_tuple. + if ( + islambda(obj) + or getattr(obj.__code__, "co_filename", None) == "" + or themodule is None + ): + return dynamic_function_reduce(obj, globals_ref=globals_ref) + + # TODO:this is cleanable: the if/else conditions + the NotImplementedError + # cover all cases. + else: + # func is nested + if lookedup_by_name is None or lookedup_by_name is not obj: + return dynamic_function_reduce(obj, globals_ref=globals_ref) + + +def dynamic_class_reduce(obj): + """ + Save a class that can't be stored as module global. + + This method is used to serialize classes that are defined inside + functions, or that otherwise can't be serialized as attribute lookups + from global modules. + """ + # XXX: This code is nearly untouch with regards to the legacy cloudpickle. + # It is pretty and hard to understand. Maybe refactor it by dumping + # potential python2 specific code and making a trading off optimizations in + # favor of readbility. + clsdict = dict(obj.__dict__) # copy dict proxy to a dict + clsdict.pop("__weakref__", None) + + # XXX: I am trying to add the abc-registered subclasses into the class + # reconstructor, because using save_reduce semantics prevents us to perform + # any other operation than state updating after obj is created. + + # I may encounter reference cycles, although there seems to be checks + # preventing this to happen. + if "_abc_impl" in clsdict: + (registry, _, _, _) = abc._get_dump(obj) + clsdict["_abc_impl"] = [ + subclass_weakref() for subclass_weakref in registry + ] + + # On PyPy, __doc__ is a readonly attribute, so we need to include it in + # the initial skeleton class. This is safe because we know that the + # doc can't participate in a cycle with the original class. + type_kwargs = {"__doc__": clsdict.pop("__doc__", None)} + + if hasattr(obj, "__slots__"): + type_kwargs["__slots__"] = obj.__slots__ + # Pickle string length optimization: member descriptors of obj are + # created automatically from obj's __slots__ attribute, no need to + # save them in obj's state + if isinstance(obj.__slots__, str): + clsdict.pop(obj.__slots__) + else: + for k in obj.__slots__: + clsdict.pop(k, None) + + # If type overrides __dict__ as a property, include it in the type kwargs. + # In Python 2, we can't set this attribute after construction. + # XXX: removed special handling of __dict__ for python2 + __dict__ = clsdict.pop("__dict__", None) + if isinstance(__dict__, property): + type_kwargs["__dict__"] = __dict__ + __dict__ = None + + return ( + type(obj), + (obj.__name__, obj.__bases__, type_kwargs), + (clsdict, {}), + None, + None, + class_setstate, + ) + + +def class_reduce(obj): + """Select the reducer depending on the dynamic nature of the class obj""" + # XXX: there used to be special handling for NoneType, EllipsisType and + # NotImplementedType. As for now this module handles only python3.8+, this + # code has been removed. + if obj.__module__ == "__main__": + return dynamic_class_reduce(obj) + + try: + # All classes are caught in this function: pickleable classes are + # filtered out by creating a Pickler with no custom class reducer + # (thus, falling back to save_global). If it fails to save obj, then + # obj is either a non-pickleable builtin or dynamic. + pickle.dumps(obj) + except Exception: + # XXX: previously, we also looked for the __builtin__ module, but this + # is python 2 specific. + if obj.__module__ == "builtins": + if obj in _BUILTIN_TYPE_NAMES: + return builtin_type_reduce(obj) + + typ = type(obj) + if typ is not obj and isinstance(obj, type): # noqa: E721 + return dynamic_class_reduce(obj) + + else: + # if pickle.dumps worked out fine, then simply pickle by attribute + return NotImplementedError + + +# COLLECTIONS OF OBJECTS STATE SETTERS +# ------------------------------------ +# state setters are called at unpickling time, once the object is created and +# it has to be updated to how it was at unpickling time. + + +def function_setstate(obj, state, slotstate): + """Update the state of a dynaamic function. + + As __closure__ and __globals__ are readonly attributes of a function, we + cannot rely on the native setstate routine of pickle.load_build, that calls + setattr on items of the slotstate. Instead, we have to modify them inplace. + """ + obj.__dict__.update(state) + + obj_globals = slotstate.pop("__globals__") + obj_closure = slotstate.pop("__closure__") + + # remove uncessary references to submodules + obj_globals.pop("__submodules__") + obj.__globals__.update(obj_globals) + obj.__globals__["__builtins__"] = __builtins__ + + if obj_closure is not None: + for i, cell in enumerate(obj_closure): + try: + value = cell.cell_contents + except ValueError: # cell is empty + continue + obj.__closure__[i].cell_contents = value + + for k, v in slotstate.items(): + setattr(obj, k, v) + + +def class_setstate(obj, state, slotstate): + registry = None + for attrname, attr in state.items(): + if attrname == "_abc_impl": + registry = attr + else: + setattr(obj, attrname, attr) + if registry is not None: + for subclass in registry: + obj.register(subclass) + + return obj + + +# Arbitration between builtin-save method and user-defined callbacks +# ------------------------------------------------------------------ +# This set of functions aim at deciding whether an object can be properly +# pickler by the c Pickler, or if it needs to be serialized using cloudpickle's +# reducers. +def hook(pickler, obj): + """Custom reducing instructions for un-picklable functions and classes + """ + # Classes deriving from custom, dynamic metaclasses won't get caught inside + # the hook_dispatch dict. In the legacy cloudpickle, this was not really a + # problem because not being present in the dispatch table meant falling + # back to save_global, which was already overriden by cloudpickle. Using + # the c pickler, save_global cannot be overriden, so we have manually check + # is obj's comes from a custom metaclass, and in this case, direct the + # object to save_global. + t = type(obj) + + try: + is_metaclass = issubclass(t, type) + except TypeError: # t is not a class (old Boost; see SF #502085) + is_metaclass = False + + if is_metaclass: + return class_reduce(obj) + elif isinstance(obj, types.FunctionType): + return function_reduce(obj, pickler.globals_ref) + else: + return NotImplementedError + + +class CloudPickler(Pickler): + """Fast C Pickler extension with additional reducing routines + + + Cloudpickler's extensions exist into into: + + * it's dispatch_table containing methods that are called only if ALL + built-in saving functions were previously discarded. + * it's callback_dispatch, containing methods that are called only if ALL + built-in saving functions except save_global were previously discarded. + + Both tables contains reducers, that take a single argument (obj), and + preturn a tuple with all the necessary data to re-construct obj. + + """ + + dispatch = {} + dispatch[types.CellType] = cell_reduce + dispatch[types.ModuleType] = module_reduce + dispatch[types.MethodType] = method_reduce + dispatch[logging.Logger] = logger_reduce + dispatch[logging.RootLogger] = root_logger_reduce + dispatch[types.CodeType] = code_reduce + dispatch[classmethod] = classmethod_reduce + dispatch[staticmethod] = classmethod_reduce + dispatch[weakref.WeakSet] = weakset_reduce + dispatch[types.ModuleType] = module_reduce + dispatch[types.MappingProxyType] = mappingproxy_reduce + dispatch[memoryview] = memoryview_reduce + # dispatch[io.TextIOWrapper] = file_reduce + + # dispatch[operator.attrgetter] = attrgetter_reduce + # dispatch[operator.itemgetter] = itemgetter_reduce + + def __init__(self, file, protocol=None): + if protocol is None: + protocol = DEFAULT_PROTOCOL + Pickler.__init__(self, file, protocol=protocol) + # map functions __globals__ attribute ids, to ensure that functions + # sharing the same global namespace at pickling time also share their + # global namespace at unpickling time. + self.globals_ref = {} + self.dispatch_table = self.dispatch + self.global_hook = hook + self.proto = int(protocol) + + def dump(self, obj): + try: + return Pickler.dump(self, obj) + except RuntimeError as e: + if "recursion" in e.args[0]: + msg = ( + "Could not pickle object as excessively deep recursion " + "required." + ) + raise pickle.PicklingError(msg) + else: + raise diff --git a/tests/cloudpickle_test.py b/tests/cloudpickle_test.py index e260a35ec..d9955f6e4 100644 --- a/tests/cloudpickle_test.py +++ b/tests/cloudpickle_test.py @@ -1076,7 +1076,7 @@ def f(): # some setup is required to allow pytest apimodules to be correctly # serializable. from cloudpickle import CloudPickler - CloudPickler.dispatch[type(py.builtin)] = CloudPickler.save_module + CloudPickler.dispatch[type(py.builtin)] = cloudpickle.module_reduce g = cloudpickle.loads(cloudpickle.dumps(f, protocol=self.protocol)) result = g() @@ -1100,6 +1100,9 @@ def func(x): cloned = pickle_depickle(func, protocol=self.protocol) self.assertEqual(cloned.__qualname__, func.__qualname__) + # @pytest.mark.skipif(sys.version_info >= (3, 8), + # reason="pickling namedtuple is broken on 3.8") + def test_namedtuple(self): MyTuple = collections.namedtuple('MyTuple', ['a', 'b', 'c']) t1 = MyTuple(1, 2, 3) diff --git a/tests/testutils.py b/tests/testutils.py index 3ad1eb98e..fba7b6be7 100644 --- a/tests/testutils.py +++ b/tests/testutils.py @@ -72,7 +72,7 @@ def subprocess_pickle_echo(input_data, protocol=None, timeout=TIMEOUT): """ # run then pickle_echo(protocol=protocol) in __main__: - cmd = [sys.executable, __file__, "--protocol", str(protocol)] + cmd = [sys.executable, '-W ignore', __file__, "--protocol", str(protocol)] cwd, env = _make_cwd_env() proc = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=cwd, env=env, bufsize=4096) From 7da9aafc610a63cc76c036b0fb0e17a9f740f92e Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 6 Mar 2019 15:39:14 +0100 Subject: [PATCH 05/70] MNT load cloudpickle_fast for recent (>3.8) python --- cloudpickle/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cloudpickle/__init__.py b/cloudpickle/__init__.py index 21658795e..9d458368f 100644 --- a/cloudpickle/__init__.py +++ b/cloudpickle/__init__.py @@ -1,5 +1,11 @@ from __future__ import absolute_import -from cloudpickle.cloudpickle_fast import * +import sys + + +if sys.version_info[:2] >= (3, 8): + from cloudpickle.cloudpickle_fast import * +else: + from cloudpickle.cloudpickle import * __version__ = '1.2.0.dev0' From d44fd1e7884d889be3a11cd8c3958867ab36f669 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 6 Mar 2019 15:49:22 +0100 Subject: [PATCH 06/70] MNT remove test_namedtuple skip after cpython changes --- tests/cloudpickle_test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/cloudpickle_test.py b/tests/cloudpickle_test.py index d9955f6e4..6950783f0 100644 --- a/tests/cloudpickle_test.py +++ b/tests/cloudpickle_test.py @@ -1100,9 +1100,6 @@ def func(x): cloned = pickle_depickle(func, protocol=self.protocol) self.assertEqual(cloned.__qualname__, func.__qualname__) - # @pytest.mark.skipif(sys.version_info >= (3, 8), - # reason="pickling namedtuple is broken on 3.8") - def test_namedtuple(self): MyTuple = collections.namedtuple('MyTuple', ['a', 'b', 'c']) t1 = MyTuple(1, 2, 3) From daee8596634af5741ecdf16d199bd1bdc85a981a Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 6 Mar 2019 16:26:13 +0100 Subject: [PATCH 07/70] MNT add cloudpickle's code_globals_cache --- cloudpickle/cloudpickle_fast.py | 62 ++++++++++++++++----------------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index c4ac50da8..5f85fb255 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -175,41 +175,39 @@ def g(): return referenced_submodules -def extract_code_globals(code, globals): +_extract_code_globals_cache = ( + weakref.WeakKeyDictionary() + if not hasattr(sys, "pypy_version_info") + else {} +) + + +def extract_code_globals(code, globals_): """ Find all globals names read or written to by codeblock co """ - # XXX: there used to be a cache lookup based on the code object to get its - # corresponding global variable names. I removed it for the first version, - # I don't know if it is worth keeping it. - code_globals = {} - # PyPy "builtin-code" do not have this structure - if hasattr(code, "co_names"): - # first, find, potential submodules that are hard to identify - instructions = dis.get_instructions(code) - for ins in instructions: - varname = ins.argval - if ins.opcode in GLOBAL_OPS and varname in globals: - code_globals[varname] = globals[varname] - - # co.co_consts refers to any constant variable used by co. - # lines such as print("foo") or a = 1 will result in a new addition to - # the co_consts tuple ("foo" or 1). - # However, name resolution is done at run-time, so assignment of the - # form a = b will not yield a new item in co_consts (as the compiler - # has no idea what b is at declaration time). - - # Declaring a function inside another one using the "def ..." syntax - # generates a constant code object corresonding to the one of the - # nested function's. This code object is added into the co_consts - # attribute of the enclosing's function code. As the nested function - # may itself need global variables, we need to introspect its code, - # extract its globals, (look for code object in it's co_consts - # attribute..) and add the result to the global variables lists - if code.co_consts: - for c in code.co_consts: - if isinstance(c, types.CodeType): - code_globals.update(extract_code_globals(c, globals)) + code_globals = _extract_code_globals_cache.get(code) + if code_globals is None: + code_globals = {} + # PyPy "builtin-code" do not have this structure + if hasattr(code, "co_names"): + # first, find, potential submodules that are hard to identify + instructions = dis.get_instructions(code) + for ins in instructions: + varname = ins.argval + if ins.opcode in GLOBAL_OPS and varname in globals_: + code_globals[varname] = globals_[varname] + + # Declaring a function inside another one using the "def ..." + # syntax generates a constant code object corresonding to the one + # of the nested function's As the nested function may itself need + # global variables, we need to introspect its code, extract its + # globals, (look for code object in it's co_consts attribute..) and + # add the result to code_globals + if code.co_consts: + for c in code.co_consts: + if isinstance(c, types.CodeType): + code_globals.update(extract_code_globals(c, globals_)) return code_globals From c44f75be9703c4c62585e5b9fcb1810788895eb9 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 6 Mar 2019 16:27:24 +0100 Subject: [PATCH 08/70] CLN comment cosmetics --- cloudpickle/cloudpickle_fast.py | 40 +++++++++++++-------------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 5f85fb255..7265884be 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -24,9 +24,6 @@ # XXX: Uncovered code in cloudpickle is currently removed, as they lack a # specific use case justifying their presence. Functions/Methods removed: # - _restore_attr -# - _get_module_builtins -# - print_exec -# - _modules_to_main # - _gen_ellipsis # - everyting after (if obj.__dict__) in save_global @@ -212,7 +209,7 @@ def extract_code_globals(code, globals_): return code_globals -# COLLECTION OF OBJECTS __getnewargs__-like methods +# COLLECTION OF OBJECTS __getnewargs__-LIKE METHODS # ------------------------------------------------- @@ -284,10 +281,10 @@ def file_reconstructor(retval): # COLLECTION OF OBJECTS STATE GETTERS # ----------------------------------- def function_getstate(func): - # * Put func's dynamic attributes (stored in func.__dict__) in state. These + # - Put func's dynamic attributes (stored in func.__dict__) in state. These # attributes will be restored at unpickling time using # f.__dict__.update(state) - # * Put func's members into slotstate. Such attributes will be restored at + # - Put func's members into slotstate. Such attributes will be restored at # unpickling time by iterating over slotstate and calling setattr(func, # slotname, slotvalue) slotstate = { @@ -324,8 +321,8 @@ def function_getstate(func): # obj.__reduce__), some do not. The following methods were created to "fill # these holes". -# XXX: no itemgetter/attrgetter reducer support implemented as the test seem to -# pass even without them +# XXX: no itemgetter/attrgetter reducer support implemented as the tests seem +# to pass even without them def builtin_type_reduce(obj): @@ -427,7 +424,6 @@ def memoryview_reduce(obj): def module_reduce(obj): - """Module reducer""" if _is_dynamic(obj): return dynamic_module_reconstructor, (obj.__name__, vars(obj)) else: @@ -454,8 +450,7 @@ def dynamic_function_reduce(func, globals_ref): """Reduce a function that is not pickleable via attribute loookup. """ # XXX: should globals_ref be a global variable instead? The reason is - # purely cosmetic. There is no risk of references leaking, we would have to - # limit the growing of globals_ref, by making it a lru cache for example. + # purely cosmetic. newargs = function_getnewargs(func, globals_ref) state = function_getstate(func) return types.FunctionType, newargs, state, None, None, function_setstate @@ -494,8 +489,8 @@ def function_reduce(obj, globals_ref): lookedup_by_name = None if lookedup_by_name is obj: # in this case, module is None - # if obj exists in a static module, let the builtin pickle saving - # routines save obj + # if obj exists in a filesytem-backed module, let the builtin pickle + # saving routines save obj return NotImplementedError # XXX: the special handling of builtin_function_or_method is removed as @@ -512,8 +507,8 @@ def function_reduce(obj, globals_ref): ): return dynamic_function_reduce(obj, globals_ref=globals_ref) - # TODO:this is cleanable: the if/else conditions + the NotImplementedError - # cover all cases. + # this whole code section may be cleanable: the if/else conditions + the + # NotImplementedError look like they cover nearly all cases. else: # func is nested if lookedup_by_name is None or lookedup_by_name is not obj: @@ -693,17 +688,12 @@ def hook(pickler, obj): class CloudPickler(Pickler): """Fast C Pickler extension with additional reducing routines + Cloudpickler's extensions exist into into: - Cloudpickler's extensions exist into into: - - * it's dispatch_table containing methods that are called only if ALL - built-in saving functions were previously discarded. - * it's callback_dispatch, containing methods that are called only if ALL - built-in saving functions except save_global were previously discarded. - - Both tables contains reducers, that take a single argument (obj), and - preturn a tuple with all the necessary data to re-construct obj. - + * it's dispatch_table containing reducers that are called only if ALL + built-in saving functions were previously discarded. + * a special callback, invoked before standard function/class + builtin-saving method (save_global), to serialize dynamic functions """ dispatch = {} From 7aae29b18a41528f476d080f433290f292772353 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 6 Mar 2019 16:28:51 +0100 Subject: [PATCH 09/70] MNT remove python2-compat lines from file_reduce --- cloudpickle/cloudpickle_fast.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 7265884be..9efd0ae1b 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -368,10 +368,7 @@ def classmethod_reduce(obj): def file_reduce(obj): """Save a file""" - try: - import StringIO as pystringIO # we can't use cStringIO as it lacks the name attribute - except ImportError: - import io as pystringIO + import io if not hasattr(obj, "name") or not hasattr(obj, "mode"): raise pickle.PicklingError( @@ -391,12 +388,13 @@ def file_reduce(obj): ) if "r" not in obj.mode and "+" not in obj.mode: raise pickle.PicklingError( - "Cannot pickle files that are not opened for reading: %s" % obj.mode + "Cannot pickle files that are not opened for reading: %s" + % obj.mode ) name = obj.name - retval = pystringIO.StringIO() + retval = io.StringIO() try: # Read the whole file From ec461ab487a616547286ddce48a67f762c395419 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 6 Mar 2019 16:39:17 +0100 Subject: [PATCH 10/70] CLN various style/cosmetics --- cloudpickle/cloudpickle_fast.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 9efd0ae1b..963397de9 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -658,8 +658,13 @@ def class_setstate(obj, state, slotstate): # This set of functions aim at deciding whether an object can be properly # pickler by the c Pickler, or if it needs to be serialized using cloudpickle's # reducers. -def hook(pickler, obj): - """Custom reducing instructions for un-picklable functions and classes +def reduce_global(pickler, obj): + """Custom reducing callback for functions and classes + + This function is the analog of a custom save_global. However, the C Pickler + API does not expose low-level instructions such as save or write. Instead, + we return a reduce value the the Pickler will internally serialize via + save_reduce. """ # Classes deriving from custom, dynamic metaclasses won't get caught inside # the hook_dispatch dict. In the legacy cloudpickle, this was not really a @@ -680,6 +685,7 @@ def hook(pickler, obj): elif isinstance(obj, types.FunctionType): return function_reduce(obj, pickler.globals_ref) else: + # fallback to save_global return NotImplementedError @@ -695,22 +701,18 @@ class CloudPickler(Pickler): """ dispatch = {} - dispatch[types.CellType] = cell_reduce - dispatch[types.ModuleType] = module_reduce - dispatch[types.MethodType] = method_reduce + dispatch[classmethod] = classmethod_reduce + dispatch[io.TextIOWrapper] = file_reduce dispatch[logging.Logger] = logger_reduce dispatch[logging.RootLogger] = root_logger_reduce - dispatch[types.CodeType] = code_reduce - dispatch[classmethod] = classmethod_reduce + dispatch[memoryview] = memoryview_reduce dispatch[staticmethod] = classmethod_reduce - dispatch[weakref.WeakSet] = weakset_reduce + dispatch[types.CellType] = cell_reduce + dispatch[types.CodeType] = code_reduce dispatch[types.ModuleType] = module_reduce + dispatch[types.MethodType] = method_reduce dispatch[types.MappingProxyType] = mappingproxy_reduce - dispatch[memoryview] = memoryview_reduce - # dispatch[io.TextIOWrapper] = file_reduce - - # dispatch[operator.attrgetter] = attrgetter_reduce - # dispatch[operator.itemgetter] = itemgetter_reduce + dispatch[weakref.WeakSet] = weakset_reduce def __init__(self, file, protocol=None): if protocol is None: @@ -721,7 +723,7 @@ def __init__(self, file, protocol=None): # global namespace at unpickling time. self.globals_ref = {} self.dispatch_table = self.dispatch - self.global_hook = hook + self.global_hook = reduce_global self.proto = int(protocol) def dump(self, obj): From e75f4669087697dab6499aa3aee26ea52c808ad6 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 6 Mar 2019 16:39:52 +0100 Subject: [PATCH 11/70] MNT re-use builtin-type constructor cache --- cloudpickle/cloudpickle_fast.py | 39 ++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 963397de9..3c8507a8c 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -48,6 +48,33 @@ _BUILTIN_TYPE_NAMES[v] = k +def _make__new__factory(type_): + def _factory(): + return type_.__new__ + + return _factory + + +# NOTE: These need to be module globals so that they're pickleable as globals. +_get_dict_new = _make__new__factory(dict) +_get_frozenset_new = _make__new__factory(frozenset) +_get_list_new = _make__new__factory(list) +_get_set_new = _make__new__factory(set) +_get_tuple_new = _make__new__factory(tuple) +_get_object_new = _make__new__factory(object) + +# Pre-defined set of builtin_function_or_method instances that can be +# serialized. +_BUILTIN_TYPE_CONSTRUCTORS = { + dict.__new__: _get_dict_new, + frozenset.__new__: _get_frozenset_new, + set.__new__: _get_set_new, + list.__new__: _get_list_new, + tuple.__new__: _get_tuple_new, + object.__new__: _get_object_new, +} + + # Shorthands similar to pickle.dump/pickle.dumps @@ -461,7 +488,17 @@ def function_reduce(obj, globals_ref): from an attribute lookup of a file-backed module. If this check fails, then a custom reducer is called. """ - # XXX: should we implement the cache of builtin-type constructors? + if obj in _BUILTIN_TYPE_CONSTRUCTORS: + # We keep a special-cased cache of built-in type constructors at + # global scope, because these functions are structured very + # differently in different python versions and implementations (for + # example, they're instances of types.BuiltinFunctionType in + # CPython, but they're ordinary types.FunctionType instances in + # PyPy). + # + # If the function we've received is in that cache, we just + # serialize it as a lookup into the cache. + return _BUILTIN_TYPE_CONSTRUCTORS[obj], () name = obj.__name__ try: From 7e311ae7a002ebde4e5219b7084f37d87956b763 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 6 Mar 2019 16:52:45 +0100 Subject: [PATCH 12/70] DOC explain why warnings are filtered in tests --- tests/testutils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/testutils.py b/tests/testutils.py index fba7b6be7..264d739ce 100644 --- a/tests/testutils.py +++ b/tests/testutils.py @@ -72,6 +72,10 @@ def subprocess_pickle_echo(input_data, protocol=None, timeout=TIMEOUT): """ # run then pickle_echo(protocol=protocol) in __main__: + + # Protect stderr from any warning, as we will assume an error will happen + # if it is not empty. A concrete example is pytest using the imp module, + # which is deprecated in python 3.8 cmd = [sys.executable, '-W ignore', __file__, "--protocol", str(protocol)] cwd, env = _make_cwd_env() proc = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=cwd, env=env, From 009e4a227833e97fcac046f5c50048dcbb505cd9 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 6 Mar 2019 18:52:35 +0100 Subject: [PATCH 13/70] TST silent deprecation warning in some tests --- tests/cloudpickle_test.py | 2 +- tests/testutils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/cloudpickle_test.py b/tests/cloudpickle_test.py index 6950783f0..c81c6ae8f 100644 --- a/tests/cloudpickle_test.py +++ b/tests/cloudpickle_test.py @@ -961,7 +961,7 @@ def check_logger(self, name): logger = cloudpickle.loads(base64.b32decode(b'{}')) logger.info('hello') """.format(base64.b32encode(dumped).decode('ascii')) - proc = subprocess.Popen([sys.executable, "-c", code], + proc = subprocess.Popen([sys.executable, "-W ignore", "-c", code], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) out, _ = proc.communicate() diff --git a/tests/testutils.py b/tests/testutils.py index 264d739ce..e26849758 100644 --- a/tests/testutils.py +++ b/tests/testutils.py @@ -190,7 +190,7 @@ def assert_run_python_script(source_code, timeout=TIMEOUT): try: with open(source_file, 'wb') as f: f.write(source_code.encode('utf-8')) - cmd = [sys.executable, source_file] + cmd = [sys.executable, '-W ignore', source_file] cwd, env = _make_cwd_env() kwargs = { 'cwd': cwd, From c28d156e9ae03a23aa565e77859e5c648bbde53c Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 7 Mar 2019 15:03:30 +0100 Subject: [PATCH 14/70] CI test cloudpickle against python3.8 with hooks --- .travis.yml | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7661144e2..c99177d59 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,12 +3,23 @@ dist: xenial matrix: include: - - os: windows - language: sh - env: PYTHON_ROOT="/c/Python37" PYTHON_CHOCO_PKG="python3" - - os: windows - language: sh - env: PYTHON_ROOT="/c/Python27" PYTHON_CHOCO_PKG="python2" + # - os: windows + # language: sh + # env: PYTHON_ROOT="/c/Python37" PYTHON_CHOCO_PKG="python3" + # - os: windows + # language: sh + # env: PYTHON_ROOT="/c/Python27" PYTHON_CHOCO_PKG="python2" + # - os: linux + # dist: trusty + # python: "pypy3" + # - os: linux + # python: 3.7 + # - os: linux + # python: 3.6 + # - os: linux + # python: 3.5 + # - os: linux + # python: 2.7 - os: linux dist: trusty python: "pypy3" From dfea4f5f9af0e8e296e9d46973f85196a22ee850 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 27 Mar 2019 15:59:06 +0100 Subject: [PATCH 15/70] CLN de-duplicate utility functions --- cloudpickle/cloudpickle_fast.py | 81 ++------------------------------- 1 file changed, 5 insertions(+), 76 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 3c8507a8c..004b8496d 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -18,62 +18,12 @@ from _pickle import Pickler -load, loads = _pickle.load, _pickle.loads - - -# XXX: Uncovered code in cloudpickle is currently removed, as they lack a -# specific use case justifying their presence. Functions/Methods removed: -# - _restore_attr -# - _gen_ellipsis -# - everyting after (if obj.__dict__) in save_global - -# cloudpickle is meant for inter process communication: we expect all -# communicating processes to run the same Python version hence we favor -# communication speed over compatibility: -DEFAULT_PROTOCOL = pickle.HIGHEST_PROTOCOL - - -# relevant opcodes, used to detect global variables manipulation -# XXX: I think STORE_GLOBAL can actually be removed. -STORE_GLOBAL = opcode.opmap["STORE_GLOBAL"] -DELETE_GLOBAL = opcode.opmap["DELETE_GLOBAL"] -LOAD_GLOBAL = opcode.opmap["LOAD_GLOBAL"] -GLOBAL_OPS = (STORE_GLOBAL, DELETE_GLOBAL, LOAD_GLOBAL) - - -# map a type to its name in the types module. -_BUILTIN_TYPE_NAMES = {} -for k, v in types.__dict__.items(): - if type(v) is type: - _BUILTIN_TYPE_NAMES[v] = k - - -def _make__new__factory(type_): - def _factory(): - return type_.__new__ - - return _factory - - -# NOTE: These need to be module globals so that they're pickleable as globals. -_get_dict_new = _make__new__factory(dict) -_get_frozenset_new = _make__new__factory(frozenset) -_get_list_new = _make__new__factory(list) -_get_set_new = _make__new__factory(set) -_get_tuple_new = _make__new__factory(tuple) -_get_object_new = _make__new__factory(object) - -# Pre-defined set of builtin_function_or_method instances that can be -# serialized. -_BUILTIN_TYPE_CONSTRUCTORS = { - dict.__new__: _get_dict_new, - frozenset.__new__: _get_frozenset_new, - set.__new__: _get_set_new, - list.__new__: _get_list_new, - tuple.__new__: _get_tuple_new, - object.__new__: _get_object_new, -} +from .cloudpickle import ( + islambda, _is_dynamic, GLOBAL_OPS, _BUILTIN_TYPE_CONSTRUCTORS, + _BUILTIN_TYPE_NAMES, DEFAULT_PROTOCOL +) +load, loads = _pickle.load, _pickle.loads # Shorthands similar to pickle.dump/pickle.dumps @@ -112,27 +62,6 @@ def dumps(obj, protocol=None): # Utility functions introspecting objects to extract useful properties about # them. - - -def islambda(func): - return getattr(func, "__name__") == "" - - -def _is_dynamic(module): - """ Check if the module is importable by name - - Notable exceptions include modules created dynamically using - types.ModuleType - """ - # Quick check: module that have __file__ attribute are not dynamic modules. - if hasattr(module, "__file__"): - return False - - # XXX: there used to be backwad compat code for python 2 here. - if hasattr(module, "__spec__"): - return module.__spec__ is None - - def _find_loaded_submodules(globals, closure, co_names): """ Find submodules used by a function but not listed in its globals. From 3df299feb7d5f2843be0ccc43be56bf2f4f7acad Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 27 Mar 2019 16:58:43 +0100 Subject: [PATCH 16/70] CLN de-duplicate complex utilities functions --- cloudpickle/cloudpickle.py | 10 ++- cloudpickle/cloudpickle_fast.py | 123 ++++---------------------------- 2 files changed, 21 insertions(+), 112 deletions(-) diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index cfa7e54cd..b9ffef961 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -681,10 +681,16 @@ def save_function_tuple(self, func): save(_fill_function) # skeleton function updater write(pickle.MARK) # beginning of tuple that _fill_function expects - self._save_subimports( + subimports = _find_loaded_submodules( code, itertools.chain(f_globals.values(), closure_values or ()), ) + for s in subimports: + # ensure that subimport s is loaded at unpickling time + self.save(s) + # then discards the reference to it + self.write(pickle.POP) + # create a skeleton function object and memoize it save(_make_skel_func) @@ -746,7 +752,7 @@ def extract_func_data(self, func): code = func.__code__ # extract all global ref's - func_global_refs = self.extract_code_globals(code) + func_global_refs = extract_code_globals(code) # process all variables referenced by global environment f_globals = {} diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 004b8496d..fbc1a7fe1 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -8,6 +8,7 @@ import abc import dis import io +import itertools import logging import opcode import _pickle @@ -19,8 +20,9 @@ from _pickle import Pickler from .cloudpickle import ( - islambda, _is_dynamic, GLOBAL_OPS, _BUILTIN_TYPE_CONSTRUCTORS, - _BUILTIN_TYPE_NAMES, DEFAULT_PROTOCOL + islambda, _is_dynamic, extract_code_globals, GLOBAL_OPS, + _BUILTIN_TYPE_CONSTRUCTORS, _BUILTIN_TYPE_NAMES, DEFAULT_PROTOCOL, + _find_loaded_submodules, _get_cell_contents ) load, loads = _pickle.load, _pickle.loads @@ -60,111 +62,6 @@ def dumps(obj, protocol=None): file.close() -# Utility functions introspecting objects to extract useful properties about -# them. -def _find_loaded_submodules(globals, closure, co_names): - """ - Find submodules used by a function but not listed in its globals. - - In the example below: - - ``` - import xml.etree - import cloudpickle - - - def func(): - x = xml.etree.ElementTree - - - if __name__ == '__main__': - cloudpickle.dumps(func) - ``` - - the expression xml.etree.ElementTree generates a LOAD_GLOBAL for xml, but - simply LOAD_ATTR for etree and ElementTree - cloudpickle cannot detect - such submodules by bytecode inspection. There is actually no exact way of - detecting them, the method below is simply "good enough". For instance: - - import xml.etree - - def f(): - def g(): - return xml.etree - return g - - pickling f and trying to call f()() will raise a NameError - - """ - - referenced_submodules = {} - top_level_dependencies = list(globals.values()) - for cell in closure: - try: - top_level_dependencies.append(cell.cell_contents) - except ValueError: - continue - - # top_level_dependencies are variables that generated a LOAD_GlOBAL or a - # LOAD_DEREF opcode in code. - for x in top_level_dependencies: - if ( - isinstance(x, types.ModuleType) - and getattr(x, "__package__", None) is not None - ): - # check if the package has any currently loaded sub-imports - prefix = x.__name__ + "." - # A concurrent thread could mutate sys.modules, - # make sure we iterate over a copy to avoid exceptions - for name in list(sys.modules): - # Older versions of pytest will add a "None" module to - # sys.modules. - if name is not None and name.startswith(prefix): - # check whether the function can address the sub-module - tokens = set(name[len(prefix) :].split(".")) - if not tokens - set(co_names): - # ensure unpickler executes this import - referenced_submodules[name] = sys.modules[name] - return referenced_submodules - - -_extract_code_globals_cache = ( - weakref.WeakKeyDictionary() - if not hasattr(sys, "pypy_version_info") - else {} -) - - -def extract_code_globals(code, globals_): - """ - Find all globals names read or written to by codeblock co - """ - code_globals = _extract_code_globals_cache.get(code) - if code_globals is None: - code_globals = {} - # PyPy "builtin-code" do not have this structure - if hasattr(code, "co_names"): - # first, find, potential submodules that are hard to identify - instructions = dis.get_instructions(code) - for ins in instructions: - varname = ins.argval - if ins.opcode in GLOBAL_OPS and varname in globals_: - code_globals[varname] = globals_[varname] - - # Declaring a function inside another one using the "def ..." - # syntax generates a constant code object corresonding to the one - # of the nested function's As the nested function may itself need - # global variables, we need to introspect its code, extract its - # globals, (look for code object in it's co_consts attribute..) and - # add the result to code_globals - if code.co_consts: - for c in code.co_consts: - if isinstance(c, types.CodeType): - code_globals.update(extract_code_globals(c, globals_)) - - return code_globals - - # COLLECTION OF OBJECTS __getnewargs__-LIKE METHODS # ------------------------------------------------- @@ -254,12 +151,18 @@ def function_getstate(func): "__closure__": func.__closure__, } - f_globals = extract_code_globals(func.__code__, func.__globals__) + f_globals_ref = extract_code_globals(func.__code__) + f_globals = {k: func.__globals__[k] for k in f_globals_ref if k in + func.__globals__} + + closure_values = ( + list(map(_get_cell_contents, func.__closure__)) + if func.__closure__ is not None else () + ) # extract submodules referenced by attribute lookup (no global opcode) f_globals["__submodules__"] = _find_loaded_submodules( - f_globals, slotstate["__closure__"] or (), func.__code__.co_names - ) + func.__code__, itertools.chain(f_globals.values(), closure_values)) slotstate["__globals__"] = f_globals state = func.__dict__ From 78dd8c70405e52a9cf9a724c952b8cef3fbb3ae0 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 27 Mar 2019 16:59:18 +0100 Subject: [PATCH 17/70] TST fix test for cloudpickle <= 3.7 --- tests/cloudpickle_test.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/cloudpickle_test.py b/tests/cloudpickle_test.py index c81c6ae8f..ac2b07a21 100644 --- a/tests/cloudpickle_test.py +++ b/tests/cloudpickle_test.py @@ -1076,7 +1076,11 @@ def f(): # some setup is required to allow pytest apimodules to be correctly # serializable. from cloudpickle import CloudPickler - CloudPickler.dispatch[type(py.builtin)] = cloudpickle.module_reduce + if sys.version_info[:2] >= (3, 8): + CloudPickler.dispatch[type(py.builtin)] = cloudpickle.module_reduce + else: + CloudPickler.dispatch[type(py.builtin)] = CloudPickler.save_module + g = cloudpickle.loads(cloudpickle.dumps(f, protocol=self.protocol)) result = g() From c364505e5a99658890c7e0429cfce8c0c8cf2cc0 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 27 Mar 2019 17:02:55 +0100 Subject: [PATCH 18/70] DOC more explicit save_global fallback comment --- cloudpickle/cloudpickle_fast.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index fbc1a7fe1..f1e042acb 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -469,7 +469,9 @@ def class_reduce(obj): return dynamic_class_reduce(obj) else: - # if pickle.dumps worked out fine, then simply pickle by attribute + # if pickle.dumps worked out fine, then simply fallback to the + # traditional pickle by attribute # implemented in the builtin + # `Pickler.save_global`. return NotImplementedError From de675308a21291650b8ed8fbb1bdf33cbc0c5523 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 27 Mar 2019 18:02:33 +0100 Subject: [PATCH 19/70] CLN make reducers private --- cloudpickle/cloudpickle_fast.py | 106 ++++++++++++++++---------------- tests/cloudpickle_test.py | 4 +- 2 files changed, 56 insertions(+), 54 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index f1e042acb..70b189939 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -66,7 +66,7 @@ def dumps(obj, protocol=None): # ------------------------------------------------- -def function_getnewargs(func, globals_ref): +def _function_getnewargs(func, globals_ref): code = func.__code__ # base_globals represents the future global namespace of func at @@ -109,31 +109,31 @@ def function_getnewargs(func, globals_ref): # Fortunately, many non-accessible builtin-types are mirrored in the types # module. For those types, we pickle the function builtin_type_reconstructor # instead, that contains instruction to look them up via the types module. -def builtin_type_reconstructor(name): +def _builtin_type_reconstructor(name): """Return a builtin-type using attribute lookup from the types module""" return getattr(types, name) # XXX: what does "not working as desired" means? # hack for __import__ not working as desired -def module_reconstructor(name): +def _module_reconstructor(name): __import__(name) return sys.modules[name] -def dynamic_module_reconstructor(name, vars): +def _dynamic_module_reconstructor(name, vars): mod = types.ModuleType(name) mod.__dict__.update(vars) return mod -def file_reconstructor(retval): +def _file_reconstructor(retval): return retval # COLLECTION OF OBJECTS STATE GETTERS # ----------------------------------- -def function_getstate(func): +def _function_getstate(func): # - Put func's dynamic attributes (stored in func.__dict__) in state. These # attributes will be restored at unpickling time using # f.__dict__.update(state) @@ -184,11 +184,11 @@ def function_getstate(func): # to pass even without them -def builtin_type_reduce(obj): - return builtin_type_reconstructor, (_BUILTIN_TYPE_NAMES[obj],) +def _builtin_type_reduce(obj): + return _builtin_type_reconstructor, (_BUILTIN_TYPE_NAMES[obj],) -def code_reduce(obj): +def _code_reduce(obj): """codeobject reducer""" args = ( obj.co_argcount, @@ -210,7 +210,7 @@ def code_reduce(obj): return types.CodeType, args -def cell_reduce(obj): +def _cell_reduce(obj): """Cell (containing values of a function's free variables) reducer""" try: obj.cell_contents @@ -220,12 +220,12 @@ def cell_reduce(obj): return types.CellType, (obj.cell_contents,) -def classmethod_reduce(obj): +def _classmethod_reduce(obj): orig_func = obj.__func__ return type(obj), (orig_func,) -def file_reduce(obj): +def _file_reduce(obj): """Save a file""" import io @@ -269,51 +269,51 @@ def file_reduce(obj): retval.seek(curloc) retval.name = name - return file_reconstructor, (retval,) + return _file_reconstructor, (retval,) -def mappingproxy_reduce(obj): +def _mappingproxy_reduce(obj): return types.MappingProxyType, (dict(obj),) -def memoryview_reduce(obj): +def _memoryview_reduce(obj): return bytes, (obj.tobytes(),) -def module_reduce(obj): +def _module_reduce(obj): if _is_dynamic(obj): - return dynamic_module_reconstructor, (obj.__name__, vars(obj)) + return _dynamic_module_reconstructor, (obj.__name__, vars(obj)) else: - return module_reconstructor, (obj.__name__,) + return _module_reconstructor, (obj.__name__,) -def method_reduce(obj): +def _method_reduce(obj): return (types.MethodType, (obj.__func__, obj.__self__)) -def logger_reduce(obj): +def _logger_reduce(obj): return logging.getLogger, (obj.name,) -def root_logger_reduce(obj): +def _root_logger_reduce(obj): return logging.getLogger, () -def weakset_reduce(obj): +def _weakset_reduce(obj): return weakref.WeakSet, (list(obj),) -def dynamic_function_reduce(func, globals_ref): +def _dynamic_function_reduce(func, globals_ref): """Reduce a function that is not pickleable via attribute loookup. """ # XXX: should globals_ref be a global variable instead? The reason is # purely cosmetic. - newargs = function_getnewargs(func, globals_ref) - state = function_getstate(func) - return types.FunctionType, newargs, state, None, None, function_setstate + newargs = _function_getnewargs(func, globals_ref) + state = _function_getstate(func) + return types.FunctionType, newargs, state, None, None, _function_setstate -def function_reduce(obj, globals_ref): +def _function_reduce(obj, globals_ref): """Select the reducer depending on obj's dynamic nature This functions starts by replicating save_global: trying to retrieve obj @@ -372,17 +372,17 @@ def function_reduce(obj, globals_ref): or getattr(obj.__code__, "co_filename", None) == "" or themodule is None ): - return dynamic_function_reduce(obj, globals_ref=globals_ref) + return _dynamic_function_reduce(obj, globals_ref=globals_ref) # this whole code section may be cleanable: the if/else conditions + the # NotImplementedError look like they cover nearly all cases. else: # func is nested if lookedup_by_name is None or lookedup_by_name is not obj: - return dynamic_function_reduce(obj, globals_ref=globals_ref) + return _dynamic_function_reduce(obj, globals_ref=globals_ref) -def dynamic_class_reduce(obj): +def _dynamic_class_reduce(obj): """ Save a class that can't be stored as module global. @@ -439,17 +439,17 @@ def dynamic_class_reduce(obj): (clsdict, {}), None, None, - class_setstate, + _class_setstate, ) -def class_reduce(obj): +def _class_reduce(obj): """Select the reducer depending on the dynamic nature of the class obj""" # XXX: there used to be special handling for NoneType, EllipsisType and # NotImplementedType. As for now this module handles only python3.8+, this # code has been removed. if obj.__module__ == "__main__": - return dynamic_class_reduce(obj) + return _dynamic_class_reduce(obj) try: # All classes are caught in this function: pickleable classes are @@ -462,11 +462,11 @@ def class_reduce(obj): # is python 2 specific. if obj.__module__ == "builtins": if obj in _BUILTIN_TYPE_NAMES: - return builtin_type_reduce(obj) + return _builtin_type_reduce(obj) typ = type(obj) if typ is not obj and isinstance(obj, type): # noqa: E721 - return dynamic_class_reduce(obj) + return _dynamic_class_reduce(obj) else: # if pickle.dumps worked out fine, then simply fallback to the @@ -481,7 +481,7 @@ def class_reduce(obj): # it has to be updated to how it was at unpickling time. -def function_setstate(obj, state, slotstate): +def _function_setstate(obj, state, slotstate): """Update the state of a dynaamic function. As __closure__ and __globals__ are readonly attributes of a function, we @@ -510,7 +510,7 @@ def function_setstate(obj, state, slotstate): setattr(obj, k, v) -def class_setstate(obj, state, slotstate): +def _class_setstate(obj, state, slotstate): registry = None for attrname, attr in state.items(): if attrname == "_abc_impl": @@ -529,7 +529,7 @@ def class_setstate(obj, state, slotstate): # This set of functions aim at deciding whether an object can be properly # pickler by the c Pickler, or if it needs to be serialized using cloudpickle's # reducers. -def reduce_global(pickler, obj): +def _reduce_global(pickler, obj): """Custom reducing callback for functions and classes This function is the analog of a custom save_global. However, the C Pickler @@ -552,9 +552,9 @@ def reduce_global(pickler, obj): is_metaclass = False if is_metaclass: - return class_reduce(obj) + return _class_reduce(obj) elif isinstance(obj, types.FunctionType): - return function_reduce(obj, pickler.globals_ref) + return _function_reduce(obj, pickler.globals_ref) else: # fallback to save_global return NotImplementedError @@ -572,18 +572,18 @@ class CloudPickler(Pickler): """ dispatch = {} - dispatch[classmethod] = classmethod_reduce - dispatch[io.TextIOWrapper] = file_reduce - dispatch[logging.Logger] = logger_reduce - dispatch[logging.RootLogger] = root_logger_reduce - dispatch[memoryview] = memoryview_reduce - dispatch[staticmethod] = classmethod_reduce - dispatch[types.CellType] = cell_reduce - dispatch[types.CodeType] = code_reduce - dispatch[types.ModuleType] = module_reduce - dispatch[types.MethodType] = method_reduce - dispatch[types.MappingProxyType] = mappingproxy_reduce - dispatch[weakref.WeakSet] = weakset_reduce + dispatch[classmethod] = _classmethod_reduce + dispatch[io.TextIOWrapper] = _file_reduce + dispatch[logging.Logger] = _logger_reduce + dispatch[logging.RootLogger] = _root_logger_reduce + dispatch[memoryview] = _memoryview_reduce + dispatch[staticmethod] = _classmethod_reduce + dispatch[types.CellType] = _cell_reduce + dispatch[types.CodeType] = _code_reduce + dispatch[types.ModuleType] = _module_reduce + dispatch[types.MethodType] = _method_reduce + dispatch[types.MappingProxyType] = _mappingproxy_reduce + dispatch[weakref.WeakSet] = _weakset_reduce def __init__(self, file, protocol=None): if protocol is None: @@ -594,7 +594,7 @@ def __init__(self, file, protocol=None): # global namespace at unpickling time. self.globals_ref = {} self.dispatch_table = self.dispatch - self.global_hook = reduce_global + self.global_hook = _reduce_global self.proto = int(protocol) def dump(self, obj): diff --git a/tests/cloudpickle_test.py b/tests/cloudpickle_test.py index ac2b07a21..32e58af87 100644 --- a/tests/cloudpickle_test.py +++ b/tests/cloudpickle_test.py @@ -1077,7 +1077,9 @@ def f(): # serializable. from cloudpickle import CloudPickler if sys.version_info[:2] >= (3, 8): - CloudPickler.dispatch[type(py.builtin)] = cloudpickle.module_reduce + from cloudpickle import cloudpickle_fast as cp_fast + CloudPickler.dispatch[ + type(py.builtin)] = cp_fast._module_reduce else: CloudPickler.dispatch[type(py.builtin)] = CloudPickler.save_module From eb776463eeb063d342a3b805d5eb7a7d7bed37b0 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 28 Mar 2019 09:55:09 +0100 Subject: [PATCH 20/70] MNT backport 0.8.1 patch into cludpickle_fast --- cloudpickle/cloudpickle_fast.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 70b189939..02fc9be84 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -82,6 +82,15 @@ def _function_getnewargs(func, globals_ref): # multiple invokations are bound to the same Cloudpickler. base_globals = globals_ref.setdefault(id(func.__globals__), {}) + if base_globals == {}: + # Add module attributes used to resolve relative imports + # instructions inside func. + for k in ["__package__", "__name__", "__path__", "__file__"]: + # Some built-in functions/methods such as object.__new__ have + # their __globals__ set to None in PyPy + if func.__globals__ is not None and k in func.__globals__: + base_globals[k] = func.__globals__[k] + # Do not bind the free variables before the function is created to avoid # infinite recursion. if func.__closure__ is None: From 1dea4ab3f8db07dec6006b33f3b9187c0a27595d Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 28 Mar 2019 10:41:33 +0100 Subject: [PATCH 21/70] CLN unused imports --- cloudpickle/cloudpickle_fast.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 02fc9be84..9c9f416e7 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -6,11 +6,9 @@ python versions 3.8+, a lot of backward-compatibilty code is also removed. """ import abc -import dis import io import itertools import logging -import opcode import _pickle import pickle import sys @@ -20,9 +18,9 @@ from _pickle import Pickler from .cloudpickle import ( - islambda, _is_dynamic, extract_code_globals, GLOBAL_OPS, - _BUILTIN_TYPE_CONSTRUCTORS, _BUILTIN_TYPE_NAMES, DEFAULT_PROTOCOL, - _find_loaded_submodules, _get_cell_contents + islambda, _is_dynamic, extract_code_globals, _BUILTIN_TYPE_CONSTRUCTORS, + _BUILTIN_TYPE_NAMES, DEFAULT_PROTOCOL, _find_loaded_submodules, + _get_cell_contents ) load, loads = _pickle.load, _pickle.loads From 10afbe831b41643cc629be7dbad1312ded154b11 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 11 Apr 2019 16:15:57 +0200 Subject: [PATCH 22/70] CLN naming (subimports -> submodules) --- cloudpickle/cloudpickle.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index b9ffef961..edc1c17ca 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -681,11 +681,11 @@ def save_function_tuple(self, func): save(_fill_function) # skeleton function updater write(pickle.MARK) # beginning of tuple that _fill_function expects - subimports = _find_loaded_submodules( + submodules = _find_loaded_submodules( code, itertools.chain(f_globals.values(), closure_values or ()), ) - for s in subimports: + for s in submodules: # ensure that subimport s is loaded at unpickling time self.save(s) # then discards the reference to it From df3b5a2ca9cd09c6f7c6b1cba33a949a507ff508 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 11 Apr 2019 16:17:22 +0200 Subject: [PATCH 23/70] CLN handle the file in a context manager --- cloudpickle/cloudpickle_fast.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 9c9f416e7..7d7f92edb 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -51,14 +51,10 @@ def dumps(obj, protocol=None): Set protocol=pickle.DEFAULT_PROTOCOL instead if you need to ensure compatibility with older versions of Python. """ - file = io.BytesIO() - try: + with io.BytesIO() as file: cp = CloudPickler(file, protocol=protocol) cp.dump(obj) return file.getvalue() - finally: - file.close() - # COLLECTION OF OBJECTS __getnewargs__-LIKE METHODS # ------------------------------------------------- From e3344b032e5565f5eb840b34e9e84f1ddda9faf8 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 11 Apr 2019 16:17:54 +0200 Subject: [PATCH 24/70] CLN hide slotstate (possible implementation detail) --- cloudpickle/cloudpickle_fast.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 7d7f92edb..3b2aadbcc 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -484,13 +484,14 @@ def _class_reduce(obj): # it has to be updated to how it was at unpickling time. -def _function_setstate(obj, state, slotstate): +def _function_setstate(obj, state): """Update the state of a dynaamic function. As __closure__ and __globals__ are readonly attributes of a function, we cannot rely on the native setstate routine of pickle.load_build, that calls setattr on items of the slotstate. Instead, we have to modify them inplace. """ + state, slotstate = state obj.__dict__.update(state) obj_globals = slotstate.pop("__globals__") @@ -513,7 +514,8 @@ def _function_setstate(obj, state, slotstate): setattr(obj, k, v) -def _class_setstate(obj, state, slotstate): +def _class_setstate(obj, state): + state, slotstate = state registry = None for attrname, attr in state.items(): if attrname == "_abc_impl": From 749e88b9810ade25228729c29a54577d781e61fb Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 11 Apr 2019 16:40:27 +0200 Subject: [PATCH 25/70] CLN make extract_code_globals private --- cloudpickle/cloudpickle.py | 2 +- cloudpickle/cloudpickle_fast.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index edc1c17ca..203195639 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -752,7 +752,7 @@ def extract_func_data(self, func): code = func.__code__ # extract all global ref's - func_global_refs = extract_code_globals(code) + func_global_refs = _extract_code_globals(code) # process all variables referenced by global environment f_globals = {} diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 3b2aadbcc..7694f6dc9 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -18,7 +18,7 @@ from _pickle import Pickler from .cloudpickle import ( - islambda, _is_dynamic, extract_code_globals, _BUILTIN_TYPE_CONSTRUCTORS, + islambda, _is_dynamic, _extract_code_globals, _BUILTIN_TYPE_CONSTRUCTORS, _BUILTIN_TYPE_NAMES, DEFAULT_PROTOCOL, _find_loaded_submodules, _get_cell_contents ) @@ -154,7 +154,7 @@ def _function_getstate(func): "__closure__": func.__closure__, } - f_globals_ref = extract_code_globals(func.__code__) + f_globals_ref = _extract_code_globals(func.__code__) f_globals = {k: func.__globals__[k] for k in f_globals_ref if k in func.__globals__} From 09c38cd8bd89ada6f9827c3ab683d6e50b5aaa46 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Tue, 16 Apr 2019 17:15:32 +0200 Subject: [PATCH 26/70] CLN cleanup stale comments --- cloudpickle/cloudpickle_fast.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 7694f6dc9..d64c81d5d 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -529,11 +529,6 @@ def _class_setstate(obj, state): return obj -# Arbitration between builtin-save method and user-defined callbacks -# ------------------------------------------------------------------ -# This set of functions aim at deciding whether an object can be properly -# pickler by the c Pickler, or if it needs to be serialized using cloudpickle's -# reducers. def _reduce_global(pickler, obj): """Custom reducing callback for functions and classes @@ -542,13 +537,6 @@ def _reduce_global(pickler, obj): we return a reduce value the the Pickler will internally serialize via save_reduce. """ - # Classes deriving from custom, dynamic metaclasses won't get caught inside - # the hook_dispatch dict. In the legacy cloudpickle, this was not really a - # problem because not being present in the dispatch table meant falling - # back to save_global, which was already overriden by cloudpickle. Using - # the c pickler, save_global cannot be overriden, so we have manually check - # is obj's comes from a custom metaclass, and in this case, direct the - # object to save_global. t = type(obj) try: From 3151fc402ca84bf247241daf14b2da1d2f7c1c71 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Tue, 16 Apr 2019 17:16:11 +0200 Subject: [PATCH 27/70] CLN is_metaclass -> is_anyclass --- cloudpickle/cloudpickle_fast.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index d64c81d5d..9f38cd438 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -538,13 +538,12 @@ def _reduce_global(pickler, obj): save_reduce. """ t = type(obj) - try: - is_metaclass = issubclass(t, type) + is_anyclass = issubclass(t, type) except TypeError: # t is not a class (old Boost; see SF #502085) - is_metaclass = False + is_anyclass = False - if is_metaclass: + if is_anyclass: return _class_reduce(obj) elif isinstance(obj, types.FunctionType): return _function_reduce(obj, pickler.globals_ref) From cac49e44170f56d3cb328e84a79b92c469a17022 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Tue, 16 Apr 2019 17:17:28 +0200 Subject: [PATCH 28/70] CLN docstrings conventions --- cloudpickle/cloudpickle_fast.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 9f38cd438..bb5b08146 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -553,14 +553,14 @@ def _reduce_global(pickler, obj): class CloudPickler(Pickler): - """Fast C Pickler extension with additional reducing routines + """Fast C Pickler extension with additional reducing routines. - Cloudpickler's extensions exist into into: + Cloudpickler's extensions exist into into: - * it's dispatch_table containing reducers that are called only if ALL - built-in saving functions were previously discarded. - * a special callback, invoked before standard function/class - builtin-saving method (save_global), to serialize dynamic functions + * it's dispatch_table containing reducers that are called only if ALL + built-in saving functions were previously discarded. + * a special callback, invoked before standard function/class + builtin-saving method (save_global), to serialize dynamic functions """ dispatch = {} From c9402e9c4fcb88c83bc3e420dc5bf4b34c9a07cc Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Tue, 16 Apr 2019 17:24:10 +0200 Subject: [PATCH 29/70] CLN explain cloudpickle global_hook use_case --- cloudpickle/cloudpickle_fast.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index bb5b08146..25a7317a7 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -548,7 +548,7 @@ def _reduce_global(pickler, obj): elif isinstance(obj, types.FunctionType): return _function_reduce(obj, pickler.globals_ref) else: - # fallback to save_global + # fallback to save_global, including the pickler's distpatch_table return NotImplementedError @@ -586,6 +586,18 @@ def __init__(self, file, protocol=None): # global namespace at unpickling time. self.globals_ref = {} self.dispatch_table = self.dispatch + + # Pickling functions and classes cannot be customized using the + # dispatch_table: indeed, pickling an object using the dispatch_table + # works by invoking a reducer specific to the object's type. When the + # object is a class, its type is often ``type``, except when the class + # is an instance of another metaclass. In this cased, the metaclass + # will likely not be known in advance, and thus cannot be special-cased + # using an entry in the dispatch_table. + + # The pickler's global_hook, among other things, allows us to register + # a reducer that will be called for any class, independently of its + # type. self.global_hook = _reduce_global self.proto = int(protocol) From 841f33fe2f1d6c1d39307f667058131b4a33300c Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Tue, 16 Apr 2019 17:28:38 +0200 Subject: [PATCH 30/70] MNT better compat with early python3.8 versions --- cloudpickle/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cloudpickle/__init__.py b/cloudpickle/__init__.py index 9d458368f..143f24ad7 100644 --- a/cloudpickle/__init__.py +++ b/cloudpickle/__init__.py @@ -1,9 +1,10 @@ from __future__ import absolute_import import sys +import pickle -if sys.version_info[:2] >= (3, 8): +if hasattr(pickle.Pickler, 'global_hook'): from cloudpickle.cloudpickle_fast import * else: from cloudpickle.cloudpickle import * From ea21d065d06289634b3961e8fa69be63f781eefd Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 17 Apr 2019 10:26:03 +0200 Subject: [PATCH 31/70] CLN stale comments --- cloudpickle/cloudpickle_fast.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 25a7317a7..057f32bab 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -309,8 +309,6 @@ def _weakset_reduce(obj): def _dynamic_function_reduce(func, globals_ref): """Reduce a function that is not pickleable via attribute loookup. """ - # XXX: should globals_ref be a global variable instead? The reason is - # purely cosmetic. newargs = _function_getnewargs(func, globals_ref) state = _function_getstate(func) return types.FunctionType, newargs, state, None, None, _function_setstate @@ -394,18 +392,10 @@ def _dynamic_class_reduce(obj): from global modules. """ # XXX: This code is nearly untouch with regards to the legacy cloudpickle. - # It is pretty and hard to understand. Maybe refactor it by dumping - # potential python2 specific code and making a trading off optimizations in - # favor of readbility. + # It is pretty and hard to understand. clsdict = dict(obj.__dict__) # copy dict proxy to a dict clsdict.pop("__weakref__", None) - # XXX: I am trying to add the abc-registered subclasses into the class - # reconstructor, because using save_reduce semantics prevents us to perform - # any other operation than state updating after obj is created. - - # I may encounter reference cycles, although there seems to be checks - # preventing this to happen. if "_abc_impl" in clsdict: (registry, _, _, _) = abc._get_dump(obj) clsdict["_abc_impl"] = [ @@ -461,8 +451,6 @@ def _class_reduce(obj): # obj is either a non-pickleable builtin or dynamic. pickle.dumps(obj) except Exception: - # XXX: previously, we also looked for the __builtin__ module, but this - # is python 2 specific. if obj.__module__ == "builtins": if obj in _BUILTIN_TYPE_NAMES: return _builtin_type_reduce(obj) @@ -537,6 +525,8 @@ def _reduce_global(pickler, obj): we return a reduce value the the Pickler will internally serialize via save_reduce. """ + # XXX: This functions needs the current active pickler to pass its + # globals_ref to _function_reduce t = type(obj) try: is_anyclass = issubclass(t, type) From 0145a1c7e8ce8125c692da67880859de9223b19d Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 19 Apr 2019 16:06:27 +0200 Subject: [PATCH 32/70] MNT update to comply cpython PR changes --- cloudpickle/__init__.py | 2 +- cloudpickle/cloudpickle_fast.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cloudpickle/__init__.py b/cloudpickle/__init__.py index 143f24ad7..7c301e995 100644 --- a/cloudpickle/__init__.py +++ b/cloudpickle/__init__.py @@ -4,7 +4,7 @@ import pickle -if hasattr(pickle.Pickler, 'global_hook'): +if hasattr(pickle.Pickler, 'reducer_override'): from cloudpickle.cloudpickle_fast import * else: from cloudpickle.cloudpickle import * diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 057f32bab..3b9e3b29f 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -359,7 +359,7 @@ def _function_reduce(obj, globals_ref): if lookedup_by_name is obj: # in this case, module is None # if obj exists in a filesytem-backed module, let the builtin pickle # saving routines save obj - return NotImplementedError + return NotImplemented # XXX: the special handling of builtin_function_or_method is removed as # currently this hook is not called for such instances, as opposed to @@ -376,7 +376,7 @@ def _function_reduce(obj, globals_ref): return _dynamic_function_reduce(obj, globals_ref=globals_ref) # this whole code section may be cleanable: the if/else conditions + the - # NotImplementedError look like they cover nearly all cases. + # NotImplemented look like they cover nearly all cases. else: # func is nested if lookedup_by_name is None or lookedup_by_name is not obj: @@ -463,7 +463,7 @@ def _class_reduce(obj): # if pickle.dumps worked out fine, then simply fallback to the # traditional pickle by attribute # implemented in the builtin # `Pickler.save_global`. - return NotImplementedError + return NotImplemented # COLLECTIONS OF OBJECTS STATE SETTERS @@ -539,7 +539,7 @@ def _reduce_global(pickler, obj): return _function_reduce(obj, pickler.globals_ref) else: # fallback to save_global, including the pickler's distpatch_table - return NotImplementedError + return NotImplemented class CloudPickler(Pickler): @@ -585,10 +585,10 @@ def __init__(self, file, protocol=None): # will likely not be known in advance, and thus cannot be special-cased # using an entry in the dispatch_table. - # The pickler's global_hook, among other things, allows us to register - # a reducer that will be called for any class, independently of its - # type. - self.global_hook = _reduce_global + # The pickler's reducer_override, among other things, allows us to + # register a reducer that will be called for any class, independently + # of its type. + self.reducer_override = _reduce_global self.proto = int(protocol) def dump(self, obj): From 0aba01764d74a46b881656bea2a6184fc5d22288 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 26 Apr 2019 16:45:17 +0200 Subject: [PATCH 33/70] MNT use the new pickler subclassing API --- cloudpickle/__init__.py | 2 +- cloudpickle/cloudpickle_fast.py | 48 +++++++++++++++------------------ 2 files changed, 23 insertions(+), 27 deletions(-) diff --git a/cloudpickle/__init__.py b/cloudpickle/__init__.py index 7c301e995..3debb3cdf 100644 --- a/cloudpickle/__init__.py +++ b/cloudpickle/__init__.py @@ -4,7 +4,7 @@ import pickle -if hasattr(pickle.Pickler, 'reducer_override'): +if sys.version_info[:2] >= (3, 8): from cloudpickle.cloudpickle_fast import * else: from cloudpickle.cloudpickle import * diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 3b9e3b29f..8765715a5 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -517,31 +517,6 @@ def _class_setstate(obj, state): return obj -def _reduce_global(pickler, obj): - """Custom reducing callback for functions and classes - - This function is the analog of a custom save_global. However, the C Pickler - API does not expose low-level instructions such as save or write. Instead, - we return a reduce value the the Pickler will internally serialize via - save_reduce. - """ - # XXX: This functions needs the current active pickler to pass its - # globals_ref to _function_reduce - t = type(obj) - try: - is_anyclass = issubclass(t, type) - except TypeError: # t is not a class (old Boost; see SF #502085) - is_anyclass = False - - if is_anyclass: - return _class_reduce(obj) - elif isinstance(obj, types.FunctionType): - return _function_reduce(obj, pickler.globals_ref) - else: - # fallback to save_global, including the pickler's distpatch_table - return NotImplemented - - class CloudPickler(Pickler): """Fast C Pickler extension with additional reducing routines. @@ -588,9 +563,30 @@ def __init__(self, file, protocol=None): # The pickler's reducer_override, among other things, allows us to # register a reducer that will be called for any class, independently # of its type. - self.reducer_override = _reduce_global self.proto = int(protocol) + def reducer_override(self, obj): + """Custom reducing callback for functions and classes + + This function is the analog of a custom save_global. However, the C + Pickler API does not expose low-level instructions such as save or + write. Instead, we return a reduce value the the Pickler will + internally serialize via save_reduce. + """ + t = type(obj) + try: + is_anyclass = issubclass(t, type) + except TypeError: # t is not a class (old Boost; see SF #502085) + is_anyclass = False + + if is_anyclass: + return _class_reduce(obj) + elif isinstance(obj, types.FunctionType): + return _function_reduce(obj, self.globals_ref) + else: + # fallback to save_global, including the pickler's distpatch_table + return NotImplemented + def dump(self, obj): try: return Pickler.dump(self, obj) From 707ec295a2b3e666c8554589ddde68201f0e6b99 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 22 May 2019 18:49:49 +0200 Subject: [PATCH 34/70] MNT update to recent changes in master --- cloudpickle/cloudpickle.py | 77 ++++++++++ cloudpickle/cloudpickle_fast.py | 255 ++++++++++++++------------------ 2 files changed, 187 insertions(+), 145 deletions(-) diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index 203195639..e7bede6fb 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -107,6 +107,12 @@ else: from _frozen_importlib import _find_spec +_extract_code_globals_cache = ( + weakref.WeakKeyDictionary() + if not hasattr(sys, "pypy_version_info") + else {}) + + def _ensure_tracking(class_def): with _DYNAMIC_CLASS_TRACKER_LOCK: @@ -200,6 +206,77 @@ def _is_global(obj, name=None): return obj2 is obj +def _extract_code_globals(co): + """ + Find all globals names read or written to by codeblock co + """ + out_names = _extract_code_globals_cache.get(co) + if out_names is None: + try: + names = co.co_names + except AttributeError: + # PyPy "builtin-code" object + out_names = set() + else: + out_names = {names[oparg] for _, oparg in _walk_global_ops(co)} + + # Declaring a function inside another one using the "def ..." + # syntax generates a constant code object corresonding to the one + # of the nested function's As the nested function may itself need + # global variables, we need to introspect its code, extract its + # globals, (look for code object in it's co_consts attribute..) and + # add the result to code_globals + if co.co_consts: + for const in co.co_consts: + if isinstance(const, types.CodeType): + out_names |= _extract_code_globals(const) + + _extract_code_globals_cache[co] = out_names + + return out_names + + +def _find_loaded_submodules(code, top_level_dependencies): + """ + Save submodules used by a function but not listed in its globals. + In the example below: + ``` + import concurrent.futures + import cloudpickle + def func(): + x = concurrent.futures.ThreadPoolExecutor + if __name__ == '__main__': + cloudpickle.dumps(func) + ``` + the globals extracted by cloudpickle in the function's state include + the concurrent package, but not its submodule (here, + concurrent.futures), which is the module used by func. + To ensure that calling the depickled function does not raise an + AttributeError, this function looks for any currently loaded submodule + that the function uses and whose parent is present in the function + globals, and saves it before saving the function. + """ + + subimports = [] + # check if any known dependency is an imported package + for x in top_level_dependencies: + if (isinstance(x, types.ModuleType) and + hasattr(x, '__package__') and x.__package__): + # check if the package has any currently loaded sub-imports + prefix = x.__name__ + '.' + # A concurrent thread could mutate sys.modules, + # make sure we iterate over a copy to avoid exceptions + for name in list(sys.modules): + # Older versions of pytest will add a "None" module to + # sys.modules. + if name is not None and name.startswith(prefix): + # check whether the function can address the sub-module + tokens = set(name[len(prefix):].split('.')) + if not tokens - set(code.co_names): + subimports.append(sys.modules[name]) + return subimports + + def _make_cell_set_template_code(): """Get the Python compiler to emit LOAD_FAST(arg); STORE_DEREF diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 8765715a5..b6e5b332a 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -18,9 +18,10 @@ from _pickle import Pickler from .cloudpickle import ( - islambda, _is_dynamic, _extract_code_globals, _BUILTIN_TYPE_CONSTRUCTORS, - _BUILTIN_TYPE_NAMES, DEFAULT_PROTOCOL, _find_loaded_submodules, - _get_cell_contents + _is_dynamic, _extract_code_globals, _BUILTIN_TYPE_NAMES, DEFAULT_PROTOCOL, + _find_loaded_submodules, _get_cell_contents, _is_global, _builtin_type, + Enum, _ensure_tracking, _lookup_class_or_track, _make_skeleton_class, + _make_skeleton_enum, _extract_class_dict, string_types ) load, loads = _pickle.load, _pickle.loads @@ -95,6 +96,29 @@ def _function_getnewargs(func, globals_ref): return code, base_globals, None, None, closure +def _class_getnewargs(obj): + # On PyPy, __doc__ is a readonly attribute, so we need to include it in + # the initial skeleton class. This is safe because we know that the + # doc can't participate in a cycle with the original class. + type_kwargs = {'__doc__': obj.__dict__.get('__doc__', None)} + + if hasattr(obj, "__slots__"): + type_kwargs["__slots__"] = obj.__slots__ + + __dict__ = obj.__dict__.get('__dict__', None) + if isinstance(__dict__, property): + type_kwargs['__dict__'] = __dict__ + + return (type(obj), obj.__name__, obj.__bases__, type_kwargs, + _ensure_tracking(obj), None) + + +def _enum_getnewargs(obj): + members = dict((e.name, e.value) for e in obj) + return (obj.__bases__, obj.__name__, obj.__qualname__, members, + obj.__module__, _ensure_tracking(obj), None) + + # COLLECTION OF OBJECTS RECONSTRUCTORS # ------------------------------------ @@ -172,6 +196,49 @@ def _function_getstate(func): return state, slotstate +def _class_getstate(obj): + clsdict = _extract_class_dict(obj) + clsdict.pop('__weakref__', None) + clsdict.pop('__doc__', None) # present in the reconstructor args + + # For ABCMeta in python3.7+, remove _abc_impl as it is not picklable. + # This is a fix which breaks the cache but this only makes the first + # calls to issubclass slower. + if "_abc_impl" in clsdict: + (registry, _, _, _) = abc._get_dump(obj) + clsdict["_abc_impl"] = [subclass_weakref() + for subclass_weakref in registry] + if hasattr(obj, "__slots__"): + # pickle string length optimization: member descriptors of obj are + # created automatically from obj's __slots__ attribute, no need to + # save them in obj's state + if isinstance(obj.__slots__, string_types): + clsdict.pop(obj.__slots__) + else: + for k in obj.__slots__: + clsdict.pop(k, None) + + clsdict.pop('__dict__', None) # unpickleable property object + + return (clsdict, {}) + + +def _enum_getstate(obj): + clsdict, slotstate = _class_getstate(obj) + + members = dict((e.name, e.value) for e in obj) + # Cleanup the clsdict that will be passed to _rehydrate_skeleton_class: + # Those attributes are already handled by the metaclass. + for attrname in ["_generate_next_value_", "_member_names_", + "_member_map_", "_member_type_", + "_value2member_map_"]: + clsdict.pop(attrname, None) + for member in members: + clsdict.pop(member) + # Special handling of Enum subclasses + return clsdict, slotstate + + # COLLECTIONS OF OBJECTS REDUCERS # ------------------------------- # A reducer is a function taking a single argument (obj), and that returns a @@ -193,23 +260,23 @@ def _builtin_type_reduce(obj): def _code_reduce(obj): """codeobject reducer""" - args = ( - obj.co_argcount, - obj.co_kwonlyargcount, - obj.co_nlocals, - obj.co_stacksize, - obj.co_flags, - obj.co_code, - obj.co_consts, - obj.co_names, - obj.co_varnames, - obj.co_filename, - obj.co_name, - obj.co_firstlineno, - obj.co_lnotab, - obj.co_freevars, - obj.co_cellvars, - ) + if hasattr(obj, "co_posonlyargcount"): # pragma: no branch + args = ( + obj.co_argcount, obj.co_posonlyargcount, + obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, + obj.co_flags, obj.co_code, obj.co_consts, obj.co_names, + obj.co_varnames, obj.co_filename, obj.co_name, + obj.co_firstlineno, obj.co_lnotab, obj.co_freevars, + obj.co_cellvars + ) + else: + args = ( + obj.co_argcount, obj.co_kwonlyargcount, obj.co_nlocals, + obj.co_stacksize, obj.co_flags, obj.co_code, obj.co_consts, + obj.co_names, obj.co_varnames, obj.co_filename, + obj.co_name, obj.co_firstlineno, obj.co_lnotab, + obj.co_freevars, obj.co_cellvars + ) return types.CodeType, args @@ -321,66 +388,9 @@ def _function_reduce(obj, globals_ref): from an attribute lookup of a file-backed module. If this check fails, then a custom reducer is called. """ - if obj in _BUILTIN_TYPE_CONSTRUCTORS: - # We keep a special-cased cache of built-in type constructors at - # global scope, because these functions are structured very - # differently in different python versions and implementations (for - # example, they're instances of types.BuiltinFunctionType in - # CPython, but they're ordinary types.FunctionType instances in - # PyPy). - # - # If the function we've received is in that cache, we just - # serialize it as a lookup into the cache. - return _BUILTIN_TYPE_CONSTRUCTORS[obj], () - - name = obj.__name__ - try: - modname = pickle.whichmodule(obj, name) - except Exception: - modname = None - - try: - themodule = sys.modules[modname] - except KeyError: - # eval'd items such as namedtuple give invalid items for their function - # __module__ - modname = "__main__" - - if modname == "__main__": - # we do not want the incoming module attribute lookup to succeed for - # the __main__ module. - themodule = None - - try: - lookedup_by_name = getattr(themodule, name, None) - except Exception: - lookedup_by_name = None - - if lookedup_by_name is obj: # in this case, module is None - # if obj exists in a filesytem-backed module, let the builtin pickle - # saving routines save obj - return NotImplemented - - # XXX: the special handling of builtin_function_or_method is removed as - # currently this hook is not called for such instances, as opposed to - # cloudpickle. - - # if func is lambda, def'ed at prompt, is in main, or is nested, then - # we'll pickle the actual function object rather than simply saving a - # reference (as is done in default pickler), via save_function_tuple. - if ( - islambda(obj) - or getattr(obj.__code__, "co_filename", None) == "" - or themodule is None - ): - return _dynamic_function_reduce(obj, globals_ref=globals_ref) - - # this whole code section may be cleanable: the if/else conditions + the - # NotImplemented look like they cover nearly all cases. - else: - # func is nested - if lookedup_by_name is None or lookedup_by_name is not obj: - return _dynamic_function_reduce(obj, globals_ref=globals_ref) + if not _is_global(obj): + return _dynamic_function_reduce(obj, globals_ref) + return NotImplemented def _dynamic_class_reduce(obj): @@ -391,49 +401,16 @@ def _dynamic_class_reduce(obj): functions, or that otherwise can't be serialized as attribute lookups from global modules. """ - # XXX: This code is nearly untouch with regards to the legacy cloudpickle. - # It is pretty and hard to understand. - clsdict = dict(obj.__dict__) # copy dict proxy to a dict - clsdict.pop("__weakref__", None) - - if "_abc_impl" in clsdict: - (registry, _, _, _) = abc._get_dump(obj) - clsdict["_abc_impl"] = [ - subclass_weakref() for subclass_weakref in registry - ] - - # On PyPy, __doc__ is a readonly attribute, so we need to include it in - # the initial skeleton class. This is safe because we know that the - # doc can't participate in a cycle with the original class. - type_kwargs = {"__doc__": clsdict.pop("__doc__", None)} - - if hasattr(obj, "__slots__"): - type_kwargs["__slots__"] = obj.__slots__ - # Pickle string length optimization: member descriptors of obj are - # created automatically from obj's __slots__ attribute, no need to - # save them in obj's state - if isinstance(obj.__slots__, str): - clsdict.pop(obj.__slots__) - else: - for k in obj.__slots__: - clsdict.pop(k, None) - - # If type overrides __dict__ as a property, include it in the type kwargs. - # In Python 2, we can't set this attribute after construction. - # XXX: removed special handling of __dict__ for python2 - __dict__ = clsdict.pop("__dict__", None) - if isinstance(__dict__, property): - type_kwargs["__dict__"] = __dict__ - __dict__ = None - - return ( - type(obj), - (obj.__name__, obj.__bases__, type_kwargs), - (clsdict, {}), - None, - None, - _class_setstate, - ) + if Enum is not None and issubclass(obj, Enum): + return ( + _make_skeleton_enum, _enum_getnewargs(obj), _enum_getstate(obj), + None, None, _class_setstate + ) + else: + return ( + _make_skeleton_class, _class_getnewargs(obj), _class_getstate(obj), + None, None, _class_setstate + ) def _class_reduce(obj): @@ -441,29 +418,17 @@ def _class_reduce(obj): # XXX: there used to be special handling for NoneType, EllipsisType and # NotImplementedType. As for now this module handles only python3.8+, this # code has been removed. - if obj.__module__ == "__main__": + if obj is type(None): # noqa + return type, (None,) + elif obj is type(Ellipsis): + return type, (Ellipsis,) + elif obj is type(NotImplemented): + return type, (NotImplemented,) + elif obj in _BUILTIN_TYPE_NAMES: + return _builtin_type, (_BUILTIN_TYPE_NAMES[obj],) + elif not _is_global(obj): return _dynamic_class_reduce(obj) - - try: - # All classes are caught in this function: pickleable classes are - # filtered out by creating a Pickler with no custom class reducer - # (thus, falling back to save_global). If it fails to save obj, then - # obj is either a non-pickleable builtin or dynamic. - pickle.dumps(obj) - except Exception: - if obj.__module__ == "builtins": - if obj in _BUILTIN_TYPE_NAMES: - return _builtin_type_reduce(obj) - - typ = type(obj) - if typ is not obj and isinstance(obj, type): # noqa: E721 - return _dynamic_class_reduce(obj) - - else: - # if pickle.dumps worked out fine, then simply fallback to the - # traditional pickle by attribute # implemented in the builtin - # `Pickler.save_global`. - return NotImplemented + return NotImplemented # COLLECTIONS OF OBJECTS STATE SETTERS From b92ac584a4712ede3bf1d9e106fe0140683ae547 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 22 May 2019 18:56:25 +0200 Subject: [PATCH 35/70] CLN cleanups --- cloudpickle/cloudpickle_fast.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index b6e5b332a..bb7cf758e 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -27,8 +27,6 @@ load, loads = _pickle.load, _pickle.loads # Shorthands similar to pickle.dump/pickle.dumps - - def dump(obj, file, protocol=None): """Serialize obj as bytes streamed into file @@ -57,10 +55,10 @@ def dumps(obj, protocol=None): cp.dump(obj) return file.getvalue() + # COLLECTION OF OBJECTS __getnewargs__-LIKE METHODS # ------------------------------------------------- - def _function_getnewargs(func, globals_ref): code = func.__code__ From 920949e1abd3c5869919b33fb7d6b51bc3c96c30 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 23 May 2019 10:53:12 +0200 Subject: [PATCH 36/70] [ci python-nightly] fix coverage failure --- dev-requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/dev-requirements.txt b/dev-requirements.txt index faff1709e..1849f20d5 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -6,3 +6,4 @@ psutil futures; python_version < '3.4' # Code coverage uploader for Travis: codecov +coverage From 54e73418d892a2d3d4cf531e22a5a82ca4c4b093 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 23 May 2019 12:18:42 +0200 Subject: [PATCH 37/70] [ci python-nightly] fix coverage failure (2) --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index c99177d59..bc43c7fee 100644 --- a/.travis.yml +++ b/.travis.yml @@ -137,5 +137,5 @@ script: fi fi after_success: - - coverage combine --append - - codecov + - $PYTHON_EXE -m coverage combine --append + - $PYTHON_EXE -m codecov From 2357fa4e29465dae4d52440b232995fdd1ce51e7 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 23 May 2019 13:38:14 +0200 Subject: [PATCH 38/70] [ci python-nightly] fix coverage failure (3) --- .travis.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index bc43c7fee..a50a5656a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,6 +26,7 @@ matrix: - os: linux if: commit_message =~ /(\[ci python-nightly\])/ env: PYTHON_NIGHTLY=1 + python: 3.7 - os: linux python: 3.7 - os: linux @@ -137,5 +138,5 @@ script: fi fi after_success: - - $PYTHON_EXE -m coverage combine --append - - $PYTHON_EXE -m codecov + - coverage combine --append + - codecov From d753cdf0242ba1de555116a615bcf87a6d9ff35d Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 23 May 2019 14:17:30 +0200 Subject: [PATCH 39/70] [ci python-nightly] fix coverage failure (4) --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index a50a5656a..b958331c9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -138,5 +138,6 @@ script: fi fi after_success: + - pip install coverage codecov - coverage combine --append - codecov From d79c3a9ce2a737c1cb77e0e3a7d03fdd37c2bdeb Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 23 May 2019 14:48:21 +0200 Subject: [PATCH 40/70] CI re-enable windows builds --- .travis.yml | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/.travis.yml b/.travis.yml index b958331c9..2ea0971df 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,23 +3,12 @@ dist: xenial matrix: include: - # - os: windows - # language: sh - # env: PYTHON_ROOT="/c/Python37" PYTHON_CHOCO_PKG="python3" - # - os: windows - # language: sh - # env: PYTHON_ROOT="/c/Python27" PYTHON_CHOCO_PKG="python2" - # - os: linux - # dist: trusty - # python: "pypy3" - # - os: linux - # python: 3.7 - # - os: linux - # python: 3.6 - # - os: linux - # python: 3.5 - # - os: linux - # python: 2.7 + - os: windows + language: sh + env: PYTHON_ROOT="/c/Python37" PYTHON_CHOCO_PKG="python3" + - os: windows + language: sh + env: PYTHON_ROOT="/c/Python27" PYTHON_CHOCO_PKG="python2" - os: linux dist: trusty python: "pypy3" From 8db031afaca99c45ee29cd99a906ddc7d405d5df Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 23 May 2019 15:02:56 +0200 Subject: [PATCH 41/70] CLN duplicated code --- cloudpickle/cloudpickle_fast.py | 45 +++------------------------------ 1 file changed, 4 insertions(+), 41 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index bb7cf758e..27be4abd0 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -21,7 +21,8 @@ _is_dynamic, _extract_code_globals, _BUILTIN_TYPE_NAMES, DEFAULT_PROTOCOL, _find_loaded_submodules, _get_cell_contents, _is_global, _builtin_type, Enum, _ensure_tracking, _lookup_class_or_track, _make_skeleton_class, - _make_skeleton_enum, _extract_class_dict, string_types + _make_skeleton_enum, _extract_class_dict, string_types, dynamic_subimport, + subimport ) load, loads = _pickle.load, _pickle.loads @@ -119,39 +120,6 @@ def _enum_getnewargs(obj): # COLLECTION OF OBJECTS RECONSTRUCTORS # ------------------------------------ - -# Builtin types are types defined in the python language source code, that are -# not defined in an importable python module (Lib/* for pure python module, -# Modules/* for C-implemented modules). The most wildely used ones (such as -# tuple, dict, list) are made accessible in any interpreter session by exposing -# them in the builtin namespace at startup time. - -# By construction, builtin types do not have a module. Trying to access their -# __module__ attribute will default to 'builtins', that only contains builtin -# types accessible at interpreter startup. Therefore, trying to pickle the -# other ones using classic module attribute lookup instructions will fail. - -# Fortunately, many non-accessible builtin-types are mirrored in the types -# module. For those types, we pickle the function builtin_type_reconstructor -# instead, that contains instruction to look them up via the types module. -def _builtin_type_reconstructor(name): - """Return a builtin-type using attribute lookup from the types module""" - return getattr(types, name) - - -# XXX: what does "not working as desired" means? -# hack for __import__ not working as desired -def _module_reconstructor(name): - __import__(name) - return sys.modules[name] - - -def _dynamic_module_reconstructor(name, vars): - mod = types.ModuleType(name) - mod.__dict__.update(vars) - return mod - - def _file_reconstructor(retval): return retval @@ -251,11 +219,6 @@ def _enum_getstate(obj): # XXX: no itemgetter/attrgetter reducer support implemented as the tests seem # to pass even without them - -def _builtin_type_reduce(obj): - return _builtin_type_reconstructor, (_BUILTIN_TYPE_NAMES[obj],) - - def _code_reduce(obj): """codeobject reducer""" if hasattr(obj, "co_posonlyargcount"): # pragma: no branch @@ -350,9 +313,9 @@ def _memoryview_reduce(obj): def _module_reduce(obj): if _is_dynamic(obj): - return _dynamic_module_reconstructor, (obj.__name__, vars(obj)) + return dynamic_subimport, (obj.__name__, vars(obj)) else: - return _module_reconstructor, (obj.__name__,) + return subimport, (obj.__name__,) def _method_reduce(obj): From 6e70c2d996b5e5fe82008a4973fed5a9d525c72a Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 5 Jun 2019 10:45:54 +0200 Subject: [PATCH 42/70] MNT rebasing mistakes --- cloudpickle/cloudpickle.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index e7bede6fb..db35a215d 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -102,11 +102,6 @@ PY2 = False from importlib._bootstrap import _find_spec - if platform.python_implementation() == 'PyPy': - from importlib._bootstrap import _find_spec - else: - from _frozen_importlib import _find_spec - _extract_code_globals_cache = ( weakref.WeakKeyDictionary() if not hasattr(sys, "pypy_version_info") @@ -194,8 +189,8 @@ def _is_global(obj, name=None): # supported, as the standard pickle does not support it either. return False + # module has been added to sys.modules, but it can still be dynamic. if _is_dynamic(module): - # module has been added to sys.modules, but it can still be dynamic. return False try: From e439f7af795abee502d33ea52f3508695f6cd6f3 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 5 Jun 2019 11:15:42 +0200 Subject: [PATCH 43/70] CLN make some reducers CloudPickler methods also, typos, comments --- cloudpickle/cloudpickle_fast.py | 148 ++++++++++++++++---------------- 1 file changed, 75 insertions(+), 73 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 27be4abd0..24b72d6f8 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -1,9 +1,10 @@ """ -New, fast version of the Cloudpickler. +New, fast version of the CloudPickler. -This new Cloudpickler class can now extend the fast C Pickler instead of the -previous pythonic Pickler. Because this functionality is only available for -python versions 3.8+, a lot of backward-compatibilty code is also removed. +This new CloudPickler class can now extend the fast C Pickler instead of the +previous python implementation of the Pickler class. Because this functionality +is only available for python versions 3.8+, a lot of backward-compatibility +code is also removed. """ import abc import io @@ -60,41 +61,6 @@ def dumps(obj, protocol=None): # COLLECTION OF OBJECTS __getnewargs__-LIKE METHODS # ------------------------------------------------- -def _function_getnewargs(func, globals_ref): - code = func.__code__ - - # base_globals represents the future global namespace of func at - # unpickling time. Looking it up and storing it in globals_ref allow - # functions sharing the same globals at pickling time to also - # share them once unpickled, at one condition: since globals_ref is - # an attribute of a Cloudpickler instance, and that a new CloudPickler is - # created each time pickle.dump or pickle.dumps is called, functions - # also need to be saved within the same invokation of - # cloudpickle.dump/cloudpickle.dumps - # (for example: cloudpickle.dumps([f1, f2])). There - # is no such limitation when using Cloudpickler.dump, as long as the - # multiple invokations are bound to the same Cloudpickler. - base_globals = globals_ref.setdefault(id(func.__globals__), {}) - - if base_globals == {}: - # Add module attributes used to resolve relative imports - # instructions inside func. - for k in ["__package__", "__name__", "__path__", "__file__"]: - # Some built-in functions/methods such as object.__new__ have - # their __globals__ set to None in PyPy - if func.__globals__ is not None and k in func.__globals__: - base_globals[k] = func.__globals__[k] - - # Do not bind the free variables before the function is created to avoid - # infinite recursion. - if func.__closure__ is None: - closure = None - else: - closure = tuple(types.CellType() for _ in range(len(code.co_freevars))) - - return code, base_globals, None, None, closure - - def _class_getnewargs(obj): # On PyPy, __doc__ is a readonly attribute, so we need to include it in # the initial skeleton class. This is safe because we know that the @@ -153,7 +119,10 @@ def _function_getstate(func): if func.__closure__ is not None else () ) - # extract submodules referenced by attribute lookup (no global opcode) + # Extract submodules referenced by attribute lookup (no global opcode). + # Storing the loaded submodules in a smoke __submodule__ attribute of + # func.__globals__ allow these modules to be saved at pickling time, and + # thus imported when the function is unpickled. f_globals["__submodules__"] = _find_loaded_submodules( func.__code__, itertools.chain(f_globals.values(), closure_values)) slotstate["__globals__"] = f_globals @@ -184,7 +153,7 @@ def _class_getstate(obj): for k in obj.__slots__: clsdict.pop(k, None) - clsdict.pop('__dict__', None) # unpickleable property object + clsdict.pop('__dict__', None) # unpicklable property object return (clsdict, {}) @@ -209,16 +178,13 @@ def _enum_getstate(obj): # ------------------------------- # A reducer is a function taking a single argument (obj), and that returns a # tuple with all the necessary data to re-construct obj. Apart from a few -# exceptions (list, dicts, bytes, ints, etc.), a reducer is necessary to -# correclty pickle an object. +# exceptions (list, dict, bytes, int, etc.), a reducer is necessary to +# correctly pickle an object. # While many built-in objects (Exceptions objects, instances of the "object" # class, etc), are shipped with their own built-in reducer (invoked using # obj.__reduce__), some do not. The following methods were created to "fill # these holes". -# XXX: no itemgetter/attrgetter reducer support implemented as the tests seem -# to pass even without them - def _code_reduce(obj): """codeobject reducer""" if hasattr(obj, "co_posonlyargcount"): # pragma: no branch @@ -334,26 +300,6 @@ def _weakset_reduce(obj): return weakref.WeakSet, (list(obj),) -def _dynamic_function_reduce(func, globals_ref): - """Reduce a function that is not pickleable via attribute loookup. - """ - newargs = _function_getnewargs(func, globals_ref) - state = _function_getstate(func) - return types.FunctionType, newargs, state, None, None, _function_setstate - - -def _function_reduce(obj, globals_ref): - """Select the reducer depending on obj's dynamic nature - - This functions starts by replicating save_global: trying to retrieve obj - from an attribute lookup of a file-backed module. If this check fails, then - a custom reducer is called. - """ - if not _is_global(obj): - return _dynamic_function_reduce(obj, globals_ref) - return NotImplemented - - def _dynamic_class_reduce(obj): """ Save a class that can't be stored as module global. @@ -411,7 +357,7 @@ def _function_setstate(obj, state): obj_globals = slotstate.pop("__globals__") obj_closure = slotstate.pop("__closure__") - # remove uncessary references to submodules + # remove unnecessary references to submodules obj_globals.pop("__submodules__") obj.__globals__.update(obj_globals) obj.__globals__["__builtins__"] = __builtins__ @@ -446,9 +392,9 @@ def _class_setstate(obj, state): class CloudPickler(Pickler): """Fast C Pickler extension with additional reducing routines. - Cloudpickler's extensions exist into into: + CloudPickler's extensions exist into into: - * it's dispatch_table containing reducers that are called only if ALL + * its dispatch_table containing reducers that are called only if ALL built-in saving functions were previously discarded. * a special callback, invoked before standard function/class builtin-saving method (save_global), to serialize dynamic functions @@ -486,7 +432,7 @@ def __init__(self, file, protocol=None): # will likely not be known in advance, and thus cannot be special-cased # using an entry in the dispatch_table. - # The pickler's reducer_override, among other things, allows us to + # The Pickler's reducer_override, among other things, allows us to # register a reducer that will be called for any class, independently # of its type. self.proto = int(protocol) @@ -496,7 +442,7 @@ def reducer_override(self, obj): This function is the analog of a custom save_global. However, the C Pickler API does not expose low-level instructions such as save or - write. Instead, we return a reduce value the the Pickler will + write. Instead, we return a reduce value the Pickler will internally serialize via save_reduce. """ t = type(obj) @@ -508,11 +454,67 @@ def reducer_override(self, obj): if is_anyclass: return _class_reduce(obj) elif isinstance(obj, types.FunctionType): - return _function_reduce(obj, self.globals_ref) + return self._function_reduce(obj) else: - # fallback to save_global, including the pickler's distpatch_table + # fallback to save_global, including the Pickler's distpatch_table return NotImplemented + # function reducers are defined as instance methods of CloudPickler + # objects, as they rely on a CloudPickler attribute (globals_ref) + def _dynamic_function_reduce(self, func): + """Reduce a function that is not pickleable via attribute lookup. + """ + newargs = self._function_getnewargs(func) + state = _function_getstate(func) + return (types.FunctionType, newargs, state, None, None, + _function_setstate) + + def _function_reduce(self, obj): + """Select the reducer depending on obj's dynamic nature + + This functions starts by replicating save_global: trying to retrieve + obj from an attribute lookup of a file-backed module. If this check + fails, then a custom reducer is called. + """ + if not _is_global(obj): + return self._dynamic_function_reduce(obj) + return NotImplemented + + def _function_getnewargs(self, func): + code = func.__code__ + + # base_globals represents the future global namespace of func at + # unpickling time. Looking it up and storing it in + # CloudpiPickler.globals_ref allow functions sharing the same globals + # at pickling time to also share them once unpickled, at one condition: + # since globals_ref is an attribute of a CloudPickler instance, and + # that a new CloudPickler is created each time pickle.dump or + # pickle.dumps is called, functions also need to be saved within the + # same invocation of cloudpickle.dump/cloudpickle.dumps (for example: + # cloudpickle.dumps([f1, f2])). There is no such limitation when using + # CloudPickler.dump, as long as the multiple invocations are bound to + # the same CloudPickler. + base_globals = self.globals_ref.setdefault(id(func.__globals__), {}) + + if base_globals == {}: + # Add module attributes used to resolve relative imports + # instructions inside func. + for k in ["__package__", "__name__", "__path__", "__file__"]: + # Some built-in functions/methods such as object.__new__ have + # their __globals__ set to None in PyPy + if func.__globals__ is not None and k in func.__globals__: + base_globals[k] = func.__globals__[k] + + # Do not bind the free variables before the function is created to + # avoid infinite recursion. + if func.__closure__ is None: + closure = None + else: + closure = tuple( + types.CellType() for _ in range(len(code.co_freevars))) + + return code, base_globals, None, None, closure + def dump(self, obj): try: return Pickler.dump(self, obj) From 0bdb4bc15f86604a0fdf60dd5e165a03c38b5c51 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 5 Jun 2019 11:33:16 +0200 Subject: [PATCH 44/70] CI test python3.8-dev version --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 2ea0971df..b20f45b96 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,6 +18,8 @@ matrix: python: 3.7 - os: linux python: 3.7 + - os: linux + python: 3.8-dev - os: linux python: 3.6 - os: linux From ac1d05ce4488db3a95357f3a2517f0ebb38540b3 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Wed, 5 Jun 2019 11:56:48 +0200 Subject: [PATCH 45/70] CI test against python nighlty on every commit --- .travis.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index b20f45b96..cc59f015d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,13 +13,10 @@ matrix: dist: trusty python: "pypy3" - os: linux - if: commit_message =~ /(\[ci python-nightly\])/ env: PYTHON_NIGHTLY=1 python: 3.7 - os: linux python: 3.7 - - os: linux - python: 3.8-dev - os: linux python: 3.6 - os: linux From 968f7696515789a29751f1460133b189329d3c8d Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 6 Jun 2019 15:01:11 +0200 Subject: [PATCH 46/70] MNT DOC explain WeakKeyDictionary guard in PyPy --- cloudpickle/cloudpickle.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index db35a215d..ca719e227 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -102,6 +102,15 @@ PY2 = False from importlib._bootstrap import _find_spec +# XXX: This cache cannot be a WeakKeyDictionary, because sometimes, cloudpickle +# misclassifies a builtin pypy function as dynamic, and thus tries to extract +# the globals of its underlying builtin-code. However, builtin-code objects +# cannot be weak-referenced (hence the if-else clause below). +# Note that the root cause of cloudpickle misclassification of builtin +# functions is PyPy flaky support of __qualname__ attributes in v3.5. This +# guard can be removed by either spotting more proactively builtin pypy +# functions before trying to save them as dynamic, or simply after support for +# pypy3.5 is dropped. _extract_code_globals_cache = ( weakref.WeakKeyDictionary() if not hasattr(sys, "pypy_version_info") From db8b2c5c9b59855512ca07b88871810ef268a3c0 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 6 Jun 2019 16:55:43 +0200 Subject: [PATCH 47/70] FIX pre-populate dispatch with copyreg_dispatch_table --- cloudpickle/cloudpickle_fast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 24b72d6f8..aa2f67bbd 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -7,6 +7,7 @@ code is also removed. """ import abc +import copyreg import io import itertools import logging @@ -400,7 +401,7 @@ class CloudPickler(Pickler): builtin-saving method (save_global), to serialize dynamic functions """ - dispatch = {} + dispatch = copyreg.dispatch_table.copy() dispatch[classmethod] = _classmethod_reduce dispatch[io.TextIOWrapper] = _file_reduce dispatch[logging.Logger] = _logger_reduce From c276d838f955b8d37d268e70f26f16aac3f639b1 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 6 Jun 2019 17:15:55 +0200 Subject: [PATCH 48/70] FIX more robust alternative + comments --- cloudpickle/cloudpickle_fast.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index aa2f67bbd..706307b39 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -401,7 +401,10 @@ class CloudPickler(Pickler): builtin-saving method (save_global), to serialize dynamic functions """ - dispatch = copyreg.dispatch_table.copy() + # cloudpickle's own dispatch_table, containing the additional set of + # objects (compared to the standard library pickle) that cloupickle can + # serialize. + dispatch = {} dispatch[classmethod] = _classmethod_reduce dispatch[io.TextIOWrapper] = _file_reduce dispatch[logging.Logger] = _logger_reduce @@ -423,7 +426,12 @@ def __init__(self, file, protocol=None): # sharing the same global namespace at pickling time also share their # global namespace at unpickling time. self.globals_ref = {} - self.dispatch_table = self.dispatch + + # Take into account potential custom reducers registered by external + # modules + self.dispatch_table = copyreg.dispatch_table.copy() + + self.dispatch_table.update(self.dispatch) # Pickling functions and classes cannot be customized using the # dispatch_table: indeed, pickling an object using the dispatch_table From c50339b0a23ae58c773e30bbb3d26de66a71997c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 6 Jun 2019 17:40:15 +0200 Subject: [PATCH 49/70] MAINT add numpy master to python nightly ci --- .travis.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index cc59f015d..27e65b301 100644 --- a/.travis.yml +++ b/.travis.yml @@ -91,8 +91,13 @@ install: - $PYTHON_EXE -m pip install . - $PYTHON_EXE -m pip install --upgrade -r dev-requirements.txt - $PYTHON_EXE -m pip install tornado - - if [[ $TRAVIS_PYTHON_VERSION != 'pypy'* && "$PYTHON_NIGHTLY" != 1 ]]; then + - if [[ $TRAVIS_PYTHON_VERSION != 'pypy'* ]]; then + if [[ "$PYTHON_NIGHTLY" == 1 ]]; then + # Install the master version of numpy (with the master branch of cython) + $PYTHON_EXE -m pip install git+https://github.com/cython/cython git+https://github.com/numpy/numpy; + else $PYTHON_EXE -m pip install numpy scipy; + fi fi - if [[ $PROJECT != "" ]]; then $PYTHON_EXE -m pip install $TEST_REQUIREMENTS; From 4c3c05f377770866fec6b3d243fbbd202a9c8e4b Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 6 Jun 2019 17:55:05 +0200 Subject: [PATCH 50/70] Fix equality test? --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 27e65b301..1c72769b8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -92,7 +92,7 @@ install: - $PYTHON_EXE -m pip install --upgrade -r dev-requirements.txt - $PYTHON_EXE -m pip install tornado - if [[ $TRAVIS_PYTHON_VERSION != 'pypy'* ]]; then - if [[ "$PYTHON_NIGHTLY" == 1 ]]; then + if [[ "$PYTHON_NIGHTLY" == "1" ]]; then # Install the master version of numpy (with the master branch of cython) $PYTHON_EXE -m pip install git+https://github.com/cython/cython git+https://github.com/numpy/numpy; else From a7125576562b307cc96f1f3a50f8344e326b5a2d Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 6 Jun 2019 18:03:30 +0200 Subject: [PATCH 51/70] Remove comment that breaks yaml parsing --- .travis.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 1c72769b8..dc0fb07d4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -92,12 +92,11 @@ install: - $PYTHON_EXE -m pip install --upgrade -r dev-requirements.txt - $PYTHON_EXE -m pip install tornado - if [[ $TRAVIS_PYTHON_VERSION != 'pypy'* ]]; then - if [[ "$PYTHON_NIGHTLY" == "1" ]]; then - # Install the master version of numpy (with the master branch of cython) - $PYTHON_EXE -m pip install git+https://github.com/cython/cython git+https://github.com/numpy/numpy; - else - $PYTHON_EXE -m pip install numpy scipy; - fi + if [[ "$PYTHON_NIGHTLY" == "1" ]]; then + $PYTHON_EXE -m pip install git+https://github.com/cython/cython git+https://github.com/numpy/numpy; + else + $PYTHON_EXE -m pip install numpy scipy; + fi fi - if [[ $PROJECT != "" ]]; then $PYTHON_EXE -m pip install $TEST_REQUIREMENTS; From 5f6defe59f011da65c820b5e904235077307401f Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Thu, 6 Jun 2019 21:17:52 +0200 Subject: [PATCH 52/70] CLN rebase with #278 --- cloudpickle/cloudpickle.py | 68 +++++++-------------------------- cloudpickle/cloudpickle_fast.py | 7 +++- tests/cloudpickle_test.py | 2 +- 3 files changed, 20 insertions(+), 57 deletions(-) diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index ca719e227..14f799eeb 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -102,20 +102,7 @@ PY2 = False from importlib._bootstrap import _find_spec -# XXX: This cache cannot be a WeakKeyDictionary, because sometimes, cloudpickle -# misclassifies a builtin pypy function as dynamic, and thus tries to extract -# the globals of its underlying builtin-code. However, builtin-code objects -# cannot be weak-referenced (hence the if-else clause below). -# Note that the root cause of cloudpickle misclassification of builtin -# functions is PyPy flaky support of __qualname__ attributes in v3.5. This -# guard can be removed by either spotting more proactively builtin pypy -# functions before trying to save them as dynamic, or simply after support for -# pypy3.5 is dropped. -_extract_code_globals_cache = ( - weakref.WeakKeyDictionary() - if not hasattr(sys, "pypy_version_info") - else {}) - +_extract_code_globals_cache = weakref.WeakKeyDictionary() def _ensure_tracking(class_def): @@ -216,24 +203,19 @@ def _extract_code_globals(co): """ out_names = _extract_code_globals_cache.get(co) if out_names is None: - try: - names = co.co_names - except AttributeError: - # PyPy "builtin-code" object - out_names = set() - else: - out_names = {names[oparg] for _, oparg in _walk_global_ops(co)} - - # Declaring a function inside another one using the "def ..." - # syntax generates a constant code object corresonding to the one - # of the nested function's As the nested function may itself need - # global variables, we need to introspect its code, extract its - # globals, (look for code object in it's co_consts attribute..) and - # add the result to code_globals - if co.co_consts: - for const in co.co_consts: - if isinstance(const, types.CodeType): - out_names |= _extract_code_globals(const) + names = co.co_names + out_names = {names[oparg] for _, oparg in _walk_global_ops(co)} + + # Declaring a function inside another one using the "def ..." + # syntax generates a constant code object corresonding to the one + # of the nested function's As the nested function may itself need + # global variables, we need to introspect its code, extract its + # globals, (look for code object in it's co_consts attribute..) and + # add the result to code_globals + if co.co_consts: + for const in co.co_consts: + if isinstance(const, types.CodeType): + out_names |= _extract_code_globals(const) _extract_code_globals_cache[co] = out_names @@ -803,28 +785,6 @@ def save_function_tuple(self, func): write(pickle.TUPLE) write(pickle.REDUCE) # applies _fill_function on the tuple - _extract_code_globals_cache = weakref.WeakKeyDictionary() - - @classmethod - def extract_code_globals(cls, co): - """ - Find all globals names read or written to by codeblock co - """ - out_names = cls._extract_code_globals_cache.get(co) - if out_names is None: - names = co.co_names - out_names = {names[oparg] for _, oparg in _walk_global_ops(co)} - - # see if nested function have any global refs - if co.co_consts: - for const in co.co_consts: - if isinstance(const, types.CodeType): - out_names |= cls.extract_code_globals(const) - - cls._extract_code_globals_cache[co] = out_names - - return out_names - def extract_func_data(self, func): """ Turn the function into a tuple of data necessary to recreate it: diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 706307b39..4af0c3946 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -485,9 +485,12 @@ def _function_reduce(self, obj): obj from an attribute lookup of a file-backed module. If this check fails, then a custom reducer is called. """ - if not _is_global(obj): + # There no special handling for builtin pypy functions like in + # cloudpickle.py because cloudpickle_fast is CPython-specific. + if _is_global(obj): + return NotImplemented + else: return self._dynamic_function_reduce(obj) - return NotImplemented def _function_getnewargs(self, func): code = func.__code__ diff --git a/tests/cloudpickle_test.py b/tests/cloudpickle_test.py index 32e58af87..b23c8c0f8 100644 --- a/tests/cloudpickle_test.py +++ b/tests/cloudpickle_test.py @@ -1829,7 +1829,7 @@ def inner_function(): return _TEST_GLOBAL_VARIABLE return inner_function - globals_ = cloudpickle.CloudPickler.extract_code_globals( + globals_ = cloudpickle.cloudpickle._extract_code_globals( function_factory.__code__) assert globals_ == {'_TEST_GLOBAL_VARIABLE'} From a3bb79affd4a1f419a474cf89dce38d27a70d99b Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 10:23:44 +0200 Subject: [PATCH 53/70] Update cloudpickle/cloudpickle_fast.py Co-Authored-By: Olivier Grisel --- cloudpickle/cloudpickle_fast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 4af0c3946..5565a60f5 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -397,7 +397,7 @@ class CloudPickler(Pickler): * its dispatch_table containing reducers that are called only if ALL built-in saving functions were previously discarded. - * a special callback, invoked before standard function/class + * a special callback named "reducer_override", invoked before standard function/class builtin-saving method (save_global), to serialize dynamic functions """ From e90d45d32b4ece9ab4d71d0aeeb9ecab6b251137 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 11:36:45 +0200 Subject: [PATCH 54/70] DOC better reducer_override comment --- cloudpickle/cloudpickle_fast.py | 45 ++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 5565a60f5..05f414588 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -433,26 +433,37 @@ def __init__(self, file, protocol=None): self.dispatch_table.update(self.dispatch) - # Pickling functions and classes cannot be customized using the - # dispatch_table: indeed, pickling an object using the dispatch_table - # works by invoking a reducer specific to the object's type. When the - # object is a class, its type is often ``type``, except when the class - # is an instance of another metaclass. In this cased, the metaclass - # will likely not be known in advance, and thus cannot be special-cased - # using an entry in the dispatch_table. - - # The Pickler's reducer_override, among other things, allows us to - # register a reducer that will be called for any class, independently - # of its type. self.proto = int(protocol) def reducer_override(self, obj): - """Custom reducing callback for functions and classes - - This function is the analog of a custom save_global. However, the C - Pickler API does not expose low-level instructions such as save or - write. Instead, we return a reduce value the Pickler will - internally serialize via save_reduce. + """Type-agnostic reducing callback for function and classes. + + For performance reasons, subclasses of the C _pickle.Pickler class + cannot register custom reducers for functions and classes in the + dispatch_table. Reducer for such types must instead implemented in the + special reducer_override method. + + Note that method will be called for any object except a few + builtin-types (int, lists, dicts etc.), which differs from reducers in + the Pickler's dispatch_table, each of them being invoked for objects of + a specific type only. + + This property comes in handy for classes: although most classes are + instances of the ``type`` metaclass, some of them can be instances of + other custom metaclasses (such as enum.EnumMeta for example). In + particular, the metaclass will likely not be known in advance, and thus + cannot be special-cased using an entry in the dispatch_table. + reducer_override, among other things, allows us to register a reducer + that will be called for any class, independently of its type. + + + Notes: + + - reducer_override has the priority over dispatch_table-registered + reducers. + - reducer_override can be use to fix other limitations of cloudpickle + for other types that suffered from type-specific reducers, such as + Exceptions. See https://github.com/cloudpipe/cloudpickle/issues/248 """ t = type(obj) try: From 3b879044a2e4d46350baf9e77a39da63355d4ae7 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 11:48:01 +0200 Subject: [PATCH 55/70] DOC, CLN clearer comments and names --- cloudpickle/cloudpickle.py | 4 ++-- cloudpickle/cloudpickle_fast.py | 22 ++++++++++++---------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index 14f799eeb..f819d2f1c 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -222,7 +222,7 @@ def _extract_code_globals(co): return out_names -def _find_loaded_submodules(code, top_level_dependencies): +def _find_imported_submodules(code, top_level_dependencies): """ Save submodules used by a function but not listed in its globals. In the example below: @@ -744,7 +744,7 @@ def save_function_tuple(self, func): save(_fill_function) # skeleton function updater write(pickle.MARK) # beginning of tuple that _fill_function expects - submodules = _find_loaded_submodules( + submodules = _find_imported_submodules( code, itertools.chain(f_globals.values(), closure_values or ()), ) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 05f414588..5be2e27e1 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -21,7 +21,7 @@ from .cloudpickle import ( _is_dynamic, _extract_code_globals, _BUILTIN_TYPE_NAMES, DEFAULT_PROTOCOL, - _find_loaded_submodules, _get_cell_contents, _is_global, _builtin_type, + _find_imported_submodules, _get_cell_contents, _is_global, _builtin_type, Enum, _ensure_tracking, _lookup_class_or_track, _make_skeleton_class, _make_skeleton_enum, _extract_class_dict, string_types, dynamic_subimport, subimport @@ -120,11 +120,11 @@ def _function_getstate(func): if func.__closure__ is not None else () ) - # Extract submodules referenced by attribute lookup (no global opcode). - # Storing the loaded submodules in a smoke __submodule__ attribute of - # func.__globals__ allow these modules to be saved at pickling time, and - # thus imported when the function is unpickled. - f_globals["__submodules__"] = _find_loaded_submodules( + # Extract currently-imported submodules used by func. Storing these modules + # in a smoke _cloudpickle_subimports attribute of the object's state will + # trigger the side effect of importing these modules at unpickling time + # (which is necessary for func to work correctly once depickled) + slotstate["_cloudpickle_submodules"] = _find_imported_submodules( func.__code__, itertools.chain(f_globals.values(), closure_values)) slotstate["__globals__"] = f_globals @@ -357,9 +357,13 @@ def _function_setstate(obj, state): obj_globals = slotstate.pop("__globals__") obj_closure = slotstate.pop("__closure__") + # _cloudpickle_subimports is a set of submodules that must be loaded for + # the pickled function to work correctly at unpickling time. Now that these + # submodules are depickled (hence imported), they can be removed from the + # object's state (the object state only served as a reference holder to + # these submodules) + slotstate.pop("_cloudpickle_submodules") - # remove unnecessary references to submodules - obj_globals.pop("__submodules__") obj.__globals__.update(obj_globals) obj.__globals__["__builtins__"] = __builtins__ @@ -430,9 +434,7 @@ def __init__(self, file, protocol=None): # Take into account potential custom reducers registered by external # modules self.dispatch_table = copyreg.dispatch_table.copy() - self.dispatch_table.update(self.dispatch) - self.proto = int(protocol) def reducer_override(self, obj): From 042e8e4594b5f3baf09464d72d6392fd7d7f0a5f Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 12:09:56 +0200 Subject: [PATCH 56/70] DOC better find_imported_submodules docstring --- cloudpickle/cloudpickle.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index f819d2f1c..95aea8eab 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -224,8 +224,15 @@ def _extract_code_globals(co): def _find_imported_submodules(code, top_level_dependencies): """ - Save submodules used by a function but not listed in its globals. - In the example below: + Find currently imported submodules used by a function. + + Submodules used by a function need to be detected and referenced for the + function to work correctly at depickling time. Because submodules can be + referenced as attribute of their parent package (``package.submodule``), we + need a special introspection technique that does not rely on GLOBAL-related + opcodes to find references of them in a code object. + + Example: ``` import concurrent.futures import cloudpickle @@ -234,13 +241,12 @@ def func(): if __name__ == '__main__': cloudpickle.dumps(func) ``` - the globals extracted by cloudpickle in the function's state include - the concurrent package, but not its submodule (here, - concurrent.futures), which is the module used by func. - To ensure that calling the depickled function does not raise an - AttributeError, this function looks for any currently loaded submodule - that the function uses and whose parent is present in the function - globals, and saves it before saving the function. + The globals extracted by cloudpickle in the function's state include the + concurrent package, but not its submodule (here, concurrent.futures), which + is the module used by func. Find_imported_submodules will detect the usage + of concurrent.futures. Saving this module alongside with func will ensure + that calling func once depickled does not fail due to concurrent.futures + not being imported """ subimports = [] From 71f3c0973242c1917fca5f63f1363066cac9c9bf Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 13:56:21 +0200 Subject: [PATCH 57/70] TST test reference cycle error --- tests/cloudpickle_test.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/cloudpickle_test.py b/tests/cloudpickle_test.py index b23c8c0f8..52da47ac3 100644 --- a/tests/cloudpickle_test.py +++ b/tests/cloudpickle_test.py @@ -1838,6 +1838,25 @@ def inner_function(): inner_func = depickled_factory() assert inner_func() == _TEST_GLOBAL_VARIABLE + def test_recursion_during_pickling(self): + class A: + def __init__(self, some_attribute): + self.some_attribute = some_attribute + + def __reduce__(self): + # Make some_attribute an initarg instead of a state item. This + # makes the reducer unsafe with regards to reference cycles as + # cloudpickle will try to some_attribute before self is + # memoized. + return A, (self.some_attribute, ), {} + + a = A(None) + + # generate a reference cycle + a.some_attribute = a + with pytest.raises(pickle.PicklingError, match='recursion'): + cloudpickle.dumps(a) + class Protocol2CloudPickleTest(CloudPickleTest): From 8219e371d02ab89fc5d9649c8afc44505f85c76d Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 14:05:04 +0200 Subject: [PATCH 58/70] CLN cleaner __init__ --- cloudpickle/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cloudpickle/__init__.py b/cloudpickle/__init__.py index 3debb3cdf..2909cebb0 100644 --- a/cloudpickle/__init__.py +++ b/cloudpickle/__init__.py @@ -4,9 +4,8 @@ import pickle +from cloudpickle.cloudpickle import * if sys.version_info[:2] >= (3, 8): - from cloudpickle.cloudpickle_fast import * -else: - from cloudpickle.cloudpickle import * + from cloudpickle.cloudpickle_fast import CloudPickler, dumps, dump __version__ = '1.2.0.dev0' From b4a8bdf8c90abd3b75d0be65f67487dd851e21c2 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 14:06:34 +0200 Subject: [PATCH 59/70] MNT drop support for 3.8 alpha releases --- cloudpickle/cloudpickle_fast.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 5be2e27e1..31a099127 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -188,23 +188,14 @@ def _enum_getstate(obj): def _code_reduce(obj): """codeobject reducer""" - if hasattr(obj, "co_posonlyargcount"): # pragma: no branch - args = ( - obj.co_argcount, obj.co_posonlyargcount, - obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, - obj.co_flags, obj.co_code, obj.co_consts, obj.co_names, - obj.co_varnames, obj.co_filename, obj.co_name, - obj.co_firstlineno, obj.co_lnotab, obj.co_freevars, - obj.co_cellvars - ) - else: - args = ( - obj.co_argcount, obj.co_kwonlyargcount, obj.co_nlocals, - obj.co_stacksize, obj.co_flags, obj.co_code, obj.co_consts, - obj.co_names, obj.co_varnames, obj.co_filename, - obj.co_name, obj.co_firstlineno, obj.co_lnotab, - obj.co_freevars, obj.co_cellvars - ) + args = ( + obj.co_argcount, obj.co_posonlyargcount, + obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, + obj.co_flags, obj.co_code, obj.co_consts, obj.co_names, + obj.co_varnames, obj.co_filename, obj.co_name, + obj.co_firstlineno, obj.co_lnotab, obj.co_freevars, + obj.co_cellvars + ) return types.CodeType, args From 3db1d3dea9f9bf01e5b0eccc7b8128e4a97a153e Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 14:24:35 +0200 Subject: [PATCH 60/70] TST cross-version recursion test --- cloudpickle/cloudpickle.py | 4 ++-- tests/cloudpickle_test.py | 16 +++------------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index 95aea8eab..6a8abc044 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -679,8 +679,8 @@ def save_dynamic_class(self, obj): # If type overrides __dict__ as a property, include it in the type # kwargs. In Python 2, we can't set this attribute after construction. __dict__ = clsdict.pop('__dict__', None) - if isinstance(__dict__, property): - type_kwargs['__dict__'] = __dict__ + # if isinstance(__dict__, property): + # type_kwargs['__dict__'] = __dict__ save = self.save write = self.write diff --git a/tests/cloudpickle_test.py b/tests/cloudpickle_test.py index 52da47ac3..615b6977b 100644 --- a/tests/cloudpickle_test.py +++ b/tests/cloudpickle_test.py @@ -1840,20 +1840,10 @@ def inner_function(): def test_recursion_during_pickling(self): class A: - def __init__(self, some_attribute): - self.some_attribute = some_attribute + def __getattr__(self, name): + return getattr(self, name) - def __reduce__(self): - # Make some_attribute an initarg instead of a state item. This - # makes the reducer unsafe with regards to reference cycles as - # cloudpickle will try to some_attribute before self is - # memoized. - return A, (self.some_attribute, ), {} - - a = A(None) - - # generate a reference cycle - a.some_attribute = a + a = A() with pytest.raises(pickle.PicklingError, match='recursion'): cloudpickle.dumps(a) From 8be8e723a6194711aad45065c1237ae4f7db037b Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 14:25:52 +0200 Subject: [PATCH 61/70] FIX fix spurious debugging attempts --- cloudpickle/cloudpickle.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index 6a8abc044..95aea8eab 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -679,8 +679,8 @@ def save_dynamic_class(self, obj): # If type overrides __dict__ as a property, include it in the type # kwargs. In Python 2, we can't set this attribute after construction. __dict__ = clsdict.pop('__dict__', None) - # if isinstance(__dict__, property): - # type_kwargs['__dict__'] = __dict__ + if isinstance(__dict__, property): + type_kwargs['__dict__'] = __dict__ save = self.save write = self.write From 3290de2acf730c3f3aa4276a086a27a65dbb10a3 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 15:24:58 +0200 Subject: [PATCH 62/70] CLN drop old save_subimport method in cloudpickle --- cloudpickle/cloudpickle.py | 67 ++++++++------------------------------ 1 file changed, 13 insertions(+), 54 deletions(-) diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index 95aea8eab..44ace512b 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -567,54 +567,6 @@ def save_pypy_builtin_func(self, obj): obj.__dict__) self.save_reduce(*rv, obj=obj) - - def _save_subimports(self, code, top_level_dependencies): - """ - Save submodules used by a function but not listed in its globals. - - In the example below: - - ``` - import concurrent.futures - import cloudpickle - - - def func(): - x = concurrent.futures.ThreadPoolExecutor - - - if __name__ == '__main__': - cloudpickle.dumps(func) - ``` - - the globals extracted by cloudpickle in the function's state include - the concurrent module, but not its submodule (here, - concurrent.futures), which is the module used by func. - - To ensure that calling the depickled function does not raise an - AttributeError, this function looks for any currently loaded submodule - that the function uses and whose parent is present in the function - globals, and saves it before saving the function. - """ - - # check if any known dependency is an imported package - for x in top_level_dependencies: - if isinstance(x, types.ModuleType) and hasattr(x, '__package__') and x.__package__: - # check if the package has any currently loaded sub-imports - prefix = x.__name__ + '.' - # A concurrent thread could mutate sys.modules, - # make sure we iterate over a copy to avoid exceptions - for name in list(sys.modules): - # Older versions of pytest will add a "None" module to sys.modules. - if name is not None and name.startswith(prefix): - # check whether the function can address the sub-module - tokens = set(name[len(prefix):].split('.')) - if not tokens - set(code.co_names): - # ensure unpickler executes this import - self.save(sys.modules[name]) - # then discards the reference to it - self.write(pickle.POP) - def _save_dynamic_enum(self, obj, clsdict): """Special handling for dynamic Enum subclasses @@ -750,16 +702,15 @@ def save_function_tuple(self, func): save(_fill_function) # skeleton function updater write(pickle.MARK) # beginning of tuple that _fill_function expects + # Extract currently-imported submodules used by func. Storing these + # modules in a smoke _cloudpickle_subimports attribute of the object's + # state will trigger the side effect of importing these modules at + # unpickling time (which is necessary for func to work correctly once + # depickled) submodules = _find_imported_submodules( code, itertools.chain(f_globals.values(), closure_values or ()), ) - for s in submodules: - # ensure that subimport s is loaded at unpickling time - self.save(s) - # then discards the reference to it - self.write(pickle.POP) - # create a skeleton function object and memoize it save(_make_skel_func) @@ -780,6 +731,7 @@ def save_function_tuple(self, func): 'module': func.__module__, 'name': func.__name__, 'doc': func.__doc__, + '_cloudpickle_submodules': submodules } if hasattr(func, '__annotations__') and sys.version_info >= (3, 4): state['annotations'] = func.__annotations__ @@ -1260,6 +1212,13 @@ def _fill_function(*args): func.__qualname__ = state['qualname'] if 'kwdefaults' in state: func.__kwdefaults__ = state['kwdefaults'] + # _cloudpickle_subimports is a set of submodules that must be loaded for + # the pickled function to work correctly at unpickling time. Now that these + # submodules are depickled (hence imported), they can be removed from the + # object's state (the object state only served as a reference holder to + # these submodules) + if '_cloudpickle_submodules' in state: + state.pop('_cloudpickle_submodules') cells = func.__closure__ if cells is not None: From 14190736e1d672dc0cec8c35a34c2b9943d31b58 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 15:41:27 +0200 Subject: [PATCH 63/70] MNT changelog --- CHANGES.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 14d989501..7f393af00 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,11 @@ 1.2.0 ===== +- Leverage the C-accelerated Pickler new subclassing API (available in Python + 3.8) in cloudpickle. This allows cloudpickle to pickle Python objects up to + 30 times faster. + ([issue #253](https://github.com/cloudpipe/cloudpickle/pull/253)) + - Support pickling of classmethod and staticmethod objects in python2. arguments. ([issue #262](https://github.com/cloudpipe/cloudpickle/pull/262)) From e9425ce77216ba06ca27bc5123d60f211d851ed2 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 15:49:11 +0200 Subject: [PATCH 64/70] CLN remove PyPy-specific code in cloudpickle_fast --- cloudpickle/cloudpickle_fast.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 31a099127..2b53adcfe 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -5,6 +5,10 @@ previous python implementation of the Pickler class. Because this functionality is only available for python versions 3.8+, a lot of backward-compatibility code is also removed. + +Note that the C Pickler sublassing API is CPython-specific. Therefore, some +guards present in cloudpickle.py that were written to handle PyPy specificities +are not present in cloudpickle_fast.py """ import abc import copyreg @@ -63,11 +67,7 @@ def dumps(obj, protocol=None): # ------------------------------------------------- def _class_getnewargs(obj): - # On PyPy, __doc__ is a readonly attribute, so we need to include it in - # the initial skeleton class. This is safe because we know that the - # doc can't participate in a cycle with the original class. - type_kwargs = {'__doc__': obj.__dict__.get('__doc__', None)} - + type_kwargs = {} if hasattr(obj, "__slots__"): type_kwargs["__slots__"] = obj.__slots__ @@ -135,7 +135,6 @@ def _function_getstate(func): def _class_getstate(obj): clsdict = _extract_class_dict(obj) clsdict.pop('__weakref__', None) - clsdict.pop('__doc__', None) # present in the reconstructor args # For ABCMeta in python3.7+, remove _abc_impl as it is not picklable. # This is a fix which breaks the cache but this only makes the first @@ -516,9 +515,7 @@ def _function_getnewargs(self, func): # Add module attributes used to resolve relative imports # instructions inside func. for k in ["__package__", "__name__", "__path__", "__file__"]: - # Some built-in functions/methods such as object.__new__ have - # their __globals__ set to None in PyPy - if func.__globals__ is not None and k in func.__globals__: + if k in func.__globals__: base_globals[k] = func.__globals__[k] # Do not bind the free variables before the function is created to From 80246a95dc10241533182bf1a24211f8b7e569a2 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 15:58:47 +0200 Subject: [PATCH 65/70] CLN fix flake8 complains --- cloudpickle/cloudpickle_fast.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 2b53adcfe..4d368962d 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -2,8 +2,8 @@ New, fast version of the CloudPickler. This new CloudPickler class can now extend the fast C Pickler instead of the -previous python implementation of the Pickler class. Because this functionality -is only available for python versions 3.8+, a lot of backward-compatibility +previous Python implementation of the Pickler class. Because this functionality +is only available for Python versions 3.8+, a lot of backward-compatibility code is also removed. Note that the C Pickler sublassing API is CPython-specific. Therefore, some @@ -26,13 +26,13 @@ from .cloudpickle import ( _is_dynamic, _extract_code_globals, _BUILTIN_TYPE_NAMES, DEFAULT_PROTOCOL, _find_imported_submodules, _get_cell_contents, _is_global, _builtin_type, - Enum, _ensure_tracking, _lookup_class_or_track, _make_skeleton_class, - _make_skeleton_enum, _extract_class_dict, string_types, dynamic_subimport, - subimport + Enum, _ensure_tracking, _make_skeleton_class, _make_skeleton_enum, + _extract_class_dict, string_types, dynamic_subimport, subimport ) load, loads = _pickle.load, _pickle.loads + # Shorthands similar to pickle.dump/pickle.dumps def dump(obj, file, protocol=None): """Serialize obj as bytes streamed into file @@ -391,8 +391,9 @@ class CloudPickler(Pickler): * its dispatch_table containing reducers that are called only if ALL built-in saving functions were previously discarded. - * a special callback named "reducer_override", invoked before standard function/class - builtin-saving method (save_global), to serialize dynamic functions + * a special callback named "reducer_override", invoked before standard + function/class builtin-saving method (save_global), to serialize dynamic + functions """ # cloudpickle's own dispatch_table, containing the additional set of From 4275d10abe3ca4008675136851469a4f14410e4d Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 16:15:22 +0200 Subject: [PATCH 66/70] CLN clearer doc --- cloudpickle/cloudpickle_fast.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 4d368962d..1488f59f9 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -483,14 +483,17 @@ def _dynamic_function_reduce(self, func): _function_setstate) def _function_reduce(self, obj): - """Select the reducer depending on obj's dynamic nature + """Reducer for function objects. - This functions starts by replicating save_global: trying to retrieve - obj from an attribute lookup of a file-backed module. If this check - fails, then a custom reducer is called. + If obj is a top-level attribute of a file-backed module, this + reducer returns NotImplemented, making the CloudPickler fallback to + traditional _pickle.Pickler routines to save obj. Otherwise, it reduces + obj using a custom cloudpickle reducer designed specifically to handle + dynamic functions. + + As opposed to cloudpickle.py, There no special handling for builtin + pypy functions because cloudpickle_fast is CPython-specific. """ - # There no special handling for builtin pypy functions like in - # cloudpickle.py because cloudpickle_fast is CPython-specific. if _is_global(obj): return NotImplemented else: From 853bd3135776570136f433139c3dc2eca47e58f3 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 16:16:12 +0200 Subject: [PATCH 67/70] CLN cosmetics --- cloudpickle/cloudpickle_fast.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 1488f59f9..b2377ac58 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -475,8 +475,7 @@ def reducer_override(self, obj): # function reducers are defined as instance methods of CloudPickler # objects, as they rely on a CloudPickler attribute (globals_ref) def _dynamic_function_reduce(self, func): - """Reduce a function that is not pickleable via attribute lookup. - """ + """Reduce a function that is not pickleable via attribute lookup.""" newargs = self._function_getnewargs(func) state = _function_getstate(func) return (types.FunctionType, newargs, state, None, None, From 021a6b19e944678c41d7eb2e88a2cee603d810cb Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 16:17:00 +0200 Subject: [PATCH 68/70] CLN cosmetics (2) --- cloudpickle/cloudpickle_fast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index b2377ac58..2dbfb3ec3 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -452,9 +452,9 @@ def reducer_override(self, obj): Notes: - - reducer_override has the priority over dispatch_table-registered + * reducer_override has the priority over dispatch_table-registered reducers. - - reducer_override can be use to fix other limitations of cloudpickle + * reducer_override can be use to fix other limitations of cloudpickle for other types that suffered from type-specific reducers, such as Exceptions. See https://github.com/cloudpipe/cloudpickle/issues/248 """ From f27427a0565a1ffdc4e8d4053e61ad8a789fd932 Mon Sep 17 00:00:00 2001 From: Pierre Glaser Date: Fri, 7 Jun 2019 16:20:56 +0200 Subject: [PATCH 69/70] CLN stale comment --- cloudpickle/cloudpickle_fast.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 2dbfb3ec3..0caca2592 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -313,9 +313,6 @@ def _dynamic_class_reduce(obj): def _class_reduce(obj): """Select the reducer depending on the dynamic nature of the class obj""" - # XXX: there used to be special handling for NoneType, EllipsisType and - # NotImplementedType. As for now this module handles only python3.8+, this - # code has been removed. if obj is type(None): # noqa return type, (None,) elif obj is type(Ellipsis): From 21726eb6eb175f296815576161196c60dc54cb8f Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 7 Jun 2019 18:26:06 +0200 Subject: [PATCH 70/70] typo --- cloudpickle/cloudpickle_fast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudpickle/cloudpickle_fast.py b/cloudpickle/cloudpickle_fast.py index 0caca2592..54421edb8 100644 --- a/cloudpickle/cloudpickle_fast.py +++ b/cloudpickle/cloudpickle_fast.py @@ -451,7 +451,7 @@ def reducer_override(self, obj): * reducer_override has the priority over dispatch_table-registered reducers. - * reducer_override can be use to fix other limitations of cloudpickle + * reducer_override can be used to fix other limitations of cloudpickle for other types that suffered from type-specific reducers, such as Exceptions. See https://github.com/cloudpipe/cloudpickle/issues/248 """