diff --git a/docs/source/python/api/ipc.rst b/docs/source/python/api/ipc.rst
index f4c0a17dba9..cc3ccfe40bc 100644
--- a/docs/source/python/api/ipc.rst
+++ b/docs/source/python/api/ipc.rst
@@ -48,6 +48,13 @@ Inter-Process Communication
 Serialization
 -------------
 
+.. warning::
+
+    The serialization functionality is deprecated in pyarrow 2.0, and will
+    be removed in a future version. Use the standard library ``pickle`` or
+    the IPC functionality of pyarrow (see :ref:`ipc`).
+
+
 .. autosummary::
    :toctree: ../generated/
 
diff --git a/docs/source/python/ipc.rst b/docs/source/python/ipc.rst
index b7a032d1342..5eeedbdae89 100644
--- a/docs/source/python/ipc.rst
+++ b/docs/source/python/ipc.rst
@@ -157,17 +157,25 @@ DataFrame output:
 Arbitrary Object Serialization
 ------------------------------
 
+.. warning::
+
+    The custom serialization functionality is deprecated in pyarrow 2.0, and
+    will be removed in a future version.
+
+    While the serialization functions in this section utilize the Arrow stream
+    protocol internally, they do not produce data that is compatible with the
+    above ``ipc.open_file`` and ``ipc.open_stream`` functions.
+
+    For arbitrary objects, you can use the standard library ``pickle``
+    functionality instead. For pyarrow objects, you can use the IPC
+    serialization format through the ``pyarrow.ipc`` module, as explained
+    above.
+
 In ``pyarrow`` we are able to serialize and deserialize many kinds of Python
 objects. While not a complete replacement for the ``pickle`` module, these
 functions can be significantly faster, particular when dealing with
 collections of NumPy arrays.
 
-.. warning::
-
-    While the functions in this section utilize the Arrow stream protocol
-    internally, they do not produce data that is compatible with the above
-    ``ipc.open_file`` and ``ipc.open_stream`` functions.
-
 As an example, consider a dictionary containing NumPy arrays:
 
 .. ipython:: python
@@ -324,7 +332,7 @@ An object can be reconstructed from its component-based representation using
 ``SerializationContext`` objects.
 
 Serializing pandas Objects
---------------------------
+~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The default serialization context has optimized handling of pandas objects
 like ``DataFrame`` and ``Series``. Combined with component-based
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 9f544a12184..1dd209a66fc 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -187,7 +187,6 @@ def show_versions():
 from pyarrow.lib import (deserialize_from, deserialize,
                          deserialize_components,
                          serialize, serialize_to, read_serialized,
-                         SerializedPyObject,
                          SerializationContext, SerializationCallbackError,
                          DeserializationCallbackError)
 
@@ -203,15 +202,28 @@ def show_versions():
 import pyarrow.types as types
 
 
-# deprecated filesystems
+# deprecated top-level access
-from pyarrow.filesystem import FileSystem as _FileSystem, LocalFileSystem as _LocalFileSystem
+
+from pyarrow.filesystem import FileSystem as _FileSystem
+from pyarrow.filesystem import LocalFileSystem as _LocalFileSystem
 from pyarrow.hdfs import HadoopFileSystem as _HadoopFileSystem
+from pyarrow.lib import SerializationContext as _SerializationContext
+from pyarrow.lib import SerializedPyObject as _SerializedPyObject
+
+
 
 _localfs = _LocalFileSystem._get_instance()
 
-_msg = "pyarrow.{0} is deprecated as of 2.0.0, please use pyarrow.fs.{1} instead."
+_msg = (
+    "pyarrow.{0} is deprecated as of 2.0.0, please use pyarrow.fs.{1} instead."
+)
+
+_serialization_msg = (
+    "'pyarrow.{0}' is deprecated and will be removed in a future version. "
+    "Use pickle or the pyarrow IPC functionality instead."
+)
 
 _deprecated = {
     "localfs": (_localfs, "LocalFileSystem"),
@@ -220,6 +232,11 @@ def show_versions():
     "HadoopFileSystem": (_HadoopFileSystem, "HadoopFileSystem"),
 }
 
+_serialization_deprecated = {
+    "SerializationContext": _SerializationContext,
+    "SerializedPyObject": _SerializedPyObject,
+}
+
 if _sys.version_info >= (3, 7):
     def __getattr__(name):
         if name in _deprecated:
@@ -227,6 +244,10 @@ def __getattr__(name):
             _warnings.warn(_msg.format(name, new_name),
                            DeprecationWarning, stacklevel=2)
             return obj
+        elif name in _serialization_deprecated:
+            _warnings.warn(_serialization_msg.format(name),
+                           DeprecationWarning, stacklevel=2)
+            return _serialization_deprecated[name]
 
         raise AttributeError(
             "module 'pyarrow' has no attribute '{0}'".format(name)
@@ -236,6 +257,8 @@ def __getattr__(name):
     FileSystem = _FileSystem
     LocalFileSystem = _LocalFileSystem
     HadoopFileSystem = _HadoopFileSystem
+    SerializationContext = _SerializationContext
+    SerializedPyObject = _SerializedPyObject
 
 
 # Entry point for starting the plasma store
diff --git a/python/pyarrow/_plasma.pyx b/python/pyarrow/_plasma.pyx
index 157fa7a4d79..5cab1030169 100644
--- a/python/pyarrow/_plasma.pyx
+++ b/python/pyarrow/_plasma.pyx
@@ -535,7 +535,13 @@ cdef class PlasmaClient(_Weakrefable):
         """
         cdef ObjectID target_id = (object_id if object_id
                                    else ObjectID.from_random())
-        serialized = pyarrow.serialize(value, serialization_context)
+        if serialization_context is not None:
+            warnings.warn(
+                "'serialization_context' is deprecated and will be removed "
+                "in a future version.",
+                DeprecationWarning, stacklevel=2
+            )
+        serialized = pyarrow.lib._serialize(value, serialization_context)
         buffer = self.create(target_id, serialized.total_bytes)
         stream = pyarrow.FixedSizeBufferWriter(buffer)
         stream.set_memcopy_threads(memcopy_threads)
@@ -566,6 +572,12 @@ cdef class PlasmaClient(_Weakrefable):
             the object_ids and ObjectNotAvailable if the object was not
             available.
         """
+        if serialization_context is not None:
+            warnings.warn(
+                "'serialization_context' is deprecated and will be removed "
+                "in a future version.",
+                DeprecationWarning, stacklevel=2
+            )
         if isinstance(object_ids, Sequence):
             results = []
             buffers = self.get_buffers(object_ids, timeout_ms)
@@ -573,8 +585,8 @@ cdef class PlasmaClient(_Weakrefable):
                 # buffers[i] is None if this object was not available within
                 # the timeout
                 if buffers[i]:
-                    val = pyarrow.deserialize(buffers[i],
-                                              serialization_context)
+                    val = pyarrow.lib._deserialize(buffers[i],
+                                                   serialization_context)
                     results.append(val)
                 else:
                     results.append(ObjectNotAvailable)
diff --git a/python/pyarrow/serialization.pxi b/python/pyarrow/serialization.pxi
index ee999b5aa8b..b8d7df54b15 100644
--- a/python/pyarrow/serialization.pxi
+++ b/python/pyarrow/serialization.pxi
@@ -17,6 +17,16 @@
 
 from cpython.ref cimport PyObject
 
+import warnings
+
+
+def _deprecate_serialization(name):
+    msg = (
+        "'pyarrow.{}' is deprecated as of 2.0.0 and will be removed in a "
+        "future version. Use pickle or the pyarrow IPC functionality instead."
+    ).format(name)
+    warnings.warn(msg, DeprecationWarning, stacklevel=3)
+
 
 def is_named_tuple(cls):
     """
@@ -224,9 +234,10 @@ _default_context_initialized = False
 
 
 def _get_default_context():
     global _default_context_initialized
-    from pyarrow.serialization import register_default_serialization_handlers
+    from pyarrow.serialization import _register_default_serialization_handlers
     if not _default_context_initialized:
-        register_default_serialization_handlers(_default_serialization_context)
+        _register_default_serialization_handlers(
+            _default_serialization_context)
         _default_context_initialized = True
     return _default_serialization_context
@@ -369,6 +380,11 @@ def serialize(object value, SerializationContext context=None):
     serialized : SerializedPyObject
     """
+    _deprecate_serialization("serialize")
+    return _serialize(value, context)
+
+
+def _serialize(object value, SerializationContext context=None):
     cdef SerializedPyObject serialized = SerializedPyObject()
 
     wrapped_value = [value]
 
@@ -394,7 +410,8 @@ def serialize_to(object value, sink, SerializationContext context=None):
         Custom serialization and deserialization context, uses a default
         context with some standard type handlers if not specified.
     """
-    serialized = serialize(value, context)
+    _deprecate_serialization("serialize_to")
+    serialized = _serialize(value, context)
     serialized.write_to(sink)
 
 
@@ -414,6 +431,11 @@ def read_serialized(source, base=None):
     -------
     serialized : the serialized data
     """
+    _deprecate_serialization("read_serialized")
+    return _read_serialized(source, base=base)
+
+
+def _read_serialized(source, base=None):
     cdef shared_ptr[CRandomAccessFile] stream
     get_reader(source, True, &stream)
 
@@ -447,7 +469,8 @@ def deserialize_from(source, object base, SerializationContext context=None):
     object
         Python object for the deserialized sequence.
     """
-    serialized = read_serialized(source, base=base)
+    _deprecate_serialization("deserialize_from")
+    serialized = _read_serialized(source, base=base)
     return serialized.deserialize(context)
 
 
@@ -465,6 +488,7 @@ def deserialize_components(components, SerializationContext context=None):
     -------
     object : the Python object that was originally serialized
     """
+    _deprecate_serialization("deserialize_components")
     serialized = SerializedPyObject.from_components(components)
     return serialized.deserialize(context)
 
@@ -487,5 +511,11 @@ def deserialize(obj, SerializationContext context=None):
     -------
     deserialized : object
     """
+    _deprecate_serialization("deserialize")
+    return _deserialize(obj, context=context)
+
+
+def _deserialize(obj, SerializationContext context=None):
     source = BufferReader(obj)
-    return deserialize_from(source, obj, context)
+    serialized = _read_serialized(source, base=obj)
+    return serialized.deserialize(context)
diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py
index 9c1d53ea646..5e8ea697ccb 100644
--- a/python/pyarrow/serialization.py
+++ b/python/pyarrow/serialization.py
@@ -16,6 +16,7 @@
 # under the License.
 
 import collections
+import warnings
 
 import numpy as np
 
@@ -75,6 +76,14 @@ def descr_to_dtype(descr):
                      'offsets': offsets, 'itemsize': offset})
 
 
+def _deprecate_serialization(name):
+    msg = (
+        "'pyarrow.{}' is deprecated as of 2.0.0 and will be removed in a "
+        "future version. Use pickle or the pyarrow IPC functionality instead."
+    ).format(name)
+    warnings.warn(msg, DeprecationWarning, stacklevel=3)
+
+
 # ----------------------------------------------------------------------
 # Set up serialization for numpy with dtype object (primitive types are
 # handled efficiently with Arrow's Tensor facilities, see
@@ -261,6 +270,7 @@ def _deserialize_pandas_series(data):
 
 def register_torch_serialization_handlers(serialization_context):
     # ----------------------------------------------------------------------
     # Set up serialization for pytorch tensors
+    _deprecate_serialization("register_torch_serialization_handlers")
     try:
         import torch
@@ -432,7 +442,7 @@ def _deserialize_pydata_sparse(data):
         pass
 
 
-def register_default_serialization_handlers(serialization_context):
+def _register_default_serialization_handlers(serialization_context):
     # ----------------------------------------------------------------------
     # Set up serialization for primitive datatypes
 
@@ -482,7 +492,13 @@ def register_default_serialization_handlers(serialization_context):
     _register_pydata_sparse_handlers(serialization_context)
 
 
+def register_default_serialization_handlers(serialization_context):
+    _deprecate_serialization("register_default_serialization_handlers")
+    _register_default_serialization_handlers(serialization_context)
+
+
 def default_serialization_context():
+    _deprecate_serialization("default_serialization_context")
     context = SerializationContext()
-    register_default_serialization_handlers(context)
+    _register_default_serialization_handlers(context)
     return context
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 0e01dc08ef6..41b76339782 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2764,10 +2764,13 @@ def test_roundtrip_with_bytes_unicode(columns):
 
 
 def _check_serialize_components_roundtrip(pd_obj):
-    ctx = pa.default_serialization_context()
+    with pytest.warns(DeprecationWarning):
+        ctx = pa.default_serialization_context()
 
-    components = ctx.serialize(pd_obj).to_components()
-    deserialized = ctx.deserialize_components(components)
+    with pytest.warns(DeprecationWarning):
+        components = ctx.serialize(pd_obj).to_components()
+    with pytest.warns(DeprecationWarning):
+        deserialized = ctx.deserialize_components(components)
 
     if isinstance(pd_obj, pd.DataFrame):
         tm.assert_frame_equal(pd_obj, deserialized)
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index f1e0cf42563..456f42c92fa 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -52,6 +52,11 @@
     sparse = None
 
 
+# ignore all serialization deprecation warnings in this file; we test that the
+# warnings are actually raised in test_serialization_deprecated.py
+pytestmark = pytest.mark.filterwarnings("ignore:'pyarrow:DeprecationWarning")
+
+
 def assert_equal(obj1, obj2):
     if torch is not None and torch.is_tensor(obj1) and torch.is_tensor(obj2):
         if obj1.is_sparse:
@@ -233,7 +238,8 @@ class CustomError(Exception):
 
 
 def make_serialization_context():
-    context = pa.default_serialization_context()
+    with pytest.warns(DeprecationWarning):
+        context = pa.default_serialization_context()
 
     context.register_type(Foo, "Foo")
     context.register_type(Bar, "Bar")
diff --git a/python/pyarrow/tests/test_serialization_deprecated.py b/python/pyarrow/tests/test_serialization_deprecated.py
new file mode 100644
index 00000000000..f818d56b4d3
--- /dev/null
+++ b/python/pyarrow/tests/test_serialization_deprecated.py
@@ -0,0 +1,56 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+
+import pytest
+
+import pyarrow as pa
+
+
+def test_serialization_deprecated():
+    with pytest.warns(DeprecationWarning):
+        ser = pa.serialize(1)
+
+    with pytest.warns(DeprecationWarning):
+        pa.deserialize(ser.to_buffer())
+
+    f = pa.BufferOutputStream()
+    with pytest.warns(DeprecationWarning):
+        pa.serialize_to(12, f)
+
+    buf = f.getvalue()
+    f = pa.BufferReader(buf)
+    with pytest.warns(DeprecationWarning):
+        pa.read_serialized(f).deserialize()
+
+    with pytest.warns(DeprecationWarning):
+        pa.default_serialization_context()
+
+    context = pa.lib.SerializationContext()
+    with pytest.warns(DeprecationWarning):
+        pa.register_default_serialization_handlers(context)
+
+
+@pytest.mark.skipif(sys.version_info < (3, 7),
+                    reason="getattr needs Python 3.7")
+def test_serialization_deprecated_toplevel():
+    with pytest.warns(DeprecationWarning):
+        pa.SerializedPyObject()
+
+    with pytest.warns(DeprecationWarning):
+        pa.SerializationContext()
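
Migration note: the deprecation messages added in this patch all point to the
same replacement, namely the standard library ``pickle`` module for arbitrary
Python objects and the ``pyarrow.ipc`` module for Arrow data. The snippet below
is a minimal sketch of that migration against the pyarrow 2.0-era public API;
the example data and variable names are illustrative only and are not taken
from the patch.

.. code-block:: python

    import pickle

    import numpy as np
    import pyarrow as pa

    # Arbitrary Python objects: the standard library pickle module replaces
    # the deprecated pyarrow.serialize / pyarrow.deserialize functions.
    data = {"array": np.arange(5), "label": "example"}
    payload = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
    restored = pickle.loads(payload)
    assert restored["label"] == "example"

    # Arrow data: use the IPC stream format; note that it is not compatible
    # with data written by the deprecated serialization functions.
    table = pa.table({"a": [1, 2, 3]})
    sink = pa.BufferOutputStream()
    writer = pa.ipc.new_stream(sink, table.schema)
    writer.write_table(table)
    writer.close()
    buf = sink.getvalue()

    reader = pa.ipc.open_stream(buf)
    assert reader.read_all().equals(table)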