7 changes: 7 additions & 0 deletions docs/source/python/api/ipc.rst
@@ -48,6 +48,13 @@ Inter-Process Communication
Serialization
-------------

.. warning::

The serialization functionality is deprecated in pyarrow 2.0, and will
be removed in a future version. Use the standard library ``pickle`` or
the IPC functionality of pyarrow (see :ref:`ipc`) instead.


.. autosummary::
:toctree: ../generated/

22 changes: 15 additions & 7 deletions docs/source/python/ipc.rst
@@ -157,17 +157,25 @@ DataFrame output:
Arbitrary Object Serialization
------------------------------

.. warning::

The custom serialization functionality is deprecated in pyarrow 2.0, and
will be removed in a future version.

While the serialization functions in this section utilize the Arrow stream
protocol internally, they do not produce data that is compatible with the
above ``ipc.open_file`` and ``ipc.open_stream`` functions.

For arbitrary objects, you can use the standard library ``pickle``
functionality instead. For pyarrow objects, you can use the IPC
serialization format through the ``pyarrow.ipc`` module, as explained
above.
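
As a hedged sketch of those two replacement paths (standard-library
``pickle`` for arbitrary objects, ``pyarrow.ipc`` for Arrow data), a
roundtrip might look like::

    import pickle
    import pyarrow as pa

    # Arbitrary Python objects: the standard library pickle module.
    payload = pickle.dumps({"a": [1, 2, 3]})
    restored = pickle.loads(payload)

    # pyarrow objects: the IPC stream format from the pyarrow.ipc module.
    table = pa.table({"col": [1, 2, 3]})
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)
    reader = pa.ipc.open_stream(sink.getvalue())
    roundtripped = reader.read_all()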

In ``pyarrow`` we are able to serialize and deserialize many kinds of Python
objects. While not a complete replacement for the ``pickle`` module, these
functions can be significantly faster, particularly when dealing with collections
of NumPy arrays.

.. warning::

While the functions in this section utilize the Arrow stream protocol
internally, they do not produce data that is compatible with the above
``ipc.open_file`` and ``ipc.open_stream`` functions.

As an example, consider a dictionary containing NumPy arrays:

.. ipython:: python
@@ -324,7 +332,7 @@ An object can be reconstructed from its component-based representation using
``SerializationContext`` objects.

Serializing pandas Objects
--------------------------
~~~~~~~~~~~~~~~~~~~~~~~~~~

The default serialization context has optimized handling of pandas
objects like ``DataFrame`` and ``Series``. Combined with component-based
31 changes: 27 additions & 4 deletions python/pyarrow/__init__.py
@@ -187,7 +187,6 @@ def show_versions():
from pyarrow.lib import (deserialize_from, deserialize,
deserialize_components,
serialize, serialize_to, read_serialized,
SerializedPyObject, SerializationContext,
SerializationCallbackError,
DeserializationCallbackError)

@@ -203,15 +202,28 @@ def show_versions():
import pyarrow.types as types


# deprecated filesystems
# deprecated top-level access

from pyarrow.filesystem import FileSystem as _FileSystem, LocalFileSystem as _LocalFileSystem

from pyarrow.filesystem import FileSystem as _FileSystem
from pyarrow.filesystem import LocalFileSystem as _LocalFileSystem
from pyarrow.hdfs import HadoopFileSystem as _HadoopFileSystem

from pyarrow.lib import SerializationContext as _SerializationContext
from pyarrow.lib import SerializedPyObject as _SerializedPyObject


_localfs = _LocalFileSystem._get_instance()


_msg = "pyarrow.{0} is deprecated as of 2.0.0, please use pyarrow.fs.{1} instead."
_msg = (
"pyarrow.{0} is deprecated as of 2.0.0, please use pyarrow.fs.{1} instead."
)

_serialization_msg = (
"'pyarrow.{0}' is deprecated and will be removed in a future version. "
"Use pickle or the pyarrow IPC functionality instead."
)

_deprecated = {
"localfs": (_localfs, "LocalFileSystem"),
@@ -220,13 +232,22 @@ def show_versions():
"HadoopFileSystem": (_HadoopFileSystem, "HadoopFileSystem"),
}

_serialization_deprecated = {
"SerializationContext": _SerializationContext,
"SerializedPyObject": _SerializedPyObject,
}

if _sys.version_info >= (3, 7):
def __getattr__(name):
if name in _deprecated:
obj, new_name = _deprecated[name]
_warnings.warn(_msg.format(name, new_name),
DeprecationWarning, stacklevel=2)
return obj
elif name in _serialization_deprecated:
_warnings.warn(_serialization_msg.format(name),
DeprecationWarning, stacklevel=2)
return _serialization_deprecated[name]

raise AttributeError(
"module 'pyarrow' has no attribute '{0}'".format(name)
@@ -236,6 +257,8 @@ def __getattr__(name):
FileSystem = _FileSystem
LocalFileSystem = _LocalFileSystem
HadoopFileSystem = _HadoopFileSystem
SerializationContext = _SerializationContext
SerializedPyObject = _SerializedPyObject
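
For reference, the lazy-warning mechanism above is PEP 562: on Python 3.7+
a module-level ``__getattr__`` is invoked only when normal attribute lookup
fails, so the DeprecationWarning fires exactly when a deprecated name is
accessed, while the ``else`` branch binds the names eagerly (and silently)
on older interpreters. A minimal self-contained sketch with hypothetical
names:

    import sys
    import warnings

    _renamed = {"old_name": "object standing in for the real attribute"}

    if sys.version_info >= (3, 7):
        def __getattr__(name):
            # Called only when 'name' is not found by normal module lookup.
            if name in _renamed:
                warnings.warn("{} is deprecated".format(name),
                              DeprecationWarning, stacklevel=2)
                return _renamed[name]
            raise AttributeError(
                "module has no attribute {!r}".format(name))
    else:
        # Pre-3.7: no module __getattr__, so expose the name directly.
        old_name = _renamed["old_name"]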


# Entry point for starting the plasma store
18 changes: 15 additions & 3 deletions python/pyarrow/_plasma.pyx
@@ -535,7 +535,13 @@ cdef class PlasmaClient(_Weakrefable):
"""
cdef ObjectID target_id = (object_id if object_id
else ObjectID.from_random())
serialized = pyarrow.serialize(value, serialization_context)
if serialization_context is not None:
warnings.warn(
"'serialization_context' is deprecated and will be removed "
"in a future version.",
DeprecationWarning, stacklevel=2
)
serialized = pyarrow.lib._serialize(value, serialization_context)
buffer = self.create(target_id, serialized.total_bytes)
stream = pyarrow.FixedSizeBufferWriter(buffer)
stream.set_memcopy_threads(memcopy_threads)
@@ -566,15 +572,21 @@
the object_ids and ObjectNotAvailable if the object was not
available.
"""
if serialization_context is not None:
warnings.warn(
"'serialization_context' is deprecated and will be removed "
"in a future version.",
DeprecationWarning, stacklevel=2
)
if isinstance(object_ids, Sequence):
results = []
buffers = self.get_buffers(object_ids, timeout_ms)
for i in range(len(object_ids)):
# buffers[i] is None if this object was not available within
# the timeout
if buffers[i]:
val = pyarrow.deserialize(buffers[i],
serialization_context)
val = pyarrow.lib._deserialize(buffers[i],
serialization_context)
results.append(val)
else:
results.append(ObjectNotAvailable)
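
With the ``serialization_context`` argument deprecated, a plain put/get
round-trip relies on the default serialization. A hedged usage sketch,
assuming a plasma store is already listening on a hypothetical socket path:

    import numpy as np
    import pyarrow.plasma as plasma

    # "/tmp/plasma" is a hypothetical socket path; the store must already
    # be running, e.g. started with the plasma_store executable.
    client = plasma.connect("/tmp/plasma")
    object_id = client.put(np.arange(10))   # no serialization_context passed
    value = client.get(object_id)           # deserializes with the defaults
    client.disconnect()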
40 changes: 35 additions & 5 deletions python/pyarrow/serialization.pxi
@@ -17,6 +17,16 @@

from cpython.ref cimport PyObject

import warnings


def _deprecate_serialization(name):
msg = (
"'pyarrow.{}' is deprecated as of 2.0.0 and will be removed in a "
"future version. Use pickle or the pyarrow IPC functionality instead."
).format(name)
warnings.warn(msg, DeprecationWarning, stacklevel=3)
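
A self-contained sketch (hypothetical names) of why ``stacklevel=3`` is the
right depth here: one frame for the helper itself, one for the public
wrapper that calls it, so the warning is reported at the user's call site:

    import warnings

    def _deprecate(name):
        # stacklevel=1 would blame this helper, 2 the wrapper below,
        # 3 the user code that called the wrapper -- the useful location.
        warnings.warn("'{}' is deprecated".format(name),
                      DeprecationWarning, stacklevel=3)

    def public_api():
        _deprecate("public_api")

    public_api()  # the warning is attributed to this line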


def is_named_tuple(cls):
"""
@@ -224,9 +234,10 @@ _default_context_initialized = False

def _get_default_context():
global _default_context_initialized
from pyarrow.serialization import register_default_serialization_handlers
from pyarrow.serialization import _register_default_serialization_handlers
if not _default_context_initialized:
register_default_serialization_handlers(_default_serialization_context)
_register_default_serialization_handlers(
_default_serialization_context)
_default_context_initialized = True
return _default_serialization_context

@@ -369,6 +380,11 @@ def serialize(object value, SerializationContext context=None):
serialized : SerializedPyObject

"""
_deprecate_serialization("serialize")
return _serialize(value, context)


def _serialize(object value, SerializationContext context=None):
cdef SerializedPyObject serialized = SerializedPyObject()
wrapped_value = [value]

@@ -394,7 +410,8 @@ def serialize_to(object value, sink, SerializationContext context=None):
Custom serialization and deserialization context, uses a default
context with some standard type handlers if not specified.
"""
serialized = serialize(value, context)
_deprecate_serialization("serialize_to")
serialized = _serialize(value, context)
serialized.write_to(sink)
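
Taken together with pyarrow's in-memory buffer classes, the deprecated
wrappers above still round-trip; a hedged sketch (each call now emits a
DeprecationWarning as of 2.0):

    import pyarrow as pa

    sink = pa.BufferOutputStream()
    pa.serialize_to([1, 2, 3], sink)     # warns, then writes the stream
    buf = sink.getvalue()
    value = pa.deserialize_from(pa.BufferReader(buf), buf)  # warns on read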


@@ -414,6 +431,11 @@ def read_serialized(source, base=None):
-------
serialized : the serialized data
"""
_deprecate_serialization("read_serialized")
return _read_serialized(source, base=base)


def _read_serialized(source, base=None):
cdef shared_ptr[CRandomAccessFile] stream
get_reader(source, True, &stream)

@@ -447,7 +469,8 @@ def deserialize_from(source, object base, SerializationContext context=None):
object
Python object for the deserialized sequence.
"""
serialized = read_serialized(source, base=base)
_deprecate_serialization("deserialize_from")
serialized = _read_serialized(source, base=base)
return serialized.deserialize(context)


@@ -465,6 +488,7 @@ def deserialize_components(components, SerializationContext context=None):
-------
object : the Python object that was originally serialized
"""
_deprecate_serialization("deserialize_components")
serialized = SerializedPyObject.from_components(components)
return serialized.deserialize(context)

@@ -487,5 +511,11 @@ def deserialize(obj, SerializationContext context=None):
-------
deserialized : object
"""
_deprecate_serialization("deserialize")
return _deserialize(obj, context=context)


def _deserialize(obj, SerializationContext context=None):
source = BufferReader(obj)
return deserialize_from(source, obj, context)
serialized = _read_serialized(source, base=obj)
return serialized.deserialize(context)
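
The refactor above follows one pattern consistently: the public name warns
and delegates, while the private ``_``-prefixed implementation stays silent
so internal callers (like ``_deserialize`` here) never cascade warnings. A
minimal sketch of the pattern with hypothetical names:

    def _do_work(x):
        return x * 2                         # real implementation, no warning

    def do_work(x):
        _deprecate_serialization("do_work")  # warn only at the public boundary
        return _do_work(x)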
20 changes: 18 additions & 2 deletions python/pyarrow/serialization.py
@@ -16,6 +16,7 @@
# under the License.

import collections
import warnings

import numpy as np

@@ -75,6 +76,14 @@ def descr_to_dtype(descr):
'offsets': offsets, 'itemsize': offset})


def _deprecate_serialization(name):
msg = (
"'pyarrow.{}' is deprecated as of 2.0.0 and will be removed in a "
"future version. Use pickle or the pyarrow IPC functionality instead."
).format(name)
warnings.warn(msg, DeprecationWarning, stacklevel=3)


# ----------------------------------------------------------------------
# Set up serialization for numpy with dtype object (primitive types are
# handled efficiently with Arrow's Tensor facilities, see
@@ -261,6 +270,7 @@ def _deserialize_pandas_series(data):
def register_torch_serialization_handlers(serialization_context):
# ----------------------------------------------------------------------
# Set up serialization for pytorch tensors
_deprecate_serialization("register_torch_serialization_handlers")

try:
import torch
@@ -432,7 +442,7 @@ def _deserialize_pydata_sparse(data):
pass


def register_default_serialization_handlers(serialization_context):
def _register_default_serialization_handlers(serialization_context):

# ----------------------------------------------------------------------
# Set up serialization for primitive datatypes
@@ -482,7 +492,13 @@ def register_default_serialization_handlers(serialization_context):
_register_pydata_sparse_handlers(serialization_context)


def register_default_serialization_handlers(serialization_context):
_deprecate_serialization("register_default_serialization_handlers")
_register_default_serialization_handlers(serialization_context)


def default_serialization_context():
_deprecate_serialization("default_serialization_context")
context = SerializationContext()
register_default_serialization_handlers(context)
_register_default_serialization_handlers(context)
return context
9 changes: 6 additions & 3 deletions python/pyarrow/tests/test_pandas.py
@@ -2764,10 +2764,13 @@ def test_roundtrip_with_bytes_unicode(columns):


def _check_serialize_components_roundtrip(pd_obj):
ctx = pa.default_serialization_context()
with pytest.warns(DeprecationWarning):
ctx = pa.default_serialization_context()

components = ctx.serialize(pd_obj).to_components()
deserialized = ctx.deserialize_components(components)
with pytest.warns(DeprecationWarning):
components = ctx.serialize(pd_obj).to_components()
with pytest.warns(DeprecationWarning):
deserialized = ctx.deserialize_components(components)

if isinstance(pd_obj, pd.DataFrame):
tm.assert_frame_equal(pd_obj, deserialized)
8 changes: 7 additions & 1 deletion python/pyarrow/tests/test_serialization.py
@@ -52,6 +52,11 @@
sparse = None


# ignore all serialization deprecation warnings in this file, we test that the
# warnings are actually raised in test_serialization_deprecated.py
pytestmark = pytest.mark.filterwarnings("ignore:'pyarrow:DeprecationWarning")
Member:
Hmm... is this another pytest magic? What does it do exactly? Filter these warnings only for this test module?

Member (Author):
Indeed, this is a way to suppress (ignore) all DeprecationWarnings that start with "pyarrow" in this test module. That's easier than catching all individual warnings, since this full file is about serialization and thus deprecated.
(and therefore I added a separate file to explicitly test the warnings are raised)

Will add a short comment about it
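
For readers unfamiliar with the mark discussed above, a hedged standalone
sketch: ``pytestmark`` at module level applies to every test in the file,
and the filter string follows the ``-W`` syntax ``action:message-prefix:category``:

    import warnings
    import pytest

    # Ignore, module-wide, any DeprecationWarning whose message starts
    # with 'pyarrow (note the leading quote is part of the match).
    pytestmark = pytest.mark.filterwarnings(
        "ignore:'pyarrow:DeprecationWarning")

    def test_suppressed():
        # This warning is ignored for the duration of the test.
        warnings.warn("'pyarrow.serialize' is deprecated", DeprecationWarning)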



def assert_equal(obj1, obj2):
if torch is not None and torch.is_tensor(obj1) and torch.is_tensor(obj2):
if obj1.is_sparse:
Expand Down Expand Up @@ -233,7 +238,8 @@ class CustomError(Exception):


def make_serialization_context():
context = pa.default_serialization_context()
with pytest.warns(DeprecationWarning):
context = pa.default_serialization_context()

context.register_type(Foo, "Foo")
context.register_type(Bar, "Bar")