7 changes: 7 additions & 0 deletions docs/source/python/api/ipc.rst
@@ -48,6 +48,13 @@ Inter-Process Communication
Serialization
-------------

.. warning::

The serialization functionality is deprecated in pyarrow 2.0, and will
be removed in a future version. Use the standard library ``pickle`` or
the IPC functionality of pyarrow (see :ref:`ipc`) instead.


.. autosummary::
:toctree: ../generated/

22 changes: 15 additions & 7 deletions docs/source/python/ipc.rst
@@ -157,17 +157,25 @@ DataFrame output:
Arbitrary Object Serialization
------------------------------

.. warning::

The custom serialization functionality is deprecated in pyarrow 2.0, and
will be removed in a future version.

While the serialization functions in this section utilize the Arrow stream
protocol internally, they do not produce data that is compatible with the
above ``ipc.open_file`` and ``ipc.open_stream`` functions.

For arbitrary objects, you can use the standard library ``pickle``
functionality instead. For pyarrow objects, you can use the IPC
serialization format through the ``pyarrow.ipc`` module, as explained
above.
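
As a hedged sketch of those two replacement paths (standard-library
``pickle`` for arbitrary objects, ``pyarrow.ipc`` for Arrow data), a
roundtrip might look like::

    import pickle
    import pyarrow as pa

    # Arbitrary Python objects: the standard library pickle module.
    payload = pickle.dumps({"a": [1, 2, 3]})
    restored = pickle.loads(payload)

    # pyarrow objects: the IPC stream format from the pyarrow.ipc module.
    table = pa.table({"col": [1, 2, 3]})
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)
    reader = pa.ipc.open_stream(sink.getvalue())
    roundtripped = reader.read_all()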

In ``pyarrow`` we are able to serialize and deserialize many kinds of Python
objects. While not a complete replacement for the ``pickle`` module, these
functions can be significantly faster, particularly when dealing with collections
of NumPy arrays.

.. warning::

While the functions in this section utilize the Arrow stream protocol
internally, they do not produce data that is compatible with the above
``ipc.open_file`` and ``ipc.open_stream`` functions.

As an example, consider a dictionary containing NumPy arrays:

.. ipython:: python
@@ -324,7 +332,7 @@ An object can be reconstructed from its component-based representation using
``SerializationContext`` objects.

Serializing pandas Objects
--------------------------
~~~~~~~~~~~~~~~~~~~~~~~~~~

The default serialization context has optimized handling of pandas
objects like ``DataFrame`` and ``Series``. Combined with component-based
31 changes: 27 additions & 4 deletions python/pyarrow/__init__.py
@@ -187,7 +187,6 @@ def show_versions():
from pyarrow.lib import (deserialize_from, deserialize,
deserialize_components,
serialize, serialize_to, read_serialized,
SerializedPyObject, SerializationContext,
SerializationCallbackError,
DeserializationCallbackError)

@@ -203,15 +202,28 @@ def show_versions():
import pyarrow.types as types


# deprecated filesystems
# deprecated top-level access

from pyarrow.filesystem import FileSystem as _FileSystem, LocalFileSystem as _LocalFileSystem

from pyarrow.filesystem import FileSystem as _FileSystem
from pyarrow.filesystem import LocalFileSystem as _LocalFileSystem
from pyarrow.hdfs import HadoopFileSystem as _HadoopFileSystem

from pyarrow.lib import SerializationContext as _SerializationContext
from pyarrow.lib import SerializedPyObject as _SerializedPyObject


_localfs = _LocalFileSystem._get_instance()


_msg = "pyarrow.{0} is deprecated as of 2.0.0, please use pyarrow.fs.{1} instead."
_msg = (
"pyarrow.{0} is deprecated as of 2.0.0, please use pyarrow.fs.{1} instead."
)

_serialization_msg = (
"'pyarrow.{0}' is deprecated and will be removed in a future version. "
"Use pickle or the pyarrow IPC functionality instead."
)

_deprecated = {
"localfs": (_localfs, "LocalFileSystem"),
@@ -220,13 +232,22 @@ def show_versions():
"HadoopFileSystem": (_HadoopFileSystem, "HadoopFileSystem"),
}

_serialization_deprecated = {
"SerializationContext": _SerializationContext,
"SerializedPyObject": _SerializedPyObject,
}

if _sys.version_info >= (3, 7):
def __getattr__(name):
if name in _deprecated:
obj, new_name = _deprecated[name]
_warnings.warn(_msg.format(name, new_name),
DeprecationWarning, stacklevel=2)
return obj
elif name in _serialization_deprecated:
_warnings.warn(_serialization_msg.format(name),
DeprecationWarning, stacklevel=2)
return _serialization_deprecated[name]

raise AttributeError(
"module 'pyarrow' has no attribute '{0}'".format(name)
@@ -236,6 +257,8 @@ def __getattr__(name):
FileSystem = _FileSystem
LocalFileSystem = _LocalFileSystem
HadoopFileSystem = _HadoopFileSystem
SerializationContext = _SerializationContext
SerializedPyObject = _SerializedPyObject
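
For reference, the lazy-warning mechanism above is PEP 562: on Python 3.7+
a module-level ``__getattr__`` is invoked only when normal attribute lookup
fails, so the DeprecationWarning fires exactly when a deprecated name is
accessed, while the ``else`` branch binds the names eagerly (and silently)
on older interpreters. A minimal self-contained sketch with hypothetical
names:

    import sys
    import warnings

    _renamed = {"old_name": "object standing in for the real attribute"}

    if sys.version_info >= (3, 7):
        def __getattr__(name):
            # Called only when 'name' is not found by normal module lookup.
            if name in _renamed:
                warnings.warn("{} is deprecated".format(name),
                              DeprecationWarning, stacklevel=2)
                return _renamed[name]
            raise AttributeError(
                "module has no attribute {!r}".format(name))
    else:
        # Pre-3.7: no module __getattr__, so expose the name directly.
        old_name = _renamed["old_name"]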


# Entry point for starting the plasma store
18 changes: 15 additions & 3 deletions python/pyarrow/_plasma.pyx
@@ -535,7 +535,13 @@ cdef class PlasmaClient(_Weakrefable):
"""
cdef ObjectID target_id = (object_id if object_id
else ObjectID.from_random())
serialized = pyarrow.serialize(value, serialization_context)
if serialization_context is not None:
warnings.warn(
"'serialization_context' is deprecated and will be removed "
"in a future version.",
DeprecationWarning, stacklevel=2
)
serialized = pyarrow.lib._serialize(value, serialization_context)
buffer = self.create(target_id, serialized.total_bytes)
stream = pyarrow.FixedSizeBufferWriter(buffer)
stream.set_memcopy_threads(memcopy_threads)
@@ -566,15 +572,21 @@
the object_ids and ObjectNotAvailable if the object was not
available.
"""
if serialization_context is not None:
warnings.warn(
"'serialization_context' is deprecated and will be removed "
"in a future version.",
DeprecationWarning, stacklevel=2
)
if isinstance(object_ids, Sequence):
results = []
buffers = self.get_buffers(object_ids, timeout_ms)
for i in range(len(object_ids)):
# buffers[i] is None if this object was not available within
# the timeout
if buffers[i]:
val = pyarrow.deserialize(buffers[i],
serialization_context)
val = pyarrow.lib._deserialize(buffers[i],
serialization_context)
results.append(val)
else:
results.append(ObjectNotAvailable)
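
With the ``serialization_context`` argument deprecated, a plain put/get
round-trip relies on the default serialization. A hedged usage sketch,
assuming a plasma store is already listening on a hypothetical socket path:

    import numpy as np
    import pyarrow.plasma as plasma

    # "/tmp/plasma" is a hypothetical socket path; the store must already
    # be running, e.g. started with the plasma_store executable.
    client = plasma.connect("/tmp/plasma")
    object_id = client.put(np.arange(10))   # no serialization_context passed
    value = client.get(object_id)           # deserializes with the defaults
    client.disconnect()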
40 changes: 35 additions & 5 deletions python/pyarrow/serialization.pxi
@@ -17,6 +17,16 @@

from cpython.ref cimport PyObject

import warnings


def _deprecate_serialization(name):
msg = (
"'pyarrow.{}' is deprecated as of 2.0.0 and will be removed in a "
"future version. Use pickle or the pyarrow IPC functionality instead."
).format(name)
warnings.warn(msg, DeprecationWarning, stacklevel=3)
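
A self-contained sketch (hypothetical names) of why ``stacklevel=3`` is the
right depth here: one frame for the helper itself, one for the public
wrapper that calls it, so the warning is reported at the user's call site:

    import warnings

    def _deprecate(name):
        # stacklevel=1 would blame this helper, 2 the wrapper below,
        # 3 the user code that called the wrapper -- the useful location.
        warnings.warn("'{}' is deprecated".format(name),
                      DeprecationWarning, stacklevel=3)

    def public_api():
        _deprecate("public_api")

    public_api()  # the warning is attributed to this line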


def is_named_tuple(cls):
"""
@@ -224,9 +234,10 @@ _default_context_initialized = False

def _get_default_context():
global _default_context_initialized
from pyarrow.serialization import register_default_serialization_handlers
from pyarrow.serialization import _register_default_serialization_handlers
if not _default_context_initialized:
register_default_serialization_handlers(_default_serialization_context)
_register_default_serialization_handlers(
_default_serialization_context)
_default_context_initialized = True
return _default_serialization_context

@@ -369,6 +380,11 @@ def serialize(object value, SerializationContext context=None):
serialized : SerializedPyObject

"""
_deprecate_serialization("serialize")
return _serialize(value, context)


def _serialize(object value, SerializationContext context=None):
cdef SerializedPyObject serialized = SerializedPyObject()
wrapped_value = [value]

@@ -394,7 +410,8 @@ def serialize_to(object value, sink, SerializationContext context=None):
Custom serialization and deserialization context, uses a default
context with some standard type handlers if not specified.
"""
serialized = serialize(value, context)
_deprecate_serialization("serialize_to")
serialized = _serialize(value, context)
serialized.write_to(sink)
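
Taken together with pyarrow's in-memory buffer classes, the deprecated
wrappers above still round-trip; a hedged sketch (each call now emits a
DeprecationWarning as of 2.0):

    import pyarrow as pa

    sink = pa.BufferOutputStream()
    pa.serialize_to([1, 2, 3], sink)     # warns, then writes the stream
    buf = sink.getvalue()
    value = pa.deserialize_from(pa.BufferReader(buf), buf)  # warns on read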


@@ -414,6 +431,11 @@ def read_serialized(source, base=None):
-------
serialized : the serialized data
"""
_deprecate_serialization("read_serialized")
return _read_serialized(source, base=base)


def _read_serialized(source, base=None):
cdef shared_ptr[CRandomAccessFile] stream
get_reader(source, True, &stream)

@@ -447,7 +469,8 @@ def deserialize_from(source, object base, SerializationContext context=None):
object
Python object for the deserialized sequence.
"""
serialized = read_serialized(source, base=base)
_deprecate_serialization("deserialize_from")
serialized = _read_serialized(source, base=base)
return serialized.deserialize(context)


@@ -465,6 +488,7 @@ def deserialize_components(components, SerializationContext context=None):
-------
object : the Python object that was originally serialized
"""
_deprecate_serialization("deserialize_components")
serialized = SerializedPyObject.from_components(components)
return serialized.deserialize(context)

@@ -487,5 +511,11 @@ def deserialize(obj, SerializationContext context=None):
-------
deserialized : object
"""
_deprecate_serialization("deserialize")
return _deserialize(obj, context=context)


def _deserialize(obj, SerializationContext context=None):
source = BufferReader(obj)
return deserialize_from(source, obj, context)
serialized = _read_serialized(source, base=obj)
return serialized.deserialize(context)
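
The refactor above follows one pattern consistently: the public name warns
and delegates, while the private ``_``-prefixed implementation stays silent
so internal callers (like ``_deserialize`` here) never cascade warnings. A
minimal sketch of the pattern with hypothetical names:

    def _do_work(x):
        return x * 2                         # real implementation, no warning

    def do_work(x):
        _deprecate_serialization("do_work")  # warn only at the public boundary
        return _do_work(x)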
20 changes: 18 additions & 2 deletions python/pyarrow/serialization.py
@@ -16,6 +16,7 @@
# under the License.

import collections
import warnings

import numpy as np

@@ -75,6 +76,14 @@ def descr_to_dtype(descr):
'offsets': offsets, 'itemsize': offset})


def _deprecate_serialization(name):
msg = (
"'pyarrow.{}' is deprecated as of 2.0.0 and will be removed in a "
"future version. Use pickle or the pyarrow IPC functionality instead."
).format(name)
warnings.warn(msg, DeprecationWarning, stacklevel=3)


# ----------------------------------------------------------------------
# Set up serialization for numpy with dtype object (primitive types are
# handled efficiently with Arrow's Tensor facilities, see
@@ -261,6 +270,7 @@ def _deserialize_pandas_series(data):
def register_torch_serialization_handlers(serialization_context):
# ----------------------------------------------------------------------
# Set up serialization for pytorch tensors
_deprecate_serialization("register_torch_serialization_handlers")

try:
import torch
@@ -432,7 +442,7 @@ def _deserialize_pydata_sparse(data):
pass


def register_default_serialization_handlers(serialization_context):
def _register_default_serialization_handlers(serialization_context):

# ----------------------------------------------------------------------
# Set up serialization for primitive datatypes
@@ -482,7 +492,13 @@ def register_default_serialization_handlers(serialization_context):
_register_pydata_sparse_handlers(serialization_context)


def register_default_serialization_handlers(serialization_context):
_deprecate_serialization("register_default_serialization_handlers")
_register_default_serialization_handlers(serialization_context)


def default_serialization_context():
_deprecate_serialization("default_serialization_context")
context = SerializationContext()
register_default_serialization_handlers(context)
_register_default_serialization_handlers(context)
return context
9 changes: 6 additions & 3 deletions python/pyarrow/tests/test_pandas.py
@@ -2764,10 +2764,13 @@ def test_roundtrip_with_bytes_unicode(columns):


def _check_serialize_components_roundtrip(pd_obj):
ctx = pa.default_serialization_context()
with pytest.warns(DeprecationWarning):
ctx = pa.default_serialization_context()

components = ctx.serialize(pd_obj).to_components()
deserialized = ctx.deserialize_components(components)
with pytest.warns(DeprecationWarning):
components = ctx.serialize(pd_obj).to_components()
with pytest.warns(DeprecationWarning):
deserialized = ctx.deserialize_components(components)

if isinstance(pd_obj, pd.DataFrame):
tm.assert_frame_equal(pd_obj, deserialized)
8 changes: 7 additions & 1 deletion python/pyarrow/tests/test_serialization.py
@@ -52,6 +52,11 @@
sparse = None


# ignore all serialization deprecation warnings in this file, we test that the
# warnings are actually raised in test_serialization_deprecated.py
pytestmark = pytest.mark.filterwarnings("ignore:'pyarrow:DeprecationWarning")
Member:
Hmm... is this another pytest magic? What does it do exactly? Filter these warnings only for this test module?

Member (Author):
Indeed, this is a way to suppress (ignore) all DeprecationWarnings that start with "pyarrow" in this test module. That's easier than catching all individual warnings, since this full file is about serialization and thus deprecated.
(and therefore I added a separate file to explicitly test the warnings are raised)

Will add a short comment about it
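
For readers unfamiliar with the mark discussed above, a hedged standalone
sketch: ``pytestmark`` at module level applies to every test in the file,
and the filter string follows the ``-W`` syntax ``action:message-prefix:category``:

    import warnings
    import pytest

    # Ignore, module-wide, any DeprecationWarning whose message starts
    # with 'pyarrow (note the leading quote is part of the match).
    pytestmark = pytest.mark.filterwarnings(
        "ignore:'pyarrow:DeprecationWarning")

    def test_suppressed():
        # This warning is ignored for the duration of the test.
        warnings.warn("'pyarrow.serialize' is deprecated", DeprecationWarning)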



def assert_equal(obj1, obj2):
if torch is not None and torch.is_tensor(obj1) and torch.is_tensor(obj2):
if obj1.is_sparse:
Expand Down Expand Up @@ -233,7 +238,8 @@ class CustomError(Exception):


def make_serialization_context():
context = pa.default_serialization_context()
with pytest.warns(DeprecationWarning):
context = pa.default_serialization_context()

context.register_type(Foo, "Foo")
context.register_type(Bar, "Bar")