diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py index ea6deab2269..3be93177d00 100644 --- a/python/pyarrow/serialization.py +++ b/python/pyarrow/serialization.py @@ -350,6 +350,37 @@ def _deserialize_scipy_sparse(data): pass +# ---------------------------------------------------------------------- +# Set up serialization for pydata/sparse tensors. + +def _register_pydata_sparse_handlers(serialization_context): + try: + import sparse + + def _serialize_pydata_sparse(obj): + if isinstance(obj, sparse.COO): + return 'coo', pa.SparseCOOTensor.from_pydata_sparse(obj) + else: + raise NotImplementedError( + "Serialization of {} is not supported.".format(sparse.COO)) + + def _deserialize_pydata_sparse(data): + if data[0] == 'coo': + data_array, coords = data[1].to_numpy() + return sparse.COO( + data=data_array[:, 0], + coords=coords.T, shape=data[1].shape) + + serialization_context.register_type( + sparse.COO, 'sparse.COO', + custom_serializer=_serialize_pydata_sparse, + custom_deserializer=_deserialize_pydata_sparse) + + except ImportError: + # no pydata/sparse + pass + + def register_default_serialization_handlers(serialization_context): # ---------------------------------------------------------------------- @@ -403,6 +434,7 @@ def register_default_serialization_handlers(serialization_context): _register_collections_serialization_handlers(serialization_context) _register_custom_pandas_handlers(serialization_context) _register_scipy_handlers(serialization_context) + _register_pydata_sparse_handlers(serialization_context) def default_serialization_context(): diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi index 8661c1edd13..37a87cdc823 100644 --- a/python/pyarrow/tensor.pxi +++ b/python/pyarrow/tensor.pxi @@ -203,7 +203,34 @@ shape: {0.shape}""".format(self) coords = np.require(coords, dtype='i8', requirements='F') check_status(NdarraysToSparseCOOTensor(c_default_memory_pool(), - obj.data.view(), coords, c_shape, c_dim_names, + obj.data, coords, c_shape, c_dim_names, + &csparse_tensor)) + return pyarrow_wrap_sparse_coo_tensor(csparse_tensor) + + @staticmethod + def from_pydata_sparse(obj, dim_names=None): + """ + Convert pydata/sparse.COO to arrow::SparseCOOTensor + """ + import sparse + if not isinstance(obj, sparse.COO): + raise TypeError( + "Expected sparse.COO, got {}".format(type(obj))) + + cdef shared_ptr[CSparseCOOTensor] csparse_tensor + cdef vector[int64_t] c_shape + cdef vector[c_string] c_dim_names + + for x in obj.shape: + c_shape.push_back(x) + if dim_names is not None: + for x in dim_names: + c_dim_names.push_back(tobytes(x)) + + coords = np.require(obj.coords.T, dtype='i8', requirements='F') + + check_status(NdarraysToSparseCOOTensor(c_default_memory_pool(), + obj.data, coords, c_shape, c_dim_names, &csparse_tensor)) return pyarrow_wrap_sparse_coo_tensor(csparse_tensor) @@ -247,6 +274,21 @@ shape: {0.shape}""".format(self) shape=self.shape) return result + def to_pydata_sparse(self): + """ + Convert arrow::SparseCOOTensor to pydata/sparse.COO + """ + from sparse import COO + cdef PyObject* out_data + cdef PyObject* out_coords + + check_status(SparseCOOTensorToNdarray(self.sp_sparse_tensor, self, + &out_data, &out_coords)) + data = PyObject_to_object(out_data) + coords = PyObject_to_object(out_coords) + result = COO(data=data[:, 0], coords=coords.T, shape=self.shape) + return result + def to_tensor(self): """ Convert arrow::SparseCOOTensor to arrow::Tensor diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py index 53fccff5763..4077739d02e 100644 --- a/python/pyarrow/tests/test_serialization.py +++ b/python/pyarrow/tests/test_serialization.py @@ -46,6 +46,11 @@ coo_matrix = None csr_matrix = None +try: + import sparse +except ImportError: + sparse = None + def assert_equal(obj1, obj2): if torch is not None and torch.is_tensor(obj1) and torch.is_tensor(obj2): @@ -597,6 +602,22 @@ def test_scipy_sparse_coo_tensor_serialization(): assert np.array_equal(sparse_array.toarray(), result.toarray()) +@pytest.mark.skipif(not sparse, reason="requires pydata/sparse") +def test_pydata_sparse_sparse_coo_tensor_serialization(): + data = np.array([1, 2, 3, 4, 5, 6]) + coords = np.array([ + [0, 0, 2, 3, 1, 3], + [0, 2, 0, 4, 5, 5], + ]) + shape = (4, 6) + + sparse_array = sparse.COO(data=data, coords=coords, shape=shape) + serialized = pa.serialize(sparse_array) + result = serialized.deserialize() + + assert np.array_equal(sparse_array.todense(), result.todense()) + + @pytest.mark.parametrize('tensor_type', tensor_types) @pytest.mark.parametrize('index_type', index_types) def test_sparse_csr_matrix_serialization(index_type, tensor_type): diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index 31c40bd94ed..24b45928dfb 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -27,6 +27,11 @@ coo_matrix = None csr_matrix = None +try: + import sparse +except ImportError: + sparse = None + tensor_type_pairs = [ ('i1', pa.int8()), @@ -325,3 +330,29 @@ def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type): else: dense_array = sparse_array.toarray() assert np.array_equal(dense_array, sparse_tensor.to_tensor().to_numpy()) + + +@pytest.mark.skipif(not sparse, reason="requires pydata/sparse") +@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs) +def test_pydata_sparse_sparse_coo_tensor_roundtrip(dtype_str, arrow_type): + dtype = np.dtype(dtype_str) + data = np.array([1, 2, 3, 4, 5, 6]).astype(dtype) + coords = np.array([ + [0, 0, 2, 3, 1, 3], + [0, 2, 0, 4, 5, 5], + ]) + shape = (4, 6) + dim_names = ("x", "y") + + sparse_array = sparse.COO(data=data, coords=coords, shape=shape) + sparse_tensor = pa.SparseCOOTensor.from_pydata_sparse(sparse_array, + dim_names=dim_names) + out_sparse_array = sparse_tensor.to_pydata_sparse() + + assert sparse_tensor.type == arrow_type + assert sparse_tensor.dim_names == dim_names + assert sparse_array.dtype == out_sparse_array.dtype + assert np.array_equal(sparse_array.data, out_sparse_array.data) + assert np.array_equal(sparse_array.coords, out_sparse_array.coords) + assert np.array_equal(sparse_array.todense(), + sparse_tensor.to_tensor().to_numpy())