From 759e0b38b6a7dfac857c083f7156abba43d5e523 Mon Sep 17 00:00:00 2001 From: Rok Date: Wed, 13 Nov 2019 11:18:37 +0100 Subject: [PATCH 1/3] Adding from_pydata_sparse and to_pydata_sparse. --- python/pyarrow/serialization.py | 32 +++++++++++++++++++ python/pyarrow/tensor.pxi | 37 ++++++++++++++++++++++ python/pyarrow/tests/test_serialization.py | 21 ++++++++++++ python/pyarrow/tests/test_sparse_tensor.py | 29 +++++++++++++++++ 4 files changed, 119 insertions(+) diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py index ea6deab2269..3be93177d00 100644 --- a/python/pyarrow/serialization.py +++ b/python/pyarrow/serialization.py @@ -350,6 +350,37 @@ def _deserialize_scipy_sparse(data): pass +# ---------------------------------------------------------------------- +# Set up serialization for pydata/sparse tensors. + +def _register_pydata_sparse_handlers(serialization_context): + try: + import sparse + + def _serialize_pydata_sparse(obj): + if isinstance(obj, sparse.COO): + return 'coo', pa.SparseCOOTensor.from_pydata_sparse(obj) + else: + raise NotImplementedError( + "Serialization of {} is not supported.".format(sparse.COO)) + + def _deserialize_pydata_sparse(data): + if data[0] == 'coo': + data_array, coords = data[1].to_numpy() + return sparse.COO( + data=data_array[:, 0], + coords=coords.T, shape=data[1].shape) + + serialization_context.register_type( + sparse.COO, 'sparse.COO', + custom_serializer=_serialize_pydata_sparse, + custom_deserializer=_deserialize_pydata_sparse) + + except ImportError: + # no pydata/sparse + pass + + def register_default_serialization_handlers(serialization_context): # ---------------------------------------------------------------------- @@ -403,6 +434,7 @@ def register_default_serialization_handlers(serialization_context): _register_collections_serialization_handlers(serialization_context) _register_custom_pandas_handlers(serialization_context) _register_scipy_handlers(serialization_context) + _register_pydata_sparse_handlers(serialization_context) def default_serialization_context(): diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi index 8661c1edd13..698e5743f96 100644 --- a/python/pyarrow/tensor.pxi +++ b/python/pyarrow/tensor.pxi @@ -207,6 +207,28 @@ shape: {0.shape}""".format(self) &csparse_tensor)) return pyarrow_wrap_sparse_coo_tensor(csparse_tensor) + @staticmethod + def from_pydata_sparse(obj, dim_names=None): + """ + Convert pydata/sparse.COO to arrow::SparseCOOTensor + """ + cdef shared_ptr[CSparseCOOTensor] csparse_tensor + cdef vector[int64_t] c_shape + cdef vector[c_string] c_dim_names + + for x in obj.shape: + c_shape.push_back(x) + if dim_names is not None: + for x in dim_names: + c_dim_names.push_back(tobytes(x)) + + coords = np.require(obj.coords.T.view(), dtype='i8', requirements='F') + + check_status(NdarraysToSparseCOOTensor(c_default_memory_pool(), + obj.data.view(), coords, c_shape, c_dim_names, + &csparse_tensor)) + return pyarrow_wrap_sparse_coo_tensor(csparse_tensor) + @staticmethod def from_tensor(obj): """ @@ -247,6 +269,21 @@ shape: {0.shape}""".format(self) shape=self.shape) return result + def to_pydata_sparse(self): + """ + Convert arrow::SparseCOOTensor to pydata/sparse.COO + """ + from sparse import COO + cdef PyObject* out_data + cdef PyObject* out_coords + + check_status(SparseCOOTensorToNdarray(self.sp_sparse_tensor, self, + &out_data, &out_coords)) + data = PyObject_to_object(out_data) + coords = PyObject_to_object(out_coords) + result = COO(data=data, coords=coords.T, shape=self.shape) + return result + def to_tensor(self): """ Convert arrow::SparseCOOTensor to arrow::Tensor diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py index 53fccff5763..4077739d02e 100644 --- a/python/pyarrow/tests/test_serialization.py +++ b/python/pyarrow/tests/test_serialization.py @@ -46,6 +46,11 @@ coo_matrix = None csr_matrix = None +try: + import sparse +except ImportError: + sparse = None + def assert_equal(obj1, obj2): if torch is not None and torch.is_tensor(obj1) and torch.is_tensor(obj2): @@ -597,6 +602,22 @@ def test_scipy_sparse_coo_tensor_serialization(): assert np.array_equal(sparse_array.toarray(), result.toarray()) +@pytest.mark.skipif(not sparse, reason="requires pydata/sparse") +def test_pydata_sparse_sparse_coo_tensor_serialization(): + data = np.array([1, 2, 3, 4, 5, 6]) + coords = np.array([ + [0, 0, 2, 3, 1, 3], + [0, 2, 0, 4, 5, 5], + ]) + shape = (4, 6) + + sparse_array = sparse.COO(data=data, coords=coords, shape=shape) + serialized = pa.serialize(sparse_array) + result = serialized.deserialize() + + assert np.array_equal(sparse_array.todense(), result.todense()) + + @pytest.mark.parametrize('tensor_type', tensor_types) @pytest.mark.parametrize('index_type', index_types) def test_sparse_csr_matrix_serialization(index_type, tensor_type): diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index 31c40bd94ed..3b494a3f490 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -27,6 +27,11 @@ coo_matrix = None csr_matrix = None +try: + import sparse +except ImportError: + sparse = None + tensor_type_pairs = [ ('i1', pa.int8()), @@ -325,3 +330,27 @@ def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type): else: dense_array = sparse_array.toarray() assert np.array_equal(dense_array, sparse_tensor.to_tensor().to_numpy()) + + +@pytest.mark.skipif(not sparse, reason="requires pydata/sparse") +@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs) +def test_pydata_sparse_sparse_coo_tensor_roundtrip(dtype_str, arrow_type): + dtype = np.dtype(dtype_str) + data = np.array([[1, 2, 3, 4, 5, 6]]).T.astype(dtype) + coords = np.array([ + [0, 0, 2, 3, 1, 3], + [0, 2, 0, 4, 5, 5], + ]) + shape = (4, 6) + dim_names = ("x", "y") + + sparse_array = sparse.COO(data=data, coords=coords, shape=shape) + sparse_tensor = pa.SparseCOOTensor.from_pydata_sparse(sparse_array, + dim_names=dim_names) + out_sparse_array = sparse_tensor.to_pydata_sparse() + + assert sparse_tensor.type == arrow_type + assert sparse_tensor.dim_names == dim_names + assert sparse_array.dtype == out_sparse_array.dtype + assert np.array_equal(sparse_array.data, out_sparse_array.data) + assert np.array_equal(sparse_array.coords, out_sparse_array.coords) From 4efdd355b1fbaf9ca7c6f7d8da3e2484429afd14 Mon Sep 17 00:00:00 2001 From: Rok Date: Wed, 13 Nov 2019 12:07:56 +0100 Subject: [PATCH 2/3] Adding type check to from_pydata_sparse. --- python/pyarrow/tensor.pxi | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi index 698e5743f96..7dbf179689c 100644 --- a/python/pyarrow/tensor.pxi +++ b/python/pyarrow/tensor.pxi @@ -212,6 +212,11 @@ shape: {0.shape}""".format(self) """ Convert pydata/sparse.COO to arrow::SparseCOOTensor """ + import sparse + if not isinstance(obj, sparse.COO): + raise TypeError( + "Expected sparse.COO, got {}".format(type(obj))) + cdef shared_ptr[CSparseCOOTensor] csparse_tensor cdef vector[int64_t] c_shape cdef vector[c_string] c_dim_names From d99d52d848a60b79cbd4903bb1087d9af7b293f9 Mon Sep 17 00:00:00 2001 From: Rok Date: Wed, 13 Nov 2019 12:51:13 +0100 Subject: [PATCH 3/3] Implementing review feedback. --- python/pyarrow/tensor.pxi | 8 ++++---- python/pyarrow/tests/test_sparse_tensor.py | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi index 7dbf179689c..37a87cdc823 100644 --- a/python/pyarrow/tensor.pxi +++ b/python/pyarrow/tensor.pxi @@ -203,7 +203,7 @@ shape: {0.shape}""".format(self) coords = np.require(coords, dtype='i8', requirements='F') check_status(NdarraysToSparseCOOTensor(c_default_memory_pool(), - obj.data.view(), coords, c_shape, c_dim_names, + obj.data, coords, c_shape, c_dim_names, &csparse_tensor)) return pyarrow_wrap_sparse_coo_tensor(csparse_tensor) @@ -227,10 +227,10 @@ shape: {0.shape}""".format(self) for x in dim_names: c_dim_names.push_back(tobytes(x)) - coords = np.require(obj.coords.T.view(), dtype='i8', requirements='F') + coords = np.require(obj.coords.T, dtype='i8', requirements='F') check_status(NdarraysToSparseCOOTensor(c_default_memory_pool(), - obj.data.view(), coords, c_shape, c_dim_names, + obj.data, coords, c_shape, c_dim_names, &csparse_tensor)) return pyarrow_wrap_sparse_coo_tensor(csparse_tensor) @@ -286,7 +286,7 @@ shape: {0.shape}""".format(self) &out_data, &out_coords)) data = PyObject_to_object(out_data) coords = PyObject_to_object(out_coords) - result = COO(data=data, coords=coords.T, shape=self.shape) + result = COO(data=data[:, 0], coords=coords.T, shape=self.shape) return result def to_tensor(self): diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index 3b494a3f490..24b45928dfb 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -336,7 +336,7 @@ def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type): @pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs) def test_pydata_sparse_sparse_coo_tensor_roundtrip(dtype_str, arrow_type): dtype = np.dtype(dtype_str) - data = np.array([[1, 2, 3, 4, 5, 6]]).T.astype(dtype) + data = np.array([1, 2, 3, 4, 5, 6]).astype(dtype) coords = np.array([ [0, 0, 2, 3, 1, 3], [0, 2, 0, 4, 5, 5], @@ -354,3 +354,5 @@ def test_pydata_sparse_sparse_coo_tensor_roundtrip(dtype_str, arrow_type): assert sparse_array.dtype == out_sparse_array.dtype assert np.array_equal(sparse_array.data, out_sparse_array.data) assert np.array_equal(sparse_array.coords, out_sparse_array.coords) + assert np.array_equal(sparse_array.todense(), + sparse_tensor.to_tensor().to_numpy())