diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc
index 6c1f3d7c967..792f47d0add 100644
--- a/cpp/src/arrow/python/numpy_convert.cc
+++ b/cpp/src/arrow/python/numpy_convert.cc
@@ -324,7 +324,7 @@ Status SparseCOOTensorToNdarray(const std::shared_ptr<SparseCOOTensor>& sparse_t
   // Wrap tensor data
   OwnedRef result_data;
   RETURN_NOT_OK(SparseTensorDataToNdarray(
-      *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref()));
+      *sparse_tensor, {sparse_tensor->non_zero_length(), 1}, base, result_data.ref()));
 
   // Wrap indices
   PyObject* result_coords;
@@ -344,7 +344,7 @@ Status SparseCSRMatrixToNdarray(const std::shared_ptr<SparseCSRMatrix>& sparse_t
   // Wrap tensor data
   OwnedRef result_data;
   RETURN_NOT_OK(SparseTensorDataToNdarray(
-      *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref()));
+      *sparse_tensor, {sparse_tensor->non_zero_length(), 1}, base, result_data.ref()));
 
   // Wrap indices
   OwnedRef result_indptr;
diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc
index b6fe2f3a1e5..3fd7008cb7d 100644
--- a/cpp/src/arrow/sparse_tensor.cc
+++ b/cpp/src/arrow/sparse_tensor.cc
@@ -364,6 +364,142 @@ void MakeSparseTensorFromTensor(const Tensor& tensor,
   }
 }
 
+// Scatter the non-zero values of `sparse_tensor` into a freshly allocated,
+// zero-filled dense buffer and wrap it as a Tensor.
+// TYPE is the value type class, IndexValueType the index type class.
+template <typename TYPE, typename IndexValueType>
+Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
+                                  std::shared_ptr<Tensor>* out) {
+  using NumericTensorType = NumericTensor<TYPE>;
+  using value_type = typename NumericTensorType::value_type;
+
+  std::shared_ptr<Buffer> values_buffer;
+  RETURN_NOT_OK(
+      AllocateBuffer(pool, sizeof(value_type) * sparse_tensor->size(), &values_buffer));
+  auto values = reinterpret_cast<value_type*>(values_buffer->mutable_data());
+
+  // Cells without an explicit sparse entry are implicit zeros.
+  std::fill_n(values, sparse_tensor->size(), static_cast<value_type>(0));
+
+  const auto raw_data = reinterpret_cast<const value_type*>(sparse_tensor->raw_data());
+  const std::vector<int64_t>& shape = sparse_tensor->shape();
+
+  switch (sparse_tensor->format_id()) {
+    case SparseTensorFormat::COO: {
+      const auto& sparse_index =
+          internal::checked_cast<const SparseCOOIndex&>(*sparse_tensor->sparse_index());
+      const auto& coords = sparse_index.indices();
+      const int64_t ndim = sparse_tensor->ndim();
+
+      // Row-major element strides: strides[k] is the product of shape[k + 1 ..].
+      std::vector<int64_t> strides(ndim, 1);
+      for (int64_t i = ndim - 1; i > 0; --i) {
+        strides[i - 1] = strides[i] * shape[i];
+      }
+
+      const int64_t non_zero_length = sparse_tensor->non_zero_length();
+      for (int64_t i = 0; i < non_zero_length; ++i) {
+        int64_t offset = 0;
+        for (int64_t j = 0; j < ndim; ++j) {
+          const auto coord = coords->Value<IndexValueType>({i, j});
+          offset += static_cast<int64_t>(coord) * strides[j];
+        }
+        values[offset] = raw_data[i];
+      }
+      break;
+    }
+
+    case SparseTensorFormat::CSR: {
+      const auto& sparse_index =
+          internal::checked_cast<const SparseCSRIndex&>(*sparse_tensor->sparse_index());
+      const auto& indptr = sparse_index.indptr();
+      const auto& indices = sparse_index.indices();
+      const int64_t num_columns = shape[1];
+
+      // Entries indptr[i] .. indptr[i + 1] - 1 of `indices`/`raw_data` belong to row i.
+      for (int64_t i = 0; i < indptr->size() - 1; ++i) {
+        const int64_t start = indptr->Value<IndexValueType>({i});
+        const int64_t stop = indptr->Value<IndexValueType>({i + 1});
+        for (int64_t j = start; j < stop; ++j) {
+          const int64_t offset = indices->Value<IndexValueType>({j}) + i * num_columns;
+          values[offset] = raw_data[j];
+        }
+      }
+      break;
+    }
+
+    // LCOV_EXCL_START: ignore program failure
+    default:
+      return Status::NotImplemented("Unsupported SparseIndex format type");
+    // LCOV_EXCL_STOP
+  }
+
+  // Empty strides select Tensor's default layout; dim_names are forwarded so the
+  // dense tensor keeps the sparse tensor's dimension labels.
+  *out = std::make_shared<Tensor>(sparse_tensor->type(), values_buffer, shape,
+                                  std::vector<int64_t>{}, sparse_tensor->dim_names());
+  return Status::OK();
+}
+
+#define MAKE_TENSOR_FROM_SPARSE_TENSOR_INDEX_TYPE(IndexValueType) \
+  case IndexValueType##Type::type_id: \
+    return MakeTensorFromSparseTensor<TYPE, IndexValueType##Type>(pool, sparse_tensor, \
+                                                                  out);
+
+// Look up the index value type of the sparse index and dispatch on it.
+template <typename TYPE>
+Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
+                                  std::shared_ptr<Tensor>* out) {
+  std::shared_ptr<DataType> type;
+  switch (sparse_tensor->format_id()) {
+    case SparseTensorFormat::COO: {
+      const auto& sparse_index =
+          internal::checked_cast<const SparseCOOIndex&>(*sparse_tensor->sparse_index());
+      type = sparse_index.indices()->type();
+      break;
+    }
+    case SparseTensorFormat::CSR: {
+      const auto& sparse_index =
+          internal::checked_cast<const SparseCSRIndex&>(*sparse_tensor->sparse_index());
+      type = sparse_index.indices()->type();
+      break;
+    }
+    // LCOV_EXCL_START: ignore program failure
+    default:
+      ARROW_LOG(FATAL) << "Unsupported SparseIndex format";
+      return Status::NotImplemented("Unsupported SparseIndex format");
+    // LCOV_EXCL_STOP
+  }
+
+  switch (type->id()) {
+    ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(MAKE_TENSOR_FROM_SPARSE_TENSOR_INDEX_TYPE);
+    // LCOV_EXCL_START: ignore program failure
+    default:
+      ARROW_LOG(FATAL) << "Unsupported SparseIndex value type";
+      return Status::NotImplemented("Unsupported SparseIndex value type");
+    // LCOV_EXCL_STOP
+  }
+}
+#undef MAKE_TENSOR_FROM_SPARSE_TENSOR_INDEX_TYPE
+
+#define MAKE_TENSOR_FROM_SPARSE_TENSOR_VALUE_TYPE(TYPE) \
+  case TYPE##Type::type_id: \
+    return MakeTensorFromSparseTensor<TYPE##Type>(pool, sparse_tensor, out);
+
+// Entry point: dispatch on the value type of the sparse tensor.
+Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
+                                  std::shared_ptr<Tensor>* out) {
+  switch (sparse_tensor->type()->id()) {
+    ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(MAKE_TENSOR_FROM_SPARSE_TENSOR_VALUE_TYPE);
+    // LCOV_EXCL_START: ignore program failure
+    default:
+      ARROW_LOG(FATAL) << "Unsupported SparseTensor value type";
+      return Status::NotImplemented("Unsupported SparseTensor data value type");
+    // LCOV_EXCL_STOP
+  }
+}
+#undef MAKE_TENSOR_FROM_SPARSE_TENSOR_VALUE_TYPE
+
 }  // namespace internal
 
 // ----------------------------------------------------------------------
@@ -429,4 +565,8 @@ bool SparseTensor::Equals(const SparseTensor& other) const {
   return SparseTensorEquals(*this, other);
 }
 
+Status SparseTensor::ToTensor(MemoryPool* pool, std::shared_ptr<Tensor>* out) const {
+  return internal::MakeTensorFromSparseTensor(pool, this, out);
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h
index 47df0115028..d24a680df5b 100644
--- a/cpp/src/arrow/sparse_tensor.h
+++ b/cpp/src/arrow/sparse_tensor.h
@@ -204,6 +204,15 @@ class ARROW_EXPORT SparseTensor {
   /// \brief Return whether sparse tensors are equal
   bool Equals(const SparseTensor& other) const;
 
+  /// \brief Return dense representation of sparse tensor as tensor
+  Status ToTensor(std::shared_ptr<Tensor>* out) const {
+    return ToTensor(default_memory_pool(), out);
+  }
+
+  /// \brief Return dense representation of sparse tensor as tensor
+  /// using specified memory pool
+  Status ToTensor(MemoryPool* pool, std::shared_ptr<Tensor>* out) const;
+
  protected:
   // Constructor with all attributes
   SparseTensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc
index e37f3e46ec9..5fcae47fe40 100644
--- a/cpp/src/arrow/sparse_tensor_test.cc
+++ b/cpp/src/arrow/sparse_tensor_test.cc
@@ -202,6 +202,21 @@ TEST_F(TestSparseCOOTensor, TensorEquality) {
   ASSERT_FALSE(st1.Equals(st2));
 }
 
+TEST_F(TestSparseCOOTensor, TestToTensor) {
+  std::vector<int64_t> values = {1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+                                 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4};
+  std::vector<int64_t> shape({4, 3, 2, 1});
+  std::shared_ptr<Buffer> buffer = Buffer::Wrap(values);
+  Tensor tensor(int64(), buffer, shape, {}, this->dim_names_);
+  SparseTensorImpl<SparseCOOIndex> sparse_tensor(tensor);
+
+  ASSERT_EQ(5, sparse_tensor.non_zero_length());
+  ASSERT_TRUE(sparse_tensor.is_mutable());
+  std::shared_ptr<Tensor> dense_tensor;
+  ASSERT_OK(sparse_tensor.ToTensor(&dense_tensor));
+  ASSERT_TRUE(tensor.Equals(*dense_tensor));
+}
+
 template <typename IndexValueType>
 class TestSparseCOOTensorForIndexValueType
     : public TestSparseCOOTensorBase<IndexValueType> {
@@ -469,4 +484,18 @@ TEST_F(TestSparseCSRMatrix, TensorEquality) {
   ASSERT_FALSE(st1.Equals(st2));
 }
 
+TEST_F(TestSparseCSRMatrix, TestToTensor) {
+  std::vector<int64_t> values = {1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 1,
+                                 0, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1};
+  std::vector<int64_t> shape({6, 4});
+  std::shared_ptr<Buffer> buffer = Buffer::Wrap(values);
+  Tensor tensor(int64(), buffer, shape, {}, this->dim_names_);
+  SparseTensorImpl<SparseCSRIndex> sparse_tensor(tensor);
+
+  ASSERT_EQ(7, sparse_tensor.non_zero_length());
+  ASSERT_TRUE(sparse_tensor.is_mutable());
+  std::shared_ptr<Tensor> dense_tensor;
+  ASSERT_OK(sparse_tensor.ToTensor(&dense_tensor));
+  ASSERT_TRUE(tensor.Equals(*dense_tensor));
+}
 }  // namespace arrow
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index fd130f83474..dc29c10aed9 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -663,6 +663,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef cppclass CSparseCOOTensor" arrow::SparseCOOTensor":
         shared_ptr[CDataType] type()
         shared_ptr[CBuffer] data()
+        CStatus ToTensor(shared_ptr[CTensor]*) const
 
         vector[int64_t]& shape()
         int64_t size()
@@ -679,6 +680,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef cppclass CSparseCSRMatrix" arrow::SparseCSRMatrix":
         shared_ptr[CDataType] type()
         shared_ptr[CBuffer] data()
+        CStatus ToTensor(shared_ptr[CTensor]*) const
 
         vector[int64_t]& shape()
         int64_t size()
diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi
index fb2c3c0f852..4b93676ee07 100644
--- a/python/pyarrow/tensor.pxi
+++ b/python/pyarrow/tensor.pxi
@@ -175,7 +175,8 @@ shape: {0.shape}""".format(self)
                              "SparseCOOTensor indices")
 
         check_status(NdarraysToSparseCOOTensor(c_default_memory_pool(),
-                     data, coords, c_shape, c_dim_names, &csparse_tensor))
+                                               data, coords, c_shape,
+                                               c_dim_names, &csparse_tensor))
         return pyarrow_wrap_sparse_coo_tensor(csparse_tensor)
 
     @staticmethod
@@ -202,6 +203,16 @@ shape: {0.shape}""".format(self)
                                               &out_data, &out_coords))
         return PyObject_to_object(out_data), PyObject_to_object(out_coords)
 
+    def to_tensor(self):
+        """
+        Convert arrow::SparseCOOTensor to arrow::Tensor
+        """
+
+        cdef shared_ptr[CTensor] ctensor
+        check_status(self.stp.ToTensor(&ctensor))
+
+        return pyarrow_wrap_tensor(ctensor)
+
     def equals(self, SparseCOOTensor other):
         """
         Return true if sparse tensors contains exactly equal data
@@ -296,8 +307,8 @@ shape: {0.shape}""".format(self)
                              "SparseCSRMatrix indices")
 
         check_status(NdarraysToSparseCSRMatrix(c_default_memory_pool(),
-                     data, indptr, indices, c_shape, c_dim_names,
-                     &csparse_tensor))
+                                               data, indptr, indices, c_shape,
+                                               c_dim_names, &csparse_tensor))
         return pyarrow_wrap_sparse_csr_matrix(csparse_tensor)
 
     @staticmethod
@@ -322,10 +333,21 @@ shape: {0.shape}""".format(self)
         cdef PyObject* out_indices
 
         check_status(SparseCSRMatrixToNdarray(self.sp_sparse_tensor, self,
-                     &out_data, &out_indptr, &out_indices))
+                                              &out_data, &out_indptr,
+                                              &out_indices))
         return (PyObject_to_object(out_data), PyObject_to_object(out_indptr),
                 PyObject_to_object(out_indices))
 
+    def to_tensor(self):
+        """
+        Convert arrow::SparseCSRMatrix to arrow::Tensor
+        """
+
+        cdef shared_ptr[CTensor] ctensor
+        check_status(self.stp.ToTensor(&ctensor))
+
+        return pyarrow_wrap_tensor(ctensor)
+
     def equals(self, SparseCSRMatrix other):
         """
         Return true if sparse tensors contains exactly equal data
diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py
index 225bbbf56dc..aaf0468f982 100644
--- a/python/pyarrow/tests/test_sparse_tensor.py
+++ b/python/pyarrow/tests/test_sparse_tensor.py
@@ -219,3 +219,24 @@ def test_sparse_tensor_csr_numpy_roundtrip(dtype_str, arrow_type):
     assert np.array_equal(indptr, result_indptr)
     assert np.array_equal(indices, result_indices)
     assert sparse_tensor.dim_names == dim_names
+
+
+@pytest.mark.parametrize('sparse_tensor_type', [
+    pa.SparseCSRMatrix,
+    pa.SparseCOOTensor,
+])
+@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
+def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type):
+    dtype = np.dtype(dtype_str)
+    array = np.array([[4, 0, 9, 0],
+                      [0, 7, 0, 0],
+                      [0, 0, 0, 0],
+                      [0, 0, 0, 5]]).astype(dtype)
+
+    sparse_tensor = sparse_tensor_type.from_dense_numpy(array)
+    tensor = sparse_tensor.to_tensor()
+    result_array = tensor.to_numpy()
+
+    assert sparse_tensor.type == arrow_type
+    assert tensor.type == arrow_type
+    assert np.array_equal(array, result_array)