diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index f7431f80f5f..d2322009ea8 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -1196,6 +1196,13 @@ inline bool SparseTensorEqualsImplDispatch(const SparseTensorImpl<SparseIndexType>& left,
       return SparseTensorEqualsImpl<SparseIndexType, SparseCSCIndex>::Compare(left,
                                                                               right_csc);
     }
+    case SparseTensorFormat::CSF: {
+      const auto& right_csf =
+          checked_cast<const SparseTensorImpl<SparseCSFIndex>&>(right);
+      return SparseTensorEqualsImpl<SparseIndexType, SparseCSFIndex>::Compare(left,
+                                                                              right_csf);
+    }
+
     default:
       return false;
   }
@@ -1232,6 +1239,11 @@ bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right) {
       return SparseTensorEqualsImplDispatch(left_csc, right);
     }
 
+    case SparseTensorFormat::CSF: {
+      const auto& left_csf = checked_cast<const SparseTensorImpl<SparseCSFIndex>&>(left);
+      return SparseTensorEqualsImplDispatch(left_csf, right);
+    }
+
     default:
       return false;
   }
diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc
index 09a322b1060..06c85648591 100644
--- a/cpp/src/arrow/python/serialize.cc
+++ b/cpp/src/arrow/python/serialize.cc
@@ -654,6 +654,7 @@ Status CountSparseTensors(
   OwnedRef num_sparse_tensors(PyDict_New());
   size_t num_coo = 0;
   size_t num_csr = 0;
+  size_t num_csf = 0;
 
   for (const auto& sparse_tensor : sparse_tensors) {
     switch (sparse_tensor->format_id()) {
@@ -663,6 +664,9 @@ Status CountSparseTensors(
       case SparseTensorFormat::CSR:
         ++num_csr;
         break;
+      case SparseTensorFormat::CSF:
+        ++num_csf;
+        break;
       case SparseTensorFormat::CSC:
         // TODO(mrkn): support csc
         break;
@@ -671,6 +675,7 @@ Status CountSparseTensors(
 
   PyDict_SetItemString(num_sparse_tensors.obj(), "coo", PyLong_FromSize_t(num_coo));
   PyDict_SetItemString(num_sparse_tensors.obj(), "csr", PyLong_FromSize_t(num_csr));
+  PyDict_SetItemString(num_sparse_tensors.obj(), "csf", PyLong_FromSize_t(num_csf));
   RETURN_IF_PYERROR();
 
   *out = num_sparse_tensors.detach();
diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc
index d42bdf4ca61..549223c0798 100644
--- a/cpp/src/arrow/sparse_tensor.cc
+++ b/cpp/src/arrow/sparse_tensor.cc
@@ -23,9 +23,11 @@
 #include <memory>
 #include <numeric>
 
+#include "arrow/buffer_builder.h"
 #include "arrow/compare.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/logging.h"
+#include "arrow/util/sort.h"
 #include "arrow/visitor_inline.h"
 
 namespace arrow {
@@ -55,6 +57,38 @@ class SparseTensorConverter {
 
   Status Convert() { return Status::Invalid("Unsupported sparse index"); }
 };
 
+// ----------------------------------------------------------------------
+// IncrementIndex for SparseCOOIndex and SparseCSFIndex
+
+inline void IncrementIndex(std::vector<int64_t>& coord,
+                           const std::vector<int64_t>& shape) {
+  const int64_t ndim = shape.size();
+  ++coord[ndim - 1];
+  if (coord[ndim - 1] == shape[ndim - 1]) {
+    int64_t d = ndim - 1;
+    while (d > 0 && coord[d] == shape[d]) {
+      coord[d] = 0;
+      ++coord[d - 1];
+      --d;
+    }
+  }
+}
+
+inline void IncrementIndex(std::vector<int64_t>& coord, const std::vector<int64_t>& shape,
+                           const std::vector<int64_t>& axis_order) {
+  const int64_t ndim = shape.size();
+  const int64_t last_axis = axis_order[ndim - 1];
+  ++coord[last_axis];
+  if (coord[last_axis] == shape[last_axis]) {
+    int64_t d = ndim - 1;
+    while (d > 0 && coord[axis_order[d]] == shape[axis_order[d]]) {
+      coord[axis_order[d]] = 0;
+      ++coord[axis_order[d - 1]];
+      --d;
+    }
+  }
+}
+
 // ----------------------------------------------------------------------
 // SparseTensorConverter for SparseCOOIndex
 
@@ -128,17 +162,7 @@ class SparseTensorConverter<TensorType, SparseCOOIndex>
           *indices++ = static_cast<c_index_value_type>(coord[i]);
         }
       }
-
-      // increment index
-      ++coord[ndim - 1];
-      if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) {
-        int64_t d = ndim - 1;
-        while (d > 0 && coord[d] == shape[d]) {
-          coord[d] = 0;
-          ++coord[d - 1];
-          --d;
-        }
-      }
+
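+      // Advance coord to the next coordinate in row-major order (the inline
+      // odometer-style increment above is now shared with the CSF converter).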
+      IncrementIndex(coord, shape);
     }
   }
@@ -419,6 +443,144 @@ class SparseTensorConverter<TensorType, SparseCSCIndex>
 
   inline Status CheckMaximumValue(const uint64_t) const { return Status::OK(); }
 };
 
+// ----------------------------------------------------------------------
+// SparseTensorConverter for SparseCSFIndex
+
+template <typename TensorType>
+class SparseTensorConverter<TensorType, SparseCSFIndex>
+    : private SparseTensorConverterBase<TensorType> {
+ public:
+  using BaseClass = SparseTensorConverterBase<TensorType>;
+  using typename BaseClass::NumericTensorType;
+  using typename BaseClass::value_type;
+
+  SparseTensorConverter(const NumericTensorType& tensor,
+                        const std::shared_ptr<DataType>& index_value_type,
+                        MemoryPool* pool)
+      : BaseClass(tensor, index_value_type, pool) {}
+
+  template <typename IndexValueType>
+  Status Convert() {
+    using c_index_value_type = typename IndexValueType::c_type;
+    RETURN_NOT_OK(CheckMaximumValue(std::numeric_limits<c_index_value_type>::max()));
+
+    const int64_t ndim = tensor_.ndim();
+    // Axis order as ascending order of dimension size is a good heuristic but is not
+    // necessarily optimal.
+    std::vector<int64_t> axis_order = internal::ArgSort(tensor_.shape());
+    int64_t nonzero_count = -1;
+    RETURN_NOT_OK(tensor_.CountNonZero(&nonzero_count));
+
+    std::shared_ptr<Buffer> values_buffer;
+    RETURN_NOT_OK(
+        AllocateBuffer(pool_, sizeof(value_type) * nonzero_count, &values_buffer));
+    value_type* values = reinterpret_cast<value_type*>(values_buffer->mutable_data());
+
+    std::vector<int64_t> counts(ndim, 0);
+    std::vector<int64_t> coord(ndim, 0);
+    std::vector<int64_t> previous_coord(ndim, -1);
+    std::vector<TypedBufferBuilder<c_index_value_type>> indptr_buffer_builders(ndim - 1);
+    std::vector<TypedBufferBuilder<c_index_value_type>> indices_buffer_builders(ndim);
+
+    if (ndim <= 1) {
+      return Status::NotImplemented("TODO for ndim <= 1");
+    } else {
+      const std::vector<int64_t>& shape = tensor_.shape();
+      for (int64_t n = tensor_.size(); n > 0; n--) {
+        const value_type x = tensor_.Value(coord);
+
+        if (x != 0) {
+          bool tree_split = false;
+          *values++ = x;
+
+          for (int64_t i = 0; i < ndim; ++i) {
+            int64_t dimension = axis_order[i];
+
+            tree_split = tree_split || (coord[dimension] != previous_coord[dimension]);
+            if (tree_split) {
+              if (i < ndim - 1) {
+                RETURN_NOT_OK(indptr_buffer_builders[i].Append(
+                    static_cast<c_index_value_type>(counts[i + 1])));
+              }
+              RETURN_NOT_OK(indices_buffer_builders[i].Append(
+                  static_cast<c_index_value_type>(coord[dimension])));
+              ++counts[i];
+            }
+          }
+          previous_coord = coord;
+        }
+        IncrementIndex(coord, shape, axis_order);
+      }
+    }
+
+    for (int64_t column = 0; column < ndim - 1; ++column) {
+      RETURN_NOT_OK(indptr_buffer_builders[column].Append(
+          static_cast<c_index_value_type>(counts[column + 1])));
+    }
+
+    // make results
+    data = values_buffer;
+
+    std::vector<std::shared_ptr<Buffer>> indptr_buffers(ndim - 1);
+    std::vector<std::shared_ptr<Buffer>> indices_buffers(ndim);
+    std::vector<int64_t> indptr_shapes(counts.begin(), counts.end() - 1);
+    std::vector<int64_t> indices_shapes = counts;
+
+    for (int64_t column = 0; column < ndim; ++column) {
+      RETURN_NOT_OK(
+          indices_buffer_builders[column].Finish(&indices_buffers[column], true));
+    }
+    for (int64_t column = 0; column < ndim - 1; ++column) {
+      RETURN_NOT_OK(indptr_buffer_builders[column].Finish(&indptr_buffers[column], true));
+    }
+
+    ARROW_ASSIGN_OR_RAISE(
+        sparse_index, SparseCSFIndex::Make(index_value_type_, indices_shapes, axis_order,
+                                           indptr_buffers, indices_buffers));
+    return Status::OK();
+  }
+
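+  // The non-template Convert() below dispatches at runtime to the
+  // Convert<IndexValueType>() instantiation matching index_value_type_; the
+  // switch cases are generated by the macro for every integer type.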
+#define CALL_TYPE_SPECIFIC_CONVERT(TYPE_CLASS) \
+  case TYPE_CLASS##Type::type_id:              \
+    return Convert<TYPE_CLASS##Type>();
+
+  Status Convert() {
+    switch (index_value_type_->id()) {
+      ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_TYPE_SPECIFIC_CONVERT);
+      // LCOV_EXCL_START: The following invalid type causes program failure.
+      default:
+        return Status::TypeError("Unsupported SparseTensor index value type");
+        // LCOV_EXCL_STOP
+    }
+  }
+
+#undef CALL_TYPE_SPECIFIC_CONVERT
+
+  std::shared_ptr<SparseCSFIndex> sparse_index;
+  std::shared_ptr<Buffer> data;
+
+ private:
+  using BaseClass::index_value_type_;
+  using BaseClass::pool_;
+  using BaseClass::tensor_;
+
+  template <typename c_value_type>
+  inline Status CheckMaximumValue(const c_value_type type_max) const {
+    auto max_dimension =
+        *std::max_element(tensor_.shape().begin(), tensor_.shape().end());
+    if (static_cast<int64_t>(type_max) < max_dimension) {
+      // LCOV_EXCL_START: The following invalid type causes program failure.
+      return Status::Invalid("The bit width of the index value type is too small");
+      // LCOV_EXCL_STOP
+    }
+    return Status::OK();
+  }
+
+  inline Status CheckMaximumValue(const int64_t) const { return Status::OK(); }
+
+  inline Status CheckMaximumValue(const uint64_t) const { return Status::OK(); }
+};
+
 // ----------------------------------------------------------------------
 // Instantiate templates
 
@@ -438,6 +600,7 @@ class SparseTensorConverter
 INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCOOIndex);
 INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSRIndex);
 INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSCIndex);
+INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSFIndex);
 
 }  // namespace
 
@@ -500,6 +663,10 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor,
     case SparseTensorFormat::CSC:
       return MakeSparseTensorFromTensor<SparseCSCIndex>(tensor, index_value_type, pool,
                                                         out_sparse_index, out_data);
+    case SparseTensorFormat::CSF:
+      return MakeSparseTensorFromTensor<SparseCSFIndex>(tensor, index_value_type, pool,
+                                                        out_sparse_index, out_data);
+
     // LCOV_EXCL_START: ignore program failure
     default:
       return Status::Invalid("Invalid sparse tensor format");
@@ -507,6 +674,35 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor,
   }
 }
 
+namespace {
+
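+// Expands one level of a CSF prefix tree: for each entry i in
+// [first_ptr, last_ptr) of this dimension's indices tensor, accumulate the
+// dense offset and either recurse into the child range delimited by indptr
+// or, at the last dimension, store the value at the computed offset.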
+template <typename TYPE, typename IndexValueType>
+void ExpandSparseCSFTensorValues(int64_t dimension, int64_t dense_offset,
+                                 int64_t first_ptr, int64_t last_ptr,
+                                 const SparseCSFIndex& sparse_index, const TYPE* raw_data,
+                                 const std::vector<int64_t>& strides,
+                                 const std::vector<int64_t>& axis_order, TYPE* out) {
+  int64_t ndim = axis_order.size();
+
+  for (int64_t i = first_ptr; i < last_ptr; ++i) {
+    int64_t tmp_dense_offset =
+        dense_offset + sparse_index.indices()[dimension]->Value<IndexValueType>({i}) *
+                           strides[axis_order[dimension]];
+
+    if (dimension < ndim - 1) {
+      ExpandSparseCSFTensorValues<TYPE, IndexValueType>(
+          dimension + 1, tmp_dense_offset,
+          sparse_index.indptr()[dimension]->Value<IndexValueType>({i}),
+          sparse_index.indptr()[dimension]->Value<IndexValueType>({i + 1}), sparse_index,
+          raw_data, strides, axis_order, out);
+    } else {
+      out[tmp_dense_offset] = raw_data[i];
+    }
+  }
+}
+
+}  // namespace
+
 template <typename TYPE, typename IndexValueType>
 Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
                                   std::shared_ptr<Tensor>* out) {
@@ -521,18 +717,20 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
   std::fill_n(values, sparse_tensor->size(), static_cast<TYPE>(0));
 
+  std::vector<int64_t> strides(sparse_tensor->ndim(), 1);
+  for (int i = sparse_tensor->ndim() - 1; i > 0; --i) {
+    strides[i - 1] *= strides[i] * sparse_tensor->shape()[i];
+  }
+  std::vector<int64_t> empty_strides;
+
+  const auto raw_data = reinterpret_cast<const TYPE*>(sparse_tensor->raw_data());
+
   switch (sparse_tensor->format_id()) {
     case SparseTensorFormat::COO: {
       const auto& sparse_index =
           internal::checked_cast<const SparseCOOIndex&>(*sparse_tensor->sparse_index());
       const std::shared_ptr<Tensor> coords = sparse_index.indices();
-      const auto raw_data =
-          reinterpret_cast<const TYPE*>(sparse_tensor->raw_data());
-      std::vector<int64_t> strides(sparse_tensor->ndim(), 1);
-      for (int i = sparse_tensor->ndim() - 1; i > 0; --i) {
-        strides[i - 1] *= strides[i] * sparse_tensor->shape()[i];
-      }
+
       for (int64_t i = 0; i < sparse_tensor->non_zero_length(); ++i) {
         std::vector<int64_t> coord(sparse_tensor->ndim());
         int64_t offset = 0;
@@ -543,7 +741,8 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
         values[offset] = raw_data[i];
       }
       *out = std::make_shared<Tensor>(sparse_tensor->type(), values_buffer,
-                                      sparse_tensor->shape());
+                                      sparse_tensor->shape(), empty_strides,
+                                      sparse_tensor->dim_names());
       return Status::OK();
     }
 
@@ -552,8 +751,6 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
           internal::checked_cast<const SparseCSRIndex&>(*sparse_tensor->sparse_index());
       const std::shared_ptr<Tensor> indptr = sparse_index.indptr();
       const std::shared_ptr<Tensor> indices = sparse_index.indices();
-      const auto raw_data =
-          reinterpret_cast<const TYPE*>(sparse_tensor->raw_data());
 
       int64_t offset;
       for (int64_t i = 0; i < indptr->size() - 1; ++i) {
@@ -565,7 +762,8 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
         }
       }
       *out = std::make_shared<Tensor>(sparse_tensor->type(), values_buffer,
-                                      sparse_tensor->shape());
+                                      sparse_tensor->shape(), empty_strides,
+                                      sparse_tensor->dim_names());
       return Status::OK();
     }
 
@@ -574,8 +772,6 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
           internal::checked_cast<const SparseCSCIndex&>(*sparse_tensor->sparse_index());
       const std::shared_ptr<Tensor> indptr = sparse_index.indptr();
      const std::shared_ptr<Tensor> indices = sparse_index.indices();
-      const auto raw_data =
-          reinterpret_cast<const TYPE*>(sparse_tensor->raw_data());
 
       int64_t offset;
       for (int64_t j = 0; j < indptr->size() - 1; ++j) {
@@ -587,7 +783,21 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
         }
       }
       *out = std::make_shared<Tensor>(sparse_tensor->type(), values_buffer,
-                                      sparse_tensor->shape());
+                                      sparse_tensor->shape(), empty_strides,
+                                      sparse_tensor->dim_names());
+      return Status::OK();
+    }
+
+    case SparseTensorFormat::CSF: {
+      const auto& sparse_index =
+          internal::checked_cast<const SparseCSFIndex&>(*sparse_tensor->sparse_index());
+
+      ExpandSparseCSFTensorValues<TYPE, IndexValueType>(
+          0, 0, 0, sparse_index.indptr()[0]->size() - 1, sparse_index, raw_data, strides,
+          sparse_index.axis_order(), values);
+      *out = std::make_shared<Tensor>(sparse_tensor->type(), values_buffer,
+                                      sparse_tensor->shape(), empty_strides,
+                                      sparse_tensor->dim_names());
       return Status::OK();
     }
   }
@@ -625,6 +835,13 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
       const std::shared_ptr<Tensor> indices = sparse_index.indices();
       type = indices->type();
       break;
+    }
+    case SparseTensorFormat::CSF: {
+      const auto& sparse_index =
+          internal::checked_cast<const SparseCSFIndex&>(*sparse_tensor->sparse_index());
+      const std::vector<std::shared_ptr<Tensor>> indices = sparse_index.indices();
+      type = indices[0]->type();
+      break;
     }
     // LCOV_EXCL_START: ignore program failure
     default:
@@ -754,6 +971,86 @@ void CheckSparseCSXIndexValidity(const std::shared_ptr<DataType>& indptr_type,
 }
 
 }  // namespace internal
 
+// ----------------------------------------------------------------------
+// SparseCSFIndex
+
+namespace {
+
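+// Check the invariants of a SparseCSFIndex: integer indptr/indices types,
+// exactly one more indices tensor than indptr tensors, and one axis_order
+// entry per dimension.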
integer"); + } + if (num_indptrs + 1 != num_indices) { + return Status::Invalid( + "Length of indices must be equal to length of indptrs + 1 for SparseCSFIndex."); + } + if (axis_order_size != num_indices) { + return Status::Invalid( + "Length of indices must be equal to number of dimensions for SparseCSFIndex."); + } + return Status::OK(); +} + +} // namespace + +Result> SparseCSFIndex::Make( + const std::shared_ptr& indptr_type, + const std::shared_ptr& indices_type, + const std::vector& indices_shapes, const std::vector& axis_order, + const std::vector>& indptr_data, + const std::vector>& indices_data) { + int64_t ndim = axis_order.size(); + std::vector> indptr(ndim - 1); + std::vector> indices(ndim); + + for (int64_t i = 0; i < ndim - 1; ++i) + indptr[i] = std::make_shared(indptr_type, indptr_data[i], + std::vector({indices_shapes[i] + 1})); + for (int64_t i = 0; i < ndim; ++i) + indices[i] = std::make_shared(indices_type, indices_data[i], + std::vector({indices_shapes[i]})); + + RETURN_NOT_OK(CheckSparseCSFIndexValidity(indptr_type, indices_type, indptr.size(), + indices.size(), indptr.back()->shape(), + indices.back()->shape(), axis_order.size())); + + return std::make_shared(indptr, indices, axis_order); +} + +// Constructor with two index vectors +SparseCSFIndex::SparseCSFIndex(const std::vector>& indptr, + const std::vector>& indices, + const std::vector& axis_order) + : SparseIndexBase(indices.back()->size()), + indptr_(indptr), + indices_(indices), + axis_order_(axis_order) { + ARROW_CHECK_OK(CheckSparseCSFIndexValidity( + indptr_.front()->type(), indices_.front()->type(), indptr_.size(), indices_.size(), + indptr_.back()->shape(), indices_.back()->shape(), axis_order_.size())); +} + +std::string SparseCSFIndex::ToString() const { return std::string("SparseCSFIndex"); } + +bool SparseCSFIndex::Equals(const SparseCSFIndex& other) const { + for (int64_t i = 0; i < static_cast(indices().size()); ++i) { + if (!indices()[i]->Equals(*other.indices()[i])) return false; + } + for (int64_t i = 0; i < static_cast(indptr().size()); ++i) { + if (!indptr()[i]->Equals(*other.indptr()[i])) return false; + } + return axis_order() == other.axis_order(); +} + // ---------------------------------------------------------------------- // SparseTensor diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index f736f7b7576..33a53761e14 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -40,6 +40,8 @@ struct SparseTensorFormat { CSR, /// Compressed sparse column (CSC) format. CSC, + /// Compressed sparse fiber (CSF) format. + CSF }; }; @@ -329,6 +331,68 @@ class ARROW_EXPORT SparseCSCIndex using SparseCSXIndex::SparseCSXIndex; }; +// ---------------------------------------------------------------------- +// SparseCSFIndex class + +/// \brief EXPERIMENTAL: The index data for a CSF sparse tensor +/// +/// A CSF sparse index manages the location of its non-zero values by set of +/// prefix trees. Each path from a root to leaf forms one tensor non-zero index. +/// CSF is implemented with three vectors. +/// +/// Vectors inptr and indices contain N-1 and N buffers respectively, where N is the +/// number of dimensions. Axis_order is a vector of integers of legth N. Indptr and +/// indices describe the set of prefix trees. Trees traverse dimensions in order given by +/// axis_order. 
+class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase<SparseCSFIndex> {
+ public:
+  static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSF;
+  static constexpr char const* kTypeName = "SparseCSFIndex";
+
+  /// \brief Make SparseCSFIndex from raw properties
+  static Result<std::shared_ptr<SparseCSFIndex>> Make(
+      const std::shared_ptr<DataType>& indptr_type,
+      const std::shared_ptr<DataType>& indices_type,
+      const std::vector<int64_t>& indices_shapes, const std::vector<int64_t>& axis_order,
+      const std::vector<std::shared_ptr<Buffer>>& indptr_data,
+      const std::vector<std::shared_ptr<Buffer>>& indices_data);
+
+  /// \brief Make SparseCSFIndex from raw properties
+  static Result<std::shared_ptr<SparseCSFIndex>> Make(
+      const std::shared_ptr<DataType>& indices_type,
+      const std::vector<int64_t>& indices_shapes, const std::vector<int64_t>& axis_order,
+      const std::vector<std::shared_ptr<Buffer>>& indptr_data,
+      const std::vector<std::shared_ptr<Buffer>>& indices_data) {
+    return Make(indices_type, indices_type, indices_shapes, axis_order, indptr_data,
+                indices_data);
+  }
+
+  /// \brief Construct SparseCSFIndex from two index vectors
+  explicit SparseCSFIndex(const std::vector<std::shared_ptr<Tensor>>& indptr,
+                          const std::vector<std::shared_ptr<Tensor>>& indices,
+                          const std::vector<int64_t>& axis_order);
+
+  /// \brief Return a 1D vector of indptr tensors
+  const std::vector<std::shared_ptr<Tensor>>& indptr() const { return indptr_; }
+
+  /// \brief Return a 1D vector of indices tensors
+  const std::vector<std::shared_ptr<Tensor>>& indices() const { return indices_; }
+
+  /// \brief Return a 1D vector specifying the order of axes
+  const std::vector<int64_t>& axis_order() const { return axis_order_; }
+
+  /// \brief Return a string representation of the sparse index
+  std::string ToString() const override;
+
+  /// \brief Return whether the CSF indices are equal
+  bool Equals(const SparseCSFIndex& other) const;
+
+ protected:
+  std::vector<std::shared_ptr<Tensor>> indptr_;
+  std::vector<std::shared_ptr<Tensor>> indices_;
+  std::vector<int64_t> axis_order_;
+};
+
 // ----------------------------------------------------------------------
 // SparseTensor class
@@ -527,6 +591,9 @@ using SparseCSRMatrix = SparseTensorImpl<SparseCSRIndex>;
 /// \brief EXPERIMENTAL: Type alias for CSC sparse matrix
 using SparseCSCMatrix = SparseTensorImpl<SparseCSCIndex>;
 
+/// \brief EXPERIMENTAL: Type alias for CSF sparse tensor
+using SparseCSFTensor = SparseTensorImpl<SparseCSFIndex>;
+
 }  // namespace arrow
 
 #endif  // ARROW_SPARSE_TENSOR_H
diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc
index 198aa8f5f8d..45cb8dcc8f3 100644
--- a/cpp/src/arrow/sparse_tensor_test.cc
+++ b/cpp/src/arrow/sparse_tensor_test.cc
@@ -30,6 +30,7 @@
 #include "arrow/testing/gtest_util.h"
 #include "arrow/testing/util.h"
 #include "arrow/type.h"
+#include "arrow/util/sort.h"
 
 namespace arrow {
 
@@ -910,4 +911,225 @@ TEST_F(TestSparseCSCMatrix, TestToTensor) {
   ASSERT_TRUE(tensor.Equals(*dense_tensor));
 }
 
+template <typename IndexValueType>
+class TestSparseCSFTensorBase : public ::testing::Test {
+ public:
+  void SetUp() {
+    dim_names_ = {"a", "b", "c", "d"};
+    shape_ = {2, 3, 4, 5};
+    int16_t dense_values[2][3][4][5] = {};  // zero-initialized
+
+    dense_values[0][0][0][1] = 1;
+    dense_values[0][0][0][2] = 2;
+    dense_values[0][1][0][0] = 3;
+    dense_values[0][1][0][2] = 4;
+    dense_values[0][1][1][0] = 5;
+    dense_values[1][1][1][0] = 6;
+    dense_values[1][1][1][1] = 7;
+    dense_values[1][1][1][2] = 8;
+
+    auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values));
+    Tensor dense_tensor_(int16(), dense_buffer, shape_, {}, dim_names_);
+    ASSERT_OK_AND_ASSIGN(
+        sparse_tensor_from_dense_,
+        SparseCSFTensor::Make(dense_tensor_,
+                              TypeTraits<IndexValueType>::type_singleton()));
+  }
+
+ protected:
+  std::vector<int64_t> shape_;
+  std::vector<std::string> dim_names_;
+  std::shared_ptr<SparseCSFTensor> sparse_tensor_from_dense_;
+};
+
+class TestSparseCSFTensor : public TestSparseCSFTensorBase<Int64Type> {};
+
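+// Typed test fixture with helpers that assemble a SparseCSFIndex and a
+// SparseCSFTensor directly from raw indptr/indices value vectors.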
+template <typename IndexValueType>
+class TestSparseCSFTensorForIndexValueType
+    : public TestSparseCSFTensorBase<IndexValueType> {
+ protected:
+  std::shared_ptr<SparseCSFIndex> MakeSparseCSFIndex(
+      const std::vector<int64_t>& axis_order,
+      const std::vector<std::vector<typename IndexValueType::c_type>>& indptr_values,
+      const std::vector<std::vector<typename IndexValueType::c_type>>& indices_values)
+      const {
+    int64_t ndim = axis_order.size();
+    std::vector<std::shared_ptr<Tensor>> indptr(ndim - 1);
+    std::vector<std::shared_ptr<Tensor>> indices(ndim);
+
+    for (int64_t i = 0; i < ndim - 1; ++i) {
+      indptr[i] = std::make_shared<Tensor>(
+          TypeTraits<IndexValueType>::type_singleton(), Buffer::Wrap(indptr_values[i]),
+          std::vector<int64_t>({static_cast<int64_t>(indptr_values[i].size())}));
+    }
+    for (int64_t i = 0; i < ndim; ++i) {
+      indices[i] = std::make_shared<Tensor>(
+          TypeTraits<IndexValueType>::type_singleton(), Buffer::Wrap(indices_values[i]),
+          std::vector<int64_t>({static_cast<int64_t>(indices_values[i].size())}));
+    }
+    return std::make_shared<SparseCSFIndex>(indptr, indices, axis_order);
+  }
+
+  template <typename CValueType>
+  std::shared_ptr<SparseCSFTensor> MakeSparseTensor(
+      const std::shared_ptr<SparseCSFIndex>& si, std::vector<CValueType>& sparse_values,
+      const std::vector<int64_t>& shape,
+      const std::vector<std::string>& dim_names) const {
+    auto data_buffer = Buffer::Wrap(sparse_values);
+    return std::make_shared<SparseCSFTensor>(
+        si, CTypeTraits<CValueType>::type_singleton(), data_buffer, shape, dim_names);
+  }
+};
+
+TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType);
+
+TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestCreateSparseTensor) {
+  using IndexValueType = TypeParam;
+  using c_index_value_type = typename IndexValueType::c_type;
+
+  std::vector<int64_t> shape = {2, 3, 4, 5};
+  std::vector<std::string> dim_names = {"a", "b", "c", "d"};
+  std::vector<int64_t> axis_order = {0, 1, 2, 3};
+  std::vector<int16_t> sparse_values = {1, 2, 3, 4, 5, 6, 7, 8};
+  std::vector<std::vector<c_index_value_type>> indptr_values = {
+      {0, 2, 3}, {0, 1, 3, 4}, {0, 2, 4, 5, 8}};
+  std::vector<std::vector<c_index_value_type>> indices_values = {
+      {0, 1}, {0, 1, 1}, {0, 0, 1, 1}, {1, 2, 0, 2, 0, 0, 1, 2}};
+
+  auto si = this->MakeSparseCSFIndex(axis_order, indptr_values, indices_values);
+  auto st = this->MakeSparseTensor(si, sparse_values, shape, dim_names);
+
+  ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_));
+}
+
+TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestTensorToSparseTensor) {
+  std::vector<std::string> dim_names = {"a", "b", "c", "d"};
+  ASSERT_EQ(8, this->sparse_tensor_from_dense_->non_zero_length());
+  ASSERT_TRUE(this->sparse_tensor_from_dense_->is_mutable());
+  ASSERT_EQ(dim_names, this->sparse_tensor_from_dense_->dim_names());
+}
+
+TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestSparseTensorToTensor) {
+  std::vector<int64_t> shape = {2, 3, 4, 5};
+  int16_t dense_values[2][3][4][5] = {};  // zero-initialized
+  dense_values[0][0][0][1] = 1;
+  dense_values[0][0][0][2] = 2;
+  dense_values[0][1][0][0] = 3;
+  dense_values[0][1][0][2] = 4;
+  dense_values[0][1][1][0] = 5;
+  dense_values[1][1][1][0] = 6;
+  dense_values[1][1][1][1] = 7;
+  dense_values[1][1][1][2] = 8;
+  auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values));
+  Tensor dense_tensor(int16(), dense_buffer, shape, {}, this->dim_names_);
+
+  std::shared_ptr<Tensor> dt;
+  ASSERT_OK(this->sparse_tensor_from_dense_->ToTensor(&dt));
+  ASSERT_TRUE(dense_tensor.Equals(*dt));
+  ASSERT_EQ(dense_tensor.dim_names(), dt->dim_names());
+}
+
+TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestRoundTrip) {
+  using IndexValueType = TypeParam;
+
+  std::shared_ptr<Tensor> dt;
+  ASSERT_OK(this->sparse_tensor_from_dense_->ToTensor(&dt));
+  std::shared_ptr<SparseCSFTensor> st;
+  ASSERT_OK_AND_ASSIGN(
+      st, SparseCSFTensor::Make(*dt, TypeTraits<IndexValueType>::type_singleton()));
+
+  ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_));
+}
+
+TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestAlternativeAxisOrder) {
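+  // Encode the same 4x6 matrix once per axis order. The two CSF encodings
+  // compare unequal, but both must expand back to the same dense tensor.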
+  using IndexValueType = TypeParam;
+  using c_index_value_type = typename IndexValueType::c_type;
+
+  std::vector<int16_t> dense_values = {1, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 0,
+                                       0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 5};
+  std::vector<int64_t> shape = {4, 6};
+  std::vector<std::string> dim_names = {"a", "b"};
+  std::shared_ptr<Buffer> dense_buffer = Buffer::Wrap(dense_values);
+  Tensor tensor(int16(), dense_buffer, shape, {}, dim_names);
+
+  // Axis order 1
+  std::vector<int64_t> axis_order_1 = {0, 1};
+  std::vector<int16_t> sparse_values_1 = {1, 3, 2, 4, 5};
+  std::vector<std::vector<c_index_value_type>> indptr_values_1 = {{0, 2, 3, 5}};
+  std::vector<std::vector<c_index_value_type>> indices_values_1 = {{0, 1, 3},
+                                                                   {0, 3, 1, 3, 5}};
+  auto si_1 = this->MakeSparseCSFIndex(axis_order_1, indptr_values_1, indices_values_1);
+  auto st_1 = this->MakeSparseTensor(si_1, sparse_values_1, shape, dim_names);
+
+  // Axis order 2
+  std::vector<int64_t> axis_order_2 = {1, 0};
+  std::vector<int16_t> sparse_values_2 = {1, 2, 3, 4, 5};
+  std::vector<std::vector<c_index_value_type>> indptr_values_2 = {{0, 1, 2, 4, 5}};
+  std::vector<std::vector<c_index_value_type>> indices_values_2 = {{0, 1, 3, 5},
+                                                                   {0, 1, 0, 3, 3}};
+  auto si_2 = this->MakeSparseCSFIndex(axis_order_2, indptr_values_2, indices_values_2);
+  auto st_2 = this->MakeSparseTensor(si_2, sparse_values_2, shape, dim_names);
+
+  std::shared_ptr<Tensor> dt_1, dt_2;
+  ASSERT_OK(st_1->ToTensor(&dt_1));
+  ASSERT_OK(st_2->ToTensor(&dt_2));
+
+  ASSERT_FALSE(st_1->Equals(*st_2));
+  ASSERT_TRUE(dt_1->Equals(*dt_2));
+  ASSERT_TRUE(dt_1->Equals(tensor));
+}
+
+TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestNonAscendingShape) {
+  using IndexValueType = TypeParam;
+  using c_index_value_type = typename IndexValueType::c_type;
+
+  std::vector<int64_t> shape = {5, 2, 3, 4};
+  int16_t dense_values[5][2][3][4] = {};  // zero-initialized
+  dense_values[0][0][0][1] = 1;
+  dense_values[0][0][0][2] = 2;
+  dense_values[0][1][0][0] = 3;
+  dense_values[0][1][0][2] = 4;
+  dense_values[0][1][1][0] = 5;
+  dense_values[1][1][1][0] = 6;
+  dense_values[1][1][1][1] = 7;
+  dense_values[1][1][1][2] = 8;
+  auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values));
+  Tensor dense_tensor(int16(), dense_buffer, shape, {}, this->dim_names_);
+
+  std::shared_ptr<SparseCSFTensor> sparse_tensor;
+  ASSERT_OK_AND_ASSIGN(
+      sparse_tensor,
+      SparseCSFTensor::Make(dense_tensor, TypeTraits<IndexValueType>::type_singleton()));
+
+  std::vector<std::vector<c_index_value_type>> indptr_values = {
+      {0, 1, 3}, {0, 2, 4, 7}, {0, 1, 2, 3, 4, 6, 7, 8}};
+  std::vector<std::vector<c_index_value_type>> indices_values = {
+      {0, 1}, {0, 0, 1}, {1, 2, 0, 2, 0, 1, 2}, {0, 0, 0, 0, 0, 1, 1, 1}};
+  std::vector<int64_t> axis_order = {1, 2, 3, 0};
+  std::vector<int16_t> sparse_values = {1, 2, 3, 4, 5, 6, 7, 8};
+  auto si = this->MakeSparseCSFIndex(axis_order, indptr_values, indices_values);
+  auto st = this->MakeSparseTensor(si, sparse_values, shape, this->dim_names_);
+
+  std::shared_ptr<Tensor> dt;
+  ASSERT_OK(st->ToTensor(&dt));
+  ASSERT_TRUE(dt->Equals(dense_tensor));
+  ASSERT_TRUE(st->Equals(*sparse_tensor));
+}
+
+REGISTER_TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType, TestCreateSparseTensor,
+                           TestTensorToSparseTensor, TestSparseTensorToTensor,
+                           TestAlternativeAxisOrder, TestNonAscendingShape,
+                           TestRoundTrip);
+
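+// Instantiate the typed test suite once for every supported integer index
+// value type.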
+INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestSparseCSFTensorForIndexValueType, Int8Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestSparseCSFTensorForIndexValueType, UInt8Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestInt16, TestSparseCSFTensorForIndexValueType, Int16Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestUInt16, TestSparseCSFTensorForIndexValueType,
+                              UInt16Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestInt32, TestSparseCSFTensorForIndexValueType, Int32Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestUInt32, TestSparseCSFTensorForIndexValueType,
+                              UInt32Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestInt64, TestSparseCSFTensorForIndexValueType, Int64Type);
+INSTANTIATE_TYPED_TEST_CASE_P(TestUInt64, TestSparseCSFTensorForIndexValueType,
+                              UInt64Type);
 }  // namespace arrow
diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs
index 1de67eed19a..9c8ddae0b7c 100644
--- a/format/SparseTensor.fbs
+++ b/format/SparseTensor.fbs
@@ -116,7 +116,7 @@ table SparseMatrixIndexCSX {
 
 union SparseTensorIndex {
   SparseTensorIndexCOO,
-  SparseMatrixIndexCSX,
+  SparseMatrixIndexCSX
 }
 
 table SparseTensor {
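Reviewer note (not part of the patch): the snippet below is a minimal round-trip sketch distilled from TestRoundTrip above. It uses only APIs touched or exercised by this change (SparseCSFTensor::Make and ToTensor); the shape, the values, and the function name RoundTripCSF are illustrative.

#include <cstdint>
#include <memory>
#include <vector>

#include "arrow/buffer.h"
#include "arrow/result.h"
#include "arrow/sparse_tensor.h"
#include "arrow/status.h"
#include "arrow/type.h"

arrow::Status RoundTripCSF() {
  // A 2x3 dense matrix with three non-zero values.
  std::vector<int16_t> values = {1, 0, 2, 0, 0, 3};
  std::vector<int64_t> shape = {2, 3};
  arrow::Tensor dense(arrow::int16(), arrow::Buffer::Wrap(values), shape);

  // Compress the dense tensor into CSF form with int64 index values.
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::SparseCSFTensor> csf,
                        arrow::SparseCSFTensor::Make(dense, arrow::int64()));

  // Expand back to a dense tensor and verify the round trip.
  std::shared_ptr<arrow::Tensor> expanded;
  ARROW_RETURN_NOT_OK(csf->ToTensor(&expanded));
  if (!expanded->Equals(dense)) {
    return arrow::Status::Invalid("CSF round trip did not preserve the tensor");
  }
  return arrow::Status::OK();
}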