From 2d10104050604826f1b9d7077fd39da9129ae8be Mon Sep 17 00:00:00 2001 From: Rok Date: Fri, 8 Nov 2019 01:03:44 +0100 Subject: [PATCH 01/18] WIP --- cpp/src/arrow/sparse_tensor.cc | 83 ++++++++++++++++++++++++++--- cpp/src/arrow/sparse_tensor.h | 65 ++++++++++++++++++++++ cpp/src/arrow/sparse_tensor_test.cc | 72 +++++++++++++++++++++++++ format/SparseTensor.fbs | 13 +++++ 4 files changed, 225 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index d42bdf4ca61..ad528f0cf4d 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -507,6 +507,33 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor, } } +template +void assign_values(int64_t dimension_index, int64_t offset, int64_t first_ptr, + int64_t last_ptr, const SparseCSFIndex* sparse_index, + const int64_t* raw_data, const std::vector strides, + TYPE* out) { + auto indices_offset = sparse_index->indices_offsets()[dimension_index]; + auto indptr_offset = sparse_index->indptr_offsets()[dimension_index]; + int64_t ndim = sparse_index->indices_offsets().size(); + + if (dimension_index == 0 && ndim > 1) + last_ptr = sparse_index->indptr_offsets()[dimension_index + 1] - 1; + + for (int64_t i = first_ptr; i < last_ptr; ++i) { + int64_t tmp_offset = + offset + sparse_index->indices()->Value({indices_offset + i}) * + strides[dimension_index]; + if (dimension_index < ndim - 1) + assign_values( + dimension_index + 1, tmp_offset, + sparse_index->indptr()->Value({indptr_offset + i}), + sparse_index->indptr()->Value({indptr_offset + i + 1}), + sparse_index, raw_data, strides, out); + else + out[tmp_offset] = raw_data[i]; + } +} + template Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor, std::shared_ptr* out) { @@ -521,18 +548,18 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t std::fill_n(values, sparse_tensor->size(), static_cast(0)); + std::vector 
strides(sparse_tensor->ndim(), 1); + for (int i = sparse_tensor->ndim() - 1; i > 0; --i) + strides[i - 1] *= strides[i] * sparse_tensor->shape()[i]; + + const auto raw_data = reinterpret_cast(sparse_tensor->raw_data()); + switch (sparse_tensor->format_id()) { case SparseTensorFormat::COO: { const auto& sparse_index = internal::checked_cast(*sparse_tensor->sparse_index()); const std::shared_ptr coords = sparse_index.indices(); - const auto raw_data = - reinterpret_cast(sparse_tensor->raw_data()); - std::vector strides(sparse_tensor->ndim(), 1); - for (int i = sparse_tensor->ndim() - 1; i > 0; --i) { - strides[i - 1] *= strides[i] * sparse_tensor->shape()[i]; - } for (int64_t i = 0; i < sparse_tensor->non_zero_length(); ++i) { std::vector coord(sparse_tensor->ndim()); int64_t offset = 0; @@ -552,8 +579,6 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t internal::checked_cast(*sparse_tensor->sparse_index()); const std::shared_ptr indptr = sparse_index.indptr(); const std::shared_ptr indices = sparse_index.indices(); - const auto raw_data = - reinterpret_cast(sparse_tensor->raw_data()); int64_t offset; for (int64_t i = 0; i < indptr->size() - 1; ++i) { @@ -590,6 +615,17 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t sparse_tensor->shape()); return Status::OK(); } + + case SparseTensorFormat::CSF: { + const auto& sparse_index = + internal::checked_cast(*sparse_tensor->sparse_index()); + assign_values( + 0, 0, 0, 0, &sparse_index, + reinterpret_cast(sparse_tensor->raw_data()), strides, values); + *out = std::make_shared(sparse_tensor->type(), values_buffer, + sparse_tensor->shape()); + return Status::OK(); + } } return Status::NotImplemented("Unsupported SparseIndex format type"); } @@ -625,6 +661,13 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t const std::shared_ptr indices = sparse_index.indices(); type = indices->type(); break; + } + case 
SparseTensorFormat::CSF: { + const auto& sparse_index = + internal::checked_cast(*sparse_tensor->sparse_index()); + const std::shared_ptr indices = sparse_index.indices(); + type = indices->type(); + break; } // LCOV_EXCL_START: ignore program failure default: @@ -754,6 +797,30 @@ void CheckSparseCSXIndexValidity(const std::shared_ptr& indptr_type, } // namespace internal +// ---------------------------------------------------------------------- +// SparseCSFIndex + +// Constructor with two index vectors +SparseCSFIndex::SparseCSFIndex(const std::shared_ptr& indptr, + const std::shared_ptr& indices, + const std::vector& indptr_offsets, + const std::vector& indices_offsets, + const std::vector& axis_order) + : SparseIndexBase(indices->shape()[0] - indices_offsets.back()), + indptr_(indptr), + indices_(indices), + indptr_offsets_(indptr_offsets), + indices_offsets_(indices_offsets), + axis_order_(axis_order) { + ARROW_CHECK(is_integer(indptr_->type_id())); + ARROW_CHECK_EQ(1, indptr_->ndim()); + ARROW_CHECK(is_integer(indices_->type_id())); + ARROW_CHECK_EQ(1, indices_->ndim()); + ARROW_CHECK_EQ(indptr_offsets_.size() + 1, indices_offsets_.size()); +} + +std::string SparseCSFIndex::ToString() const { return std::string("SparseCSFIndex"); } + // ---------------------------------------------------------------------- // SparseTensor diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index f736f7b7576..cdcbbe1ddc4 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -40,6 +40,8 @@ struct SparseTensorFormat { CSR, /// Compressed sparse column (CSC) format. CSC, + /// Compressed sparse fiber (CSF) format. 
+ CSF }; }; @@ -329,6 +331,66 @@ class ARROW_EXPORT SparseCSCIndex using SparseCSXIndex::SparseCSXIndex; }; +// ---------------------------------------------------------------------- +// SparseCSFIndex class + +/// \brief EXPERIMENTAL: The index data for a CSF sparse tensor +/// +/// A CSF sparse index manages the location of its non-zero values by two +/// vectors. +/// TODO:rok, documentation +/// The first vector, called indptr, represents the range of the rows; the i-th +/// row spans from indptr[i] to indptr[i+1] in the corresponding value vector. +/// So the length of an indptr vector is the number of rows + 1. +/// +/// The other vector, called indices, represents the column indices of the +/// corresponding non-zero values. So the length of an indices vector is same +/// as the number of non-zero-values. +class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase { + public: + static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSF; + + /// \brief Construct SparseCSFIndex from two index vectors + explicit SparseCSFIndex(const std::shared_ptr& indptr, + const std::shared_ptr& indices, + const std::vector& indptr_offsets, + const std::vector& indices_offsets, + const std::vector& axis_order); + + /// \brief Return a 1D tensor of indptr vector + const std::shared_ptr& indptr() const { return indptr_; } + + /// \brief Return a 1D tensor of indices vector + const std::shared_ptr& indices() const { return indices_; } + + /// \brief Return a 1D vector of indptr offsets + const std::vector& indptr_offsets() const { return indptr_offsets_; } + + /// \brief Return a vector of indices offsets + const std::vector& indices_offsets() const { return indices_offsets_; } + + /// \brief Return a 1D vector specifying the order of axes + const std::vector& axis_order() const { return axis_order_; } + + /// \brief Return a string representation of the sparse index + std::string ToString() const override; + + /// \brief Return whether the CSF 
indices are equal + bool Equals(const SparseCSFIndex& other) const { + return indptr()->Equals(*other.indptr()) && indices()->Equals(*other.indices()) && + indptr_offsets() == other.indptr_offsets() && + indices_offsets() == other.indices_offsets() && + axis_order() == other.axis_order(); + } + + protected: + std::shared_ptr indptr_; + std::shared_ptr indices_; + std::vector indptr_offsets_; + std::vector indices_offsets_; + std::vector axis_order_; +}; + // ---------------------------------------------------------------------- // SparseTensor class @@ -527,6 +589,9 @@ using SparseCSRMatrix = SparseTensorImpl; /// \brief EXPERIMENTAL: Type alias for CSC sparse matrix using SparseCSCMatrix = SparseTensorImpl; +/// \brief EXPERIMENTAL: Type alias for CSF sparse matrix +using SparseCSFTensor = SparseTensorImpl; + } // namespace arrow #endif // ARROW_SPARSE_TENSOR_H diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 198aa8f5f8d..91588ac27a7 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -910,4 +910,76 @@ TEST_F(TestSparseCSCMatrix, TestToTensor) { ASSERT_TRUE(tensor.Equals(*dense_tensor)); } +template +class TestSparseCSFTensorBase : public ::testing::Test { +public: + void SetUp() { + shape_ = {6, 4}; + dim_names_ = {"foo", "bar"}; + + // Dense representation: + // [ + // 1 0 2 0 + // 0 3 0 4 + // 5 0 6 0 + // 0 11 0 12 + // 13 0 14 0 + // 0 15 0 16 + // ] + std::vector dense_values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + auto dense_data = Buffer::Wrap(dense_values); + NumericTensor dense_tensor(dense_data, shape_, {}, dim_names_); + } + +protected: + std::vector shape_; + std::vector dim_names_; + std::shared_ptr sparse_tensor_from_dense_; +}; + +class TestSparseCSFTensor : public TestSparseCSFTensorBase {}; + +TEST_F(TestSparseCSFTensor, TestToTensor) { + std::vector data_values = {1, 2, 3, 4, 5, 6, 7, 8}; + std::vector indptr_values 
= {0, 2, 3, 0, 1, 3, 4, 0, 2, 4, 5, 8}; + std::vector indices_values = {1, 2, 1, 2, 2, 1, 1, 2, 2, + 2, 3, 1, 3, 1, 1, 2, 3}; + std::vector indices_offsets = {0, 2, 5, 9}; + std::vector indptr_offsets = {0, 3, 7}; + std::vector axis_order = {0, 1, 2, 3}; + std::vector sparse_tensor_shape({3, 3, 3, 4}); + std::vector indptr_shape({12}); + std::vector indices_shape({17}); + std::vector dim_names({"a", "b", "c", "d"}); + + std::shared_ptr data_buffer = Buffer::Wrap(data_values); + std::shared_ptr indptr_buffer = Buffer::Wrap(indptr_values); + std::shared_ptr indices_buffer = Buffer::Wrap(indices_values); + + std::shared_ptr indptr = + std::make_shared(int64(), indptr_buffer, indptr_shape); + std::shared_ptr indices = + std::make_shared(int64(), indices_buffer, indices_shape); + + std::shared_ptr sparse_index = std::make_shared( + indptr, indices, indptr_offsets, indices_offsets, axis_order); + std::shared_ptr sparse_tensor = std::make_shared( + sparse_index, int64(), data_buffer, sparse_tensor_shape, dim_names); + + ASSERT_EQ(8, sparse_tensor->non_zero_length()); + + std::shared_ptr dense_tensor; + ASSERT_OK(sparse_tensor->ToTensor(&dense_tensor)); + + std::vector dense_values = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; + auto dense_data = Buffer::Wrap(dense_values); + Tensor tensor(int64(), dense_data, sparse_tensor_shape, {}); + + ASSERT_TRUE(tensor.Equals(*dense_tensor)); +} } // namespace arrow diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index 1de67eed19a..b22c8c718a2 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -114,9 +114,22 @@ table SparseMatrixIndexCSX { indicesBuffer: Buffer (required); } +/// Compressed Sparse Fiber (CSF) 
sparse tensor format +/// +/// CSF is a generalization of compressed sparse row (CSR) index. +/// CSF compresses a tensor into one three one-dimensional tensors. +table SparseTensorIndexCSF { + indptrType: Int; + indptrBuffer: Buffer; + indicesType: Int; + indicesBuffer: Buffer; + axisOrder: [long]; +} + union SparseTensorIndex { SparseTensorIndexCOO, SparseMatrixIndexCSX, + SparseTensorIndexCSF } table SparseTensor { From 6b938f7daf0255d52cb99d3775239568cfc027c1 Mon Sep 17 00:00:00 2001 From: Rok Date: Sat, 16 Nov 2019 22:46:59 +0100 Subject: [PATCH 02/18] Documentation. --- cpp/src/arrow/python/serialize.cc | 4 ++ cpp/src/arrow/sparse_tensor.cc | 7 +++- cpp/src/arrow/sparse_tensor.h | 17 ++++---- format/SparseTensor.fbs | 69 +++++++++++++++++++++++++++++-- 4 files changed, 83 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index 09a322b1060..88d763b7877 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -654,6 +654,7 @@ Status CountSparseTensors( OwnedRef num_sparse_tensors(PyDict_New()); size_t num_coo = 0; size_t num_csr = 0; + size_t num_csf = 0; for (const auto& sparse_tensor : sparse_tensors) { switch (sparse_tensor->format_id()) { @@ -665,12 +666,15 @@ Status CountSparseTensors( break; case SparseTensorFormat::CSC: // TODO(mrkn): support csc + case SparseTensorFormat::CSF: + ++num_csf; break; } } PyDict_SetItemString(num_sparse_tensors.obj(), "coo", PyLong_FromSize_t(num_coo)); PyDict_SetItemString(num_sparse_tensors.obj(), "csr", PyLong_FromSize_t(num_csr)); + PyDict_SetItemString(num_sparse_tensors.obj(), "csf", PyLong_FromSize_t(num_csf)); RETURN_IF_PYERROR(); *out = num_sparse_tensors.detach(); diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index ad528f0cf4d..19f0a9edb62 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -438,6 +438,7 @@ class SparseTensorConverter 
INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCOOIndex); INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSRIndex); INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSCIndex); +INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSFIndex); } // namespace @@ -500,6 +501,9 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor, case SparseTensorFormat::CSC: return MakeSparseTensorFromTensor(tensor, index_value_type, pool, out_sparse_index, out_data); + case SparseTensorFormat::CSF: + return Status::Invalid("Unsupported Tensor value type"); + // LCOV_EXCL_START: ignore program failure default: return Status::Invalid("Invalid sparse tensor format"); @@ -530,7 +534,7 @@ void assign_values(int64_t dimension_index, int64_t offset, int64_t first_ptr, sparse_index->indptr()->Value({indptr_offset + i + 1}), sparse_index, raw_data, strides, out); else - out[tmp_offset] = raw_data[i]; + out[tmp_offset] = static_cast(raw_data[i]); } } @@ -817,6 +821,7 @@ SparseCSFIndex::SparseCSFIndex(const std::shared_ptr& indptr, ARROW_CHECK(is_integer(indices_->type_id())); ARROW_CHECK_EQ(1, indices_->ndim()); ARROW_CHECK_EQ(indptr_offsets_.size() + 1, indices_offsets_.size()); + ARROW_CHECK_EQ(axis_order_.size(), indices_offsets_.size()); } std::string SparseCSFIndex::ToString() const { return std::string("SparseCSFIndex"); } diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index cdcbbe1ddc4..a8e38b89ad2 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -336,16 +336,15 @@ class ARROW_EXPORT SparseCSCIndex /// \brief EXPERIMENTAL: The index data for a CSF sparse tensor /// -/// A CSF sparse index manages the location of its non-zero values by two -/// vectors. -/// TODO:rok, documentation -/// The first vector, called indptr, represents the range of the rows; the i-th -/// row spans from indptr[i] to indptr[i+1] in the corresponding value vector. -/// So the length of an indptr vector is the number of rows + 1. 
+/// A CSF sparse index manages the location of its non-zero values by a set of +/// prefix trees. Each path from a root to leaf forms one tensor non-zero index. +/// CSF is implemented with five vectors. /// -/// The other vector, called indices, represents the column indices of the -/// corresponding non-zero values. So the length of an indices vector is same -/// as the number of non-zero-values. +/// Vectors indptr and indices are split into N-1 segments (by indptr_offsets) and +/// N segments (by indices_offsets), where N is the number of dimensions. +/// Indptr and indices segments describe the set of prefix trees. +/// +/// Trees traverse dimensions in order given by axis_order. class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase { public: static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSF; diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index b22c8c718a2..12e9f870376 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -114,15 +114,76 @@ table SparseMatrixIndexCSX { indicesBuffer: Buffer (required); } -/// Compressed Sparse Fiber (CSF) sparse tensor format -/// -/// CSF is a generalization of compressed sparse row (CSR) index. -/// CSF compresses a tensor into one three one-dimensional tensors. +/// Compressed Sparse Fiber (CSF) sparse tensor index. table SparseTensorIndexCSF { + /// CSF is a generalization of compressed sparse row (CSR) index. + /// See [smith2017knl]: http://shaden.io/pub-files/smith2017knl.pdf + /// + /// CSF recursively compresses each mode of the tensor into a set + /// of prefix trees. Each path from a root to leaf forms one tensor + /// non-zero index. CSF is implemented with two buffers and three arrays. 
+ /// + /// For example, let X be a 3x3x3x4 tensor, and it has the following + /// 8 non-zero values: + /// + /// X[1, 1, 1, 2] := 1 + /// X[1, 1, 1, 3] := 2 + /// X[1, 2, 1, 1] := 3 + /// X[1, 2, 1, 3] := 4 + /// X[1, 2, 2, 1] := 5 + /// X[2, 2, 2, 1] := 6 + /// X[2, 2, 2, 2] := 7 + /// X[2, 2, 2, 3] := 8 + /// + /// As a prefix tree this would be represented as: + /// + /// 1 2 + /// / \ | + /// 1 2 2 + /// / / \ | + /// 1 1 2 2 + /// / \ / \ \ /|\ + /// 2 3 1 3 1 1 2 3 + + /// The type of values in indptrBuffer indptrType: Int; + + /// indptrBuffer stores the sparsity structure. + /// For example, the indptrBuffer for the above X is: + /// + /// indptrBuffer(X) = [0, 2, 3, 0, 1, 3, 4, 0, 2, 4, 5, 8]. + /// indptrBuffer: Buffer; + + /// indptrOffsets stores per dimension offset in indptrBuffer. + /// For example, the indptrOffsets for the above X is: + /// + /// indptrOffsets(X) = [0, 3, 7]. + /// + indptrOffsets: [int]; + + /// The type of values in indicesBuffer indicesType: Int; + + /// indicesBuffer stores the label of each node. + /// For example, the indicesBuffer for the above X is: + /// + /// indicesBuffer(X) = [1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 3, 1, 3, 1, 1, 2, 3]. + /// indicesBuffer: Buffer; + + /// indicesOffsets stores per dimension offset in indicesBuffer. + /// For example, the indicesOffsets for the above X is: + /// + /// indicesOffsets(X) = [0, 2, 5, 9]. + /// + indicesOffsets: [int]; + + /// axisOrder stores the sequence in which dimensions were traversed. + /// For example, the axisOrder for the above X is: + /// + /// axisOrder(X) = [0, 1, 2, 3]. + /// axisOrder: [long]; } From 05a47a546f9e91a086af81521155f98cc9367ffd Mon Sep 17 00:00:00 2001 From: Rok Date: Mon, 18 Nov 2019 17:46:34 +0100 Subject: [PATCH 03/18] Using axis_order in CSF. 
--- cpp/src/arrow/sparse_tensor.cc | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 19f0a9edb62..f3cfd42b1be 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -515,24 +515,25 @@ template void assign_values(int64_t dimension_index, int64_t offset, int64_t first_ptr, int64_t last_ptr, const SparseCSFIndex* sparse_index, const int64_t* raw_data, const std::vector strides, - TYPE* out) { - auto indices_offset = sparse_index->indices_offsets()[dimension_index]; - auto indptr_offset = sparse_index->indptr_offsets()[dimension_index]; + const std::vector axis_order, TYPE* out) { + auto dimension = axis_order[dimension_index]; + auto indices_offset = sparse_index->indices_offsets()[dimension]; + auto indptr_offset = sparse_index->indptr_offsets()[dimension]; int64_t ndim = sparse_index->indices_offsets().size(); - if (dimension_index == 0 && ndim > 1) - last_ptr = sparse_index->indptr_offsets()[dimension_index + 1] - 1; + if (dimension == 0 && ndim > 1) + last_ptr = sparse_index->indptr_offsets()[dimension + 1] - 1; for (int64_t i = first_ptr; i < last_ptr; ++i) { int64_t tmp_offset = offset + sparse_index->indices()->Value({indices_offset + i}) * - strides[dimension_index]; + strides[dimension]; if (dimension_index < ndim - 1) assign_values( - dimension_index + 1, tmp_offset, + dimension + 1, tmp_offset, sparse_index->indptr()->Value({indptr_offset + i}), sparse_index->indptr()->Value({indptr_offset + i + 1}), - sparse_index, raw_data, strides, out); + sparse_index, raw_data, strides, axis_order, out); else out[tmp_offset] = static_cast(raw_data[i]); } @@ -625,7 +626,8 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t internal::checked_cast(*sparse_tensor->sparse_index()); assign_values( 0, 0, 0, 0, &sparse_index, - reinterpret_cast(sparse_tensor->raw_data()), strides, values); + 
reinterpret_cast(sparse_tensor->raw_data()), strides, + sparse_index.axis_order(), values); *out = std::make_shared(sparse_tensor->type(), values_buffer, sparse_tensor->shape()); return Status::OK(); From 7d17995a47b0699326d176645fa0da522085309f Mon Sep 17 00:00:00 2001 From: Rok Date: Mon, 25 Nov 2019 00:20:39 +0100 Subject: [PATCH 04/18] Adding Tensor to SparseCSFTensor conversion. --- cpp/src/arrow/sparse_tensor.cc | 153 +++++++++++++++++++++++++++- cpp/src/arrow/sparse_tensor_test.cc | 28 +++++ 2 files changed, 179 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index f3cfd42b1be..f8f8bf425e3 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -419,6 +419,154 @@ class SparseTensorConverter inline Status CheckMaximumValue(const uint64_t) const { return Status::OK(); } }; +// ---------------------------------------------------------------------- +// SparseTensorConverter for SparseCSFIndex + +template +class SparseTensorConverter + : private SparseTensorConverterBase { + public: + using BaseClass = SparseTensorConverterBase; + using typename BaseClass::NumericTensorType; + using typename BaseClass::value_type; + + SparseTensorConverter(const NumericTensorType& tensor, + const std::shared_ptr& index_value_type, + MemoryPool* pool) + : BaseClass(tensor, index_value_type, pool) {} + + template + Status Convert() { + using c_index_value_type = typename IndexValueType::c_type; + const int64_t indices_elsize = sizeof(c_index_value_type); + + std::shared_ptr sparse_coo_tensor; + RETURN_NOT_OK(SparseCOOTensor::Make(tensor_, &sparse_coo_tensor)); + std::shared_ptr coords = + arrow::internal::checked_pointer_cast( + sparse_coo_tensor->sparse_index()) + ->indices(); + + // Convert SparseCOOTensor to long CSF buffers + const int64_t ndim = tensor_.ndim(); + const int64_t nonzero_count = sparse_coo_tensor->non_zero_length(); + + std::vector counts(ndim); + std::fill_n(counts.begin(), 
ndim, static_cast(0)); + + std::vector axis_order(ndim); + for (int64_t i = 0; i < ndim; ++i) axis_order[i] = i; + + std::shared_ptr indices_buffer; + std::shared_ptr indptr_buffer; + RETURN_NOT_OK( + AllocateBuffer(pool_, indices_elsize * ndim * nonzero_count, &indices_buffer)); + RETURN_NOT_OK(AllocateBuffer(pool_, indices_elsize * (ndim - 1) * (nonzero_count + 1), + &indptr_buffer)); + int64_t* indices = reinterpret_cast(indices_buffer->mutable_data()); + int64_t* indptr = reinterpret_cast(indptr_buffer->mutable_data()); + + for (int64_t row = 0; row < nonzero_count; ++row) { + bool tree_split = false; + for (int64_t column = 0; column < ndim; ++column) { + bool change = coords->Value({row, column}) != + coords->Value({row - 1, column}); + + if (tree_split || change || row == 0) { + if (row > 1) tree_split = true; + + indices[column * nonzero_count + counts[column]] = + coords->Value({row, column}); + indptr[column * (nonzero_count + 1) + counts[column]] = counts[column + 1]; + ++counts[column]; + } + } + } + + for (int64_t column = 0; column < ndim; ++column) { + indptr[column * (nonzero_count + 1) + counts[column]] = counts[column + 1]; + } + + int64_t total_size = counts[0]; + for (int64_t column = 1; column < ndim; ++column) { + for (int64_t i = 0; i < counts[column] + 1; ++i) { + if (column < ndim - 1) + indptr[total_size + column + i] = indptr[column * (nonzero_count + 1) + i]; + if (i < counts[column]) + indices[total_size + i] = indices[column * nonzero_count + i]; + } + total_size += counts[column]; + } + + // Copy CSF index data into smaller buffers + std::shared_ptr out_indices_buffer; + std::shared_ptr out_indptr_buffer; + RETURN_NOT_OK( + AllocateBuffer(pool_, indices_elsize * total_size, &out_indices_buffer)); + RETURN_NOT_OK(AllocateBuffer(pool_, + indices_elsize * total_size - nonzero_count + ndim - 1, + &out_indptr_buffer)); + int64_t* out_indices = reinterpret_cast(out_indices_buffer->mutable_data()); + int64_t* out_indptr = 
reinterpret_cast(out_indptr_buffer->mutable_data()); + + for (int64_t i = 0; i < total_size; ++i) out_indices[i] = indices[i]; + + for (int64_t i = 0; i < total_size - nonzero_count + ndim - 1; ++i) + out_indptr[i] = indptr[i]; + + // Construct SparseCSFTensor + std::vector out_indptr_shape({total_size - nonzero_count + ndim - 1}); + std::shared_ptr out_indptr_tensor = + std::make_shared(int64(), out_indptr_buffer, out_indptr_shape); + + std::vector out_indices_shape({total_size}); + std::shared_ptr out_indices_tensor = + std::make_shared(int64(), out_indices_buffer, out_indices_shape); + + std::vector indptr_offsets(ndim - 1); + std::vector indices_offsets(ndim); + std::fill_n(indptr_offsets.begin(), ndim - 1, static_cast(0)); + std::fill_n(indices_offsets.begin(), ndim, static_cast(0)); + + for (int64_t i = 0; i < ndim - 2; ++i) + indptr_offsets[i + 1] = indptr_offsets[i] + counts[i] + 1; + + for (int64_t i = 0; i < ndim; ++i) + indices_offsets[i + 1] = indices_offsets[i] + counts[i]; + + sparse_index = + std::make_shared(out_indptr_tensor, out_indices_tensor, + indptr_offsets, indices_offsets, axis_order); + data = sparse_coo_tensor->data(); + + return Status::OK(); + } + +#define CALL_TYPE_SPECIFIC_CONVERT(TYPE_CLASS) \ + case TYPE_CLASS##Type::type_id: \ + return Convert(); + + Status Convert() { + switch (index_value_type_->id()) { + ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_TYPE_SPECIFIC_CONVERT); + // LCOV_EXCL_START: The following invalid causes program failure. 
+ default: + return Status::TypeError("Unsupported SparseTensor index value type"); + // LCOV_EXCL_STOP + } + } + +#undef CALL_TYPE_SPECIFIC_CONVERT + + std::shared_ptr sparse_index; + std::shared_ptr data; + + private: + using BaseClass::index_value_type_; + using BaseClass::pool_; + using BaseClass::tensor_; +}; + // ---------------------------------------------------------------------- // Instantiate templates @@ -502,7 +650,8 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor, return MakeSparseTensorFromTensor(tensor, index_value_type, pool, out_sparse_index, out_data); case SparseTensorFormat::CSF: - return Status::Invalid("Unsupported Tensor value type"); + return MakeSparseTensorFromTensor(tensor, index_value_type, pool, + out_sparse_index, out_data); // LCOV_EXCL_START: ignore program failure default: @@ -812,7 +961,7 @@ SparseCSFIndex::SparseCSFIndex(const std::shared_ptr& indptr, const std::vector& indptr_offsets, const std::vector& indices_offsets, const std::vector& axis_order) - : SparseIndexBase(indices->shape()[0] - indices_offsets.back()), + : SparseIndexBase(indices->size() - indices_offsets.back()), indptr_(indptr), indices_(indices), indptr_offsets_(indptr_offsets), diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 91588ac27a7..5496df8b003 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -982,4 +982,32 @@ TEST_F(TestSparseCSFTensor, TestToTensor) { ASSERT_TRUE(tensor.Equals(*dense_tensor)); } + +TEST_F(TestSparseCSFTensor, CreationFromTensor) { + std::vector values = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; + std::vector shape({3, 3, 3, 4}); + std::vector 
dim_names({"a", "b", "c", "d"}); + std::shared_ptr buffer = Buffer::Wrap(values); + Tensor tensor(int64(), buffer, shape, {}, dim_names); + + std::shared_ptr st; + ASSERT_OK(SparseCSFTensor::Make(tensor, &st)); + + ASSERT_EQ(8, st->non_zero_length()); + ASSERT_TRUE(st->is_mutable()); + + ASSERT_EQ(dim_names, st->dim_names()); + ASSERT_EQ("a", st->dim_name(0)); + ASSERT_EQ("b", st->dim_name(1)); + ASSERT_EQ("c", st->dim_name(2)); + ASSERT_EQ("d", st->dim_name(3)); + + std::shared_ptr dt; + ASSERT_OK(st->ToTensor(&dt)); + ASSERT_TRUE(tensor.Equals(*dt)); +} } // namespace arrow From f44d92cfd5c4a36b173f9a84e1c42ab8986c233f Mon Sep 17 00:00:00 2001 From: Rok Date: Mon, 25 Nov 2019 03:49:44 +0100 Subject: [PATCH 05/18] Adding SparseCSFIndex::Make. --- cpp/src/arrow/sparse_tensor.cc | 63 ++++++++++++++++++++------ cpp/src/arrow/sparse_tensor.h | 11 +++++ cpp/src/arrow/sparse_tensor_test.cc | 68 +++++++++++++++-------------- 3 files changed, 96 insertions(+), 46 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index f8f8bf425e3..e67fbfb3ebb 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -438,6 +438,7 @@ class SparseTensorConverter template Status Convert() { using c_index_value_type = typename IndexValueType::c_type; + RETURN_NOT_OK(CheckMaximumValue(std::numeric_limits::max())); const int64_t indices_elsize = sizeof(c_index_value_type); std::shared_ptr sparse_coo_tensor; @@ -463,8 +464,8 @@ class SparseTensorConverter AllocateBuffer(pool_, indices_elsize * ndim * nonzero_count, &indices_buffer)); RETURN_NOT_OK(AllocateBuffer(pool_, indices_elsize * (ndim - 1) * (nonzero_count + 1), &indptr_buffer)); - int64_t* indices = reinterpret_cast(indices_buffer->mutable_data()); - int64_t* indptr = reinterpret_cast(indptr_buffer->mutable_data()); + auto* indices = reinterpret_cast(indices_buffer->mutable_data()); + auto* indptr = reinterpret_cast(indptr_buffer->mutable_data()); for (int64_t row = 0; row 
< nonzero_count; ++row) { bool tree_split = false; @@ -477,16 +478,19 @@ class SparseTensorConverter indices[column * nonzero_count + counts[column]] = coords->Value({row, column}); - indptr[column * (nonzero_count + 1) + counts[column]] = counts[column + 1]; + indptr[column * (nonzero_count + 1) + counts[column]] = + static_cast(counts[column + 1]); ++counts[column]; } } } for (int64_t column = 0; column < ndim; ++column) { - indptr[column * (nonzero_count + 1) + counts[column]] = counts[column + 1]; + indptr[column * (nonzero_count + 1) + counts[column]] = + static_cast(counts[column + 1]); } + // Remove gaps from buffers int64_t total_size = counts[0]; for (int64_t column = 1; column < ndim; ++column) { for (int64_t i = 0; i < counts[column] + 1; ++i) { @@ -506,8 +510,10 @@ class SparseTensorConverter RETURN_NOT_OK(AllocateBuffer(pool_, indices_elsize * total_size - nonzero_count + ndim - 1, &out_indptr_buffer)); - int64_t* out_indices = reinterpret_cast(out_indices_buffer->mutable_data()); - int64_t* out_indptr = reinterpret_cast(out_indptr_buffer->mutable_data()); + auto* out_indices = + reinterpret_cast(out_indices_buffer->mutable_data()); + auto* out_indptr = + reinterpret_cast(out_indptr_buffer->mutable_data()); for (int64_t i = 0; i < total_size; ++i) out_indices[i] = indices[i]; @@ -516,12 +522,7 @@ class SparseTensorConverter // Construct SparseCSFTensor std::vector out_indptr_shape({total_size - nonzero_count + ndim - 1}); - std::shared_ptr out_indptr_tensor = - std::make_shared(int64(), out_indptr_buffer, out_indptr_shape); - std::vector out_indices_shape({total_size}); - std::shared_ptr out_indices_tensor = - std::make_shared(int64(), out_indices_buffer, out_indices_shape); std::vector indptr_offsets(ndim - 1); std::vector indices_offsets(ndim); @@ -534,9 +535,11 @@ class SparseTensorConverter for (int64_t i = 0; i < ndim; ++i) indices_offsets[i + 1] = indices_offsets[i] + counts[i]; - sparse_index = - std::make_shared(out_indptr_tensor, 
out_indices_tensor, - indptr_offsets, indices_offsets, axis_order); + sparse_index = std::make_shared( + std::make_shared(index_value_type_, out_indptr_buffer, out_indptr_shape), + std::make_shared(index_value_type_, out_indices_buffer, + out_indices_shape), + indptr_offsets, indices_offsets, axis_order); data = sparse_coo_tensor->data(); return Status::OK(); @@ -565,6 +568,22 @@ class SparseTensorConverter using BaseClass::index_value_type_; using BaseClass::pool_; using BaseClass::tensor_; + + template + inline Status CheckMaximumValue(const c_value_type type_max) const { + auto max_dimension = + *std::max_element(tensor_.shape().begin(), tensor_.shape().end()); + if (static_cast(type_max) < max_dimension) { + // LCOV_EXCL_START: The following invalid causes program failure. + return Status::Invalid("The bit width of the index value type is too small"); + // LCOV_EXCL_STOP + } + return Status::OK(); + } + + inline Status CheckMaximumValue(const int64_t) const { return Status::OK(); } + + inline Status CheckMaximumValue(const uint64_t) const { return Status::OK(); } }; // ---------------------------------------------------------------------- @@ -955,6 +974,22 @@ void CheckSparseCSXIndexValidity(const std::shared_ptr& indptr_type, // ---------------------------------------------------------------------- // SparseCSFIndex +Status SparseCSFIndex::Make(const std::shared_ptr indices_type, + const std::vector& indptr_shape, + const std::vector& indices_shape, + const std::vector& indptr_offsets, + const std::vector& indices_offsets, + const std::vector& axis_order, + std::shared_ptr indptr_data, + std::shared_ptr indices_data, + std::shared_ptr* out) { + *out = std::make_shared( + std::make_shared(indices_type, indptr_data, indptr_shape), + std::make_shared(indices_type, indices_data, indices_shape), indptr_offsets, + indices_offsets, axis_order); + return Status::OK(); +} + // Constructor with two index vectors SparseCSFIndex::SparseCSFIndex(const std::shared_ptr& 
indptr, const std::shared_ptr& indices, diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index a8e38b89ad2..c3c36ba8402 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -349,6 +349,17 @@ class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase indices_type, + const std::vector& indptr_shape, + const std::vector& indices_shape, + const std::vector& indptr_offsets, + const std::vector& indices_offsets, + const std::vector& axis_order, + std::shared_ptr indptr_data, + std::shared_ptr indices_data, + std::shared_ptr* out); + /// \brief Construct SparseCSFIndex from two index vectors explicit SparseCSFIndex(const std::shared_ptr& indptr, const std::shared_ptr& indices, diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 5496df8b003..170cc2cc29c 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -912,30 +912,37 @@ TEST_F(TestSparseCSCMatrix, TestToTensor) { template class TestSparseCSFTensorBase : public ::testing::Test { -public: - void SetUp() { - shape_ = {6, 4}; - dim_names_ = {"foo", "bar"}; - - // Dense representation: - // [ - // 1 0 2 0 - // 0 3 0 4 - // 5 0 6 0 - // 0 11 0 12 - // 13 0 14 0 - // 0 15 0 16 - // ] - std::vector dense_values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, - 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; - auto dense_data = Buffer::Wrap(dense_values); - NumericTensor dense_tensor(dense_data, shape_, {}, dim_names_); - } - -protected: - std::vector shape_; - std::vector dim_names_; - std::shared_ptr sparse_tensor_from_dense_; + public: + void SetUp() { + shape_ = {3, 3, 3, 4}; + dim_names_ = {"a", "b", "c", "d"}; + + // COO representation: + // X[1, 1, 1, 2] := 1 + // X[1, 1, 1, 3] := 2 + // X[1, 2, 1, 1] := 3 + // X[1, 2, 1, 3] := 4 + // X[1, 2, 2, 1] := 5 + // X[2, 2, 2, 1] := 6 + // X[2, 2, 2, 2] := 7 + // X[2, 2, 2, 3] := 8 + + std::vector dense_values = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; + auto dense_data = Buffer::Wrap(dense_values); + NumericTensor dense_tensor(dense_data, shape_, {}, dim_names_); + ASSERT_OK(SparseCSFTensor::Make(dense_tensor, + TypeTraits::type_singleton(), + &sparse_tensor_from_dense_)); + } + + protected: + std::vector shape_; + std::vector dim_names_; + std::shared_ptr sparse_tensor_from_dense_; }; class TestSparseCSFTensor : public TestSparseCSFTensorBase {}; @@ -957,15 +964,12 @@ TEST_F(TestSparseCSFTensor, TestToTensor) { std::shared_ptr indptr_buffer = Buffer::Wrap(indptr_values); std::shared_ptr indices_buffer = Buffer::Wrap(indices_values); - std::shared_ptr indptr = - std::make_shared(int64(), indptr_buffer, indptr_shape); - std::shared_ptr indices = - std::make_shared(int64(), indices_buffer, indices_shape); - - std::shared_ptr sparse_index = std::make_shared( - indptr, indices, indptr_offsets, indices_offsets, axis_order); + std::shared_ptr si; + ASSERT_OK(SparseCSFIndex::Make(int64(), indptr_shape, indices_shape, indptr_offsets, + indices_offsets, axis_order, indptr_buffer, + indices_buffer, &si)); std::shared_ptr sparse_tensor = std::make_shared( - sparse_index, int64(), data_buffer, sparse_tensor_shape, dim_names); + si, int64(), data_buffer, sparse_tensor_shape, dim_names); ASSERT_EQ(8, sparse_tensor->non_zero_length()); From a322ff5b26280a2263f1dda594e353a5434348b5 Mon Sep 17 00:00:00 2001 From: Rok Date: Mon, 25 Nov 2019 18:47:31 +0100 Subject: [PATCH 06/18] Adding tests for multiple index value types for SparseCSFIndex. 
--- cpp/src/arrow/sparse_tensor.cc | 3 +- cpp/src/arrow/sparse_tensor_test.cc | 106 ++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index e67fbfb3ebb..c18358b4388 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -477,7 +477,8 @@ class SparseTensorConverter if (row > 1) tree_split = true; indices[column * nonzero_count + counts[column]] = - coords->Value({row, column}); + static_cast( + coords->Value({row, column})); indptr[column * (nonzero_count + 1) + counts[column]] = static_cast(counts[column + 1]); ++counts[column]; diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 170cc2cc29c..103fc86a7ee 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -1014,4 +1014,110 @@ TEST_F(TestSparseCSFTensor, CreationFromTensor) { ASSERT_OK(st->ToTensor(&dt)); ASSERT_TRUE(tensor.Equals(*dt)); } + +template +class TestSparseCSFTensorForIndexValueType + : public TestSparseCSFTensorBase { + protected: + std::shared_ptr MakeSparseCSFIndex( + std::vector& indptr_values, + std::vector& indices_values, + const std::vector& indptr_offsets, + const std::vector& indices_offsets, + const std::vector& indptr_shape, const std::vector& indices_shape, + const std::vector& axis_order) const { + auto indptr_data = Buffer::Wrap(indptr_values); + auto indices_data = Buffer::Wrap(indices_values); + auto indptr = + std::make_shared>(indptr_data, indptr_shape); + auto indices = + std::make_shared>(indices_data, indices_shape); + return std::make_shared(indptr, indices, indptr_offsets, + indices_offsets, axis_order); + } + + template + std::shared_ptr MakeSparseTensor( + const std::shared_ptr& si, + std::vector& sparse_values) const { + auto data = Buffer::Wrap(sparse_values); + return std::make_shared(si, + CTypeTraits::type_singleton(), + data, this->shape_, this->dim_names_); 
+ } +}; + +TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType); + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, ToTensor) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + std::vector data_values = {1, 2, 3, 4, 5, 6, 7, 8}; + std::vector indptr_values = {0, 2, 3, 0, 1, 3, 4, 0, 2, 4, 5, 8}; + std::vector indices_values = {1, 2, 1, 2, 2, 1, 1, 2, 2, + 2, 3, 1, 3, 1, 1, 2, 3}; + std::vector indices_offsets = {0, 2, 5, 9}; + std::vector indptr_offsets = {0, 3, 7}; + std::vector axis_order = {0, 1, 2, 3}; + std::vector sparse_tensor_shape({3, 3, 3, 4}); + std::vector indptr_shape({12}); + std::vector indices_shape({17}); + std::vector dim_names({"a", "b", "c", "d"}); + + std::vector dense_values = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; + + std::shared_ptr data_buffer = Buffer::Wrap(data_values); + std::shared_ptr indptr_buffer = Buffer::Wrap(indptr_values); + std::shared_ptr indices_buffer = Buffer::Wrap(indices_values); + std::shared_ptr dense_data = Buffer::Wrap(dense_values); + + std::shared_ptr si = + this->MakeSparseCSFIndex(indptr_values, indices_values, indptr_offsets, + indices_offsets, indptr_shape, indices_shape, axis_order); + std::shared_ptr st = this->MakeSparseTensor(si, data_values); + + ASSERT_EQ(8, st->non_zero_length()); + + std::shared_ptr dt; + ASSERT_OK(st->ToTensor(&dt)); + Tensor tensor(int64(), dense_data, sparse_tensor_shape, {}); + ASSERT_TRUE(tensor.Equals(*dt)); + + std::shared_ptr si2 = + arrow::internal::checked_pointer_cast( + this->sparse_tensor_from_dense_->sparse_index()); + + ASSERT_EQ(si->indices()->type(), si2->indices()->type()); + 
ASSERT_TRUE(si->indptr()->Equals(*si2->indptr())); + ASSERT_TRUE(si->indices()->Equals(*si2->indices())); + ASSERT_TRUE(si->indptr_offsets() == si2->indptr_offsets()); + ASSERT_TRUE(si->indices_offsets() == si2->indices_offsets()); + ASSERT_TRUE(si->indices_offsets() == si2->indices_offsets()); + ASSERT_TRUE(si->axis_order() == si2->axis_order()); + + ASSERT_TRUE(si->Equals(*si2)); + ASSERT_TRUE(st->data()->Equals(*this->sparse_tensor_from_dense_->data())); + // ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_)); +} + +REGISTER_TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType, ToTensor); + +INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestSparseCSFTensorForIndexValueType, Int8Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestSparseCSFTensorForIndexValueType, UInt8Type); +// INSTANTIATE_TYPED_TEST_CASE_P(TestInt16, TestSparseCSFTensorForIndexValueType, +// Int16Type); INSTANTIATE_TYPED_TEST_CASE_P(TestUInt16, +// TestSparseCSFTensorForIndexValueType,UInt16Type); +// INSTANTIATE_TYPED_TEST_CASE_P(TestInt32, TestSparseCSFTensorForIndexValueType, +// Int32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt32, TestSparseCSFTensorForIndexValueType, + UInt32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt64, TestSparseCSFTensorForIndexValueType, Int64Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt64, TestSparseCSFTensorForIndexValueType, + UInt64Type); + } // namespace arrow From eb519471d72977e7ad35f1467daa1308e67ea655 Mon Sep 17 00:00:00 2001 From: Rok Date: Sun, 8 Dec 2019 16:21:06 +0100 Subject: [PATCH 07/18] Switching SparseCSFIndex to '2D' data structure. 
--- cpp/src/arrow/compare.cc | 10 ++ cpp/src/arrow/sparse_tensor.cc | 196 +++++++++++++--------------- cpp/src/arrow/sparse_tensor.h | 67 +++++----- cpp/src/arrow/sparse_tensor_test.cc | 158 ++++++++++------------ format/SparseTensor.fbs | 38 ++---- 5 files changed, 213 insertions(+), 256 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index f7431f80f5f..6e521a32c03 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -1194,6 +1194,12 @@ inline bool SparseTensorEqualsImplDispatch(const SparseTensorImpl&>(right); return SparseTensorEqualsImpl::Compare(left, right_csc); + + case SparseTensorFormat::CSF: { + const auto& right_csf = + checked_cast&>(right); + return SparseTensorEqualsImpl::Compare(left, + right_csf); } default: @@ -1230,6 +1236,10 @@ bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right) { case SparseTensorFormat::CSC: { const auto& left_csc = checked_cast&>(left); return SparseTensorEqualsImplDispatch(left_csc, right); + + case SparseTensorFormat::CSF: { + const auto& left_csf = checked_cast&>(left); + return SparseTensorEqualsImplDispatch(left_csf, right); } default: diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index c18358b4388..c917523a95a 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -23,6 +23,7 @@ #include #include +#include "arrow/buffer_builder.h" #include "arrow/compare.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" @@ -439,10 +440,9 @@ class SparseTensorConverter Status Convert() { using c_index_value_type = typename IndexValueType::c_type; RETURN_NOT_OK(CheckMaximumValue(std::numeric_limits::max())); - const int64_t indices_elsize = sizeof(c_index_value_type); std::shared_ptr sparse_coo_tensor; - RETURN_NOT_OK(SparseCOOTensor::Make(tensor_, &sparse_coo_tensor)); + ARROW_ASSIGN_OR_RAISE(sparse_coo_tensor, SparseCOOTensor::Make(tensor_)); std::shared_ptr coords = 
arrow::internal::checked_pointer_cast( sparse_coo_tensor->sparse_index()) @@ -458,14 +458,8 @@ class SparseTensorConverter std::vector axis_order(ndim); for (int64_t i = 0; i < ndim; ++i) axis_order[i] = i; - std::shared_ptr indices_buffer; - std::shared_ptr indptr_buffer; - RETURN_NOT_OK( - AllocateBuffer(pool_, indices_elsize * ndim * nonzero_count, &indices_buffer)); - RETURN_NOT_OK(AllocateBuffer(pool_, indices_elsize * (ndim - 1) * (nonzero_count + 1), - &indptr_buffer)); - auto* indices = reinterpret_cast(indices_buffer->mutable_data()); - auto* indptr = reinterpret_cast(indptr_buffer->mutable_data()); + std::vector> indptr_buffer_builders(ndim - 1); + std::vector> indices_buffer_builders(ndim); for (int64_t row = 0; row < nonzero_count; ++row) { bool tree_split = false; @@ -476,73 +470,37 @@ class SparseTensorConverter if (tree_split || change || row == 0) { if (row > 1) tree_split = true; - indices[column * nonzero_count + counts[column]] = - static_cast( - coords->Value({row, column})); - indptr[column * (nonzero_count + 1) + counts[column]] = - static_cast(counts[column + 1]); + if (column < ndim - 1) + RETURN_NOT_OK(indptr_buffer_builders[column].Append( + static_cast(counts[column + 1]))); + RETURN_NOT_OK( + indices_buffer_builders[column].Append(static_cast( + coords->Value({row, column})))); ++counts[column]; } } } - - for (int64_t column = 0; column < ndim; ++column) { - indptr[column * (nonzero_count + 1) + counts[column]] = - static_cast(counts[column + 1]); + for (int64_t column = 0; column < ndim - 1; ++column) { + RETURN_NOT_OK(indptr_buffer_builders[column].Append( + static_cast(counts[column + 1]))); } - // Remove gaps from buffers - int64_t total_size = counts[0]; - for (int64_t column = 1; column < ndim; ++column) { - for (int64_t i = 0; i < counts[column] + 1; ++i) { - if (column < ndim - 1) - indptr[total_size + column + i] = indptr[column * (nonzero_count + 1) + i]; - if (i < counts[column]) - indices[total_size + i] = indices[column * 
nonzero_count + i]; - } - total_size += counts[column]; - } + std::vector> indptr_buffers(ndim - 1); + std::vector> indices_buffers(ndim); + std::vector indptr_shapes(counts.begin(), counts.end() - 1); + std::vector indices_shapes = counts; - // Copy CSF index data into smaller buffers - std::shared_ptr out_indices_buffer; - std::shared_ptr out_indptr_buffer; - RETURN_NOT_OK( - AllocateBuffer(pool_, indices_elsize * total_size, &out_indices_buffer)); - RETURN_NOT_OK(AllocateBuffer(pool_, - indices_elsize * total_size - nonzero_count + ndim - 1, - &out_indptr_buffer)); - auto* out_indices = - reinterpret_cast(out_indices_buffer->mutable_data()); - auto* out_indptr = - reinterpret_cast(out_indptr_buffer->mutable_data()); - - for (int64_t i = 0; i < total_size; ++i) out_indices[i] = indices[i]; - - for (int64_t i = 0; i < total_size - nonzero_count + ndim - 1; ++i) - out_indptr[i] = indptr[i]; - - // Construct SparseCSFTensor - std::vector out_indptr_shape({total_size - nonzero_count + ndim - 1}); - std::vector out_indices_shape({total_size}); - - std::vector indptr_offsets(ndim - 1); - std::vector indices_offsets(ndim); - std::fill_n(indptr_offsets.begin(), ndim - 1, static_cast(0)); - std::fill_n(indices_offsets.begin(), ndim, static_cast(0)); - - for (int64_t i = 0; i < ndim - 2; ++i) - indptr_offsets[i + 1] = indptr_offsets[i] + counts[i] + 1; - - for (int64_t i = 0; i < ndim; ++i) - indices_offsets[i + 1] = indices_offsets[i] + counts[i]; - - sparse_index = std::make_shared( - std::make_shared(index_value_type_, out_indptr_buffer, out_indptr_shape), - std::make_shared(index_value_type_, out_indices_buffer, - out_indices_shape), - indptr_offsets, indices_offsets, axis_order); - data = sparse_coo_tensor->data(); + for (int64_t column = 0; column < ndim; ++column) + RETURN_NOT_OK( + indices_buffer_builders[column].Finish(&indices_buffers[column], true)); + + for (int64_t column = 0; column < ndim - 1; ++column) + 
RETURN_NOT_OK(indptr_buffer_builders[column].Finish(&indptr_buffers[column], true)); + ARROW_ASSIGN_OR_RAISE( + sparse_index, SparseCSFIndex::Make(index_value_type_, indices_shapes, axis_order, + indptr_buffers, indices_buffers)); + data = sparse_coo_tensor->data(); return Status::OK(); } @@ -686,23 +644,19 @@ void assign_values(int64_t dimension_index, int64_t offset, int64_t first_ptr, const int64_t* raw_data, const std::vector strides, const std::vector axis_order, TYPE* out) { auto dimension = axis_order[dimension_index]; - auto indices_offset = sparse_index->indices_offsets()[dimension]; - auto indptr_offset = sparse_index->indptr_offsets()[dimension]; - int64_t ndim = sparse_index->indices_offsets().size(); - - if (dimension == 0 && ndim > 1) - last_ptr = sparse_index->indptr_offsets()[dimension + 1] - 1; + int64_t ndim = axis_order.size(); + if (dimension == 0 && ndim > 1) last_ptr = sparse_index->indptr()[0]->size() - 1; for (int64_t i = first_ptr; i < last_ptr; ++i) { int64_t tmp_offset = - offset + sparse_index->indices()->Value({indices_offset + i}) * + offset + sparse_index->indices()[dimension]->Value({i}) * strides[dimension]; if (dimension_index < ndim - 1) assign_values( dimension + 1, tmp_offset, - sparse_index->indptr()->Value({indptr_offset + i}), - sparse_index->indptr()->Value({indptr_offset + i + 1}), - sparse_index, raw_data, strides, axis_order, out); + sparse_index->indptr()[dimension]->Value({i}), + sparse_index->indptr()[dimension]->Value({i + 1}), sparse_index, + raw_data, strides, axis_order, out); else out[tmp_offset] = static_cast(raw_data[i]); } @@ -840,8 +794,8 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t case SparseTensorFormat::CSF: { const auto& sparse_index = internal::checked_cast(*sparse_tensor->sparse_index()); - const std::shared_ptr indices = sparse_index.indices(); - type = indices->type(); + const std::vector> indices = sparse_index.indices(); + type = indices[0]->type(); break; } // 
LCOV_EXCL_START: ignore program failure @@ -975,40 +929,68 @@ void CheckSparseCSXIndexValidity(const std::shared_ptr& indptr_type, // ---------------------------------------------------------------------- // SparseCSFIndex -Status SparseCSFIndex::Make(const std::shared_ptr indices_type, - const std::vector& indptr_shape, - const std::vector& indices_shape, - const std::vector& indptr_offsets, - const std::vector& indices_offsets, - const std::vector& axis_order, - std::shared_ptr indptr_data, - std::shared_ptr indices_data, - std::shared_ptr* out) { - *out = std::make_shared( - std::make_shared(indices_type, indptr_data, indptr_shape), - std::make_shared(indices_type, indices_data, indices_shape), indptr_offsets, - indices_offsets, axis_order); +namespace { + +inline Status CheckSparseCSFIndexValidity(const std::shared_ptr& indptr_type, + const std::shared_ptr& indices_type, + const int64_t num_indptrs, + const int64_t num_indices, + const std::vector& indptr_shape, + const std::vector& indices_shape, + const int64_t axis_order_size) { + if (!is_integer(indptr_type->id())) { + return Status::Invalid("Type of SparseCSFIndex indptr must be integer"); + } + if (!is_integer(indices_type->id())) { + return Status::Invalid("Type of SparseCSFIndex indices must be integer"); + } + if (num_indptrs + 1 != num_indices) { + return Status::Invalid( + "SparseCSFIndex length indices must be equal to length inptrs plus one."); + } + if (axis_order_size != num_indices) { + return Status::Invalid( + "SparseCSFIndex length of indices must be equal number of dimensions."); + } return Status::OK(); } +} // namespace + +Result> SparseCSFIndex::Make( + const std::shared_ptr& indptr_type, + const std::shared_ptr& indices_type, + const std::vector& indices_shapes, const std::vector& axis_order, + std::vector> indptr_data, + std::vector> indices_data) { + int64_t ndim = axis_order.size(); + std::vector> indptr(ndim - 1); + std::vector> indices(ndim); + + for (int64_t i = 0; i < ndim - 1; 
++i) + indptr[i] = std::make_shared(indptr_type, indptr_data[i], + std::vector({indices_shapes[i] + 1})); + + for (int64_t i = 0; i < ndim; ++i) + indices[i] = std::make_shared(indices_type, indices_data[i], + std::vector({indices_shapes[i]})); + + return std::make_shared(indptr, indices, axis_order); +} + // Constructor with two index vectors -SparseCSFIndex::SparseCSFIndex(const std::shared_ptr& indptr, - const std::shared_ptr& indices, - const std::vector& indptr_offsets, - const std::vector& indices_offsets, +SparseCSFIndex::SparseCSFIndex(std::vector>& indptr, + std::vector>& indices, const std::vector& axis_order) - : SparseIndexBase(indices->size() - indices_offsets.back()), + : SparseIndexBase(indices.back()->shape()[0]), indptr_(indptr), indices_(indices), - indptr_offsets_(indptr_offsets), - indices_offsets_(indices_offsets), axis_order_(axis_order) { - ARROW_CHECK(is_integer(indptr_->type_id())); - ARROW_CHECK_EQ(1, indptr_->ndim()); - ARROW_CHECK(is_integer(indices_->type_id())); - ARROW_CHECK_EQ(1, indices_->ndim()); - ARROW_CHECK_EQ(indptr_offsets_.size() + 1, indices_offsets_.size()); - ARROW_CHECK_EQ(axis_order_.size(), indices_offsets_.size()); + ARROW_CHECK(CheckSparseCSFIndexValidity(indptr_.front()->type(), + indices_.front()->type(), indptr_.size(), + indices_.size(), indptr_.back()->shape(), + indices_.back()->shape(), axis_order_.size()) + .ok()); } std::string SparseCSFIndex::ToString() const { return std::string("SparseCSFIndex"); } diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index c3c36ba8402..b75c42204f0 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -338,46 +338,44 @@ class ARROW_EXPORT SparseCSCIndex /// /// A CSF sparse index manages the location of its non-zero values by set of /// prefix trees. Each path from a root to leaf forms one tensor non-zero index. -/// CSF is implemented with five vectors. +/// CSF is implemented with three vectors. 
/// -/// Vectors indptr and indices are split into N-1 segments (by indptr_offsets) and -/// N segments (by indices_offsetsy, where N is the number of dimensions. -/// Indptr and indices segments describe the set of prefix trees. -/// -/// Trees traverse dimensions in order given by axis_order. +/// Vectors inptr and indices contain N-1 and N buffers respectively, where N is the +/// number of dimensions. Axis_order is a vector of integers of legth N. Indptr and +/// indices describe the set of prefix trees. Trees traverse dimensions in order given by +/// axis_order. class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase { public: static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSF; /// \brief Make SparseCSFIndex from raw properties - static Status Make(const std::shared_ptr indices_type, - const std::vector& indptr_shape, - const std::vector& indices_shape, - const std::vector& indptr_offsets, - const std::vector& indices_offsets, - const std::vector& axis_order, - std::shared_ptr indptr_data, - std::shared_ptr indices_data, - std::shared_ptr* out); + static Result> Make( + const std::shared_ptr& indptr_type, + const std::shared_ptr& indices_type, + const std::vector& indices_shapes, const std::vector& axis_order, + std::vector> indptr_data, + std::vector> indices_data); + + /// \brief Make SparseCSFIndex from raw properties + static Result> Make( + const std::shared_ptr& indices_type, + const std::vector& indices_shapes, const std::vector& axis_order, + std::vector> indptr_data, + std::vector> indices_data) { + return Make(indices_type, indices_type, indices_shapes, axis_order, indptr_data, + indices_data); + } /// \brief Construct SparseCSFIndex from two index vectors - explicit SparseCSFIndex(const std::shared_ptr& indptr, - const std::shared_ptr& indices, - const std::vector& indptr_offsets, - const std::vector& indices_offsets, + explicit SparseCSFIndex(std::vector>& indptr, + std::vector>& indices, const std::vector& 
axis_order); /// \brief Return a 1D tensor of indptr vector - const std::shared_ptr& indptr() const { return indptr_; } + const std::vector>& indptr() const { return indptr_; } /// \brief Return a 1D tensor of indices vector - const std::shared_ptr& indices() const { return indices_; } - - /// \brief Return a 1D vector of indptr offsets - const std::vector& indptr_offsets() const { return indptr_offsets_; } - - /// \brief Return a vector of indices offsets - const std::vector& indices_offsets() const { return indices_offsets_; } + const std::vector>& indices() const { return indices_; } /// \brief Return a 1D vector specifying the order of axes const std::vector& axis_order() const { return axis_order_; } @@ -387,17 +385,16 @@ class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBaseEquals(*other.indptr()) && indices()->Equals(*other.indices()) && - indptr_offsets() == other.indptr_offsets() && - indices_offsets() == other.indices_offsets() && - axis_order() == other.axis_order(); + for (int64_t i = 0; i < static_cast(indices().size()); ++i) + if (!indices()[i]->Equals(*other.indices()[i])) return false; + for (int64_t i = 0; i < static_cast(indptr().size()); ++i) + if (!indptr()[i]->Equals(*other.indptr()[i])) return false; + return axis_order() == other.axis_order(); } protected: - std::shared_ptr indptr_; - std::shared_ptr indices_; - std::vector indptr_offsets_; - std::vector indices_offsets_; + std::vector> indptr_; + std::vector> indices_; std::vector axis_order_; }; diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 103fc86a7ee..314f1fea213 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -934,9 +934,9 @@ class TestSparseCSFTensorBase : public ::testing::Test { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; auto dense_data = Buffer::Wrap(dense_values); NumericTensor dense_tensor(dense_data, shape_, {}, dim_names_); - 
ASSERT_OK(SparseCSFTensor::Make(dense_tensor, - TypeTraits::type_singleton(), - &sparse_tensor_from_dense_)); + ASSERT_OK_AND_ASSIGN(sparse_tensor_from_dense_, + SparseCSFTensor::Make( + dense_tensor, TypeTraits::type_singleton())); } protected: @@ -947,46 +947,6 @@ class TestSparseCSFTensorBase : public ::testing::Test { class TestSparseCSFTensor : public TestSparseCSFTensorBase {}; -TEST_F(TestSparseCSFTensor, TestToTensor) { - std::vector data_values = {1, 2, 3, 4, 5, 6, 7, 8}; - std::vector indptr_values = {0, 2, 3, 0, 1, 3, 4, 0, 2, 4, 5, 8}; - std::vector indices_values = {1, 2, 1, 2, 2, 1, 1, 2, 2, - 2, 3, 1, 3, 1, 1, 2, 3}; - std::vector indices_offsets = {0, 2, 5, 9}; - std::vector indptr_offsets = {0, 3, 7}; - std::vector axis_order = {0, 1, 2, 3}; - std::vector sparse_tensor_shape({3, 3, 3, 4}); - std::vector indptr_shape({12}); - std::vector indices_shape({17}); - std::vector dim_names({"a", "b", "c", "d"}); - - std::shared_ptr data_buffer = Buffer::Wrap(data_values); - std::shared_ptr indptr_buffer = Buffer::Wrap(indptr_values); - std::shared_ptr indices_buffer = Buffer::Wrap(indices_values); - - std::shared_ptr si; - ASSERT_OK(SparseCSFIndex::Make(int64(), indptr_shape, indices_shape, indptr_offsets, - indices_offsets, axis_order, indptr_buffer, - indices_buffer, &si)); - std::shared_ptr sparse_tensor = std::make_shared( - si, int64(), data_buffer, sparse_tensor_shape, dim_names); - - ASSERT_EQ(8, sparse_tensor->non_zero_length()); - - std::shared_ptr dense_tensor; - ASSERT_OK(sparse_tensor->ToTensor(&dense_tensor)); - - std::vector dense_values = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; - auto dense_data = Buffer::Wrap(dense_values); - Tensor tensor(int64(), 
dense_data, sparse_tensor_shape, {}); - - ASSERT_TRUE(tensor.Equals(*dense_tensor)); -} - TEST_F(TestSparseCSFTensor, CreationFromTensor) { std::vector values = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -999,7 +959,7 @@ TEST_F(TestSparseCSFTensor, CreationFromTensor) { Tensor tensor(int64(), buffer, shape, {}, dim_names); std::shared_ptr st; - ASSERT_OK(SparseCSFTensor::Make(tensor, &st)); + ASSERT_OK_AND_ASSIGN(st, SparseCSFTensor::Make(tensor)); ASSERT_EQ(8, st->non_zero_length()); ASSERT_TRUE(st->is_mutable()); @@ -1049,71 +1009,91 @@ class TestSparseCSFTensorForIndexValueType TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType); -TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, ToTensor) { +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestSparseTensorFromTensor) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + std::vector data_values = {1, 2, 3, 4, 5, 6, 7, 8}; + std::vector> indptr_values = { + {0, 2, 3}, {0, 1, 3, 4}, {0, 2, 4, 5, 8}}; + std::vector> indices_values = { + {1, 2}, {1, 2, 2}, {1, 1, 2, 2}, {2, 3, 1, 3, 1, 1, 2, 3}}; + std::vector> indptr_buffers(3); + std::vector> indices_buffers(4); + std::vector axis_order = {0, 1, 2, 3}; + std::vector sparse_tensor_shape({3, 3, 3, 4}); + std::vector indices_shapes({2, 3, 4, 8}); + std::vector dim_names({"a", "b", "c", "d"}); + + std::shared_ptr data_buffer = Buffer::Wrap(data_values); + for (int64_t i = 0; i < static_cast(indptr_values.size()); ++i) + indptr_buffers[i] = Buffer::Wrap(indptr_values[i]); + for (int64_t i = 0; i < static_cast(indices_values.size()); ++i) + indices_buffers[i] = Buffer::Wrap(indices_values[i]); + + std::shared_ptr sparse_index; + ASSERT_OK_AND_ASSIGN( + sparse_index, + SparseCSFIndex::Make(TypeTraits::type_singleton(), indices_shapes, + axis_order, indptr_buffers, indices_buffers)); + std::shared_ptr sparse_tensor = std::make_shared( + sparse_index, int64(), data_buffer, 
sparse_tensor_shape, dim_names); + + ASSERT_TRUE(sparse_tensor->Equals(*this->sparse_tensor_from_dense_)); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestSparseTensorToTensor) { using IndexValueType = TypeParam; using c_index_value_type = typename IndexValueType::c_type; std::vector data_values = {1, 2, 3, 4, 5, 6, 7, 8}; - std::vector indptr_values = {0, 2, 3, 0, 1, 3, 4, 0, 2, 4, 5, 8}; - std::vector indices_values = {1, 2, 1, 2, 2, 1, 1, 2, 2, - 2, 3, 1, 3, 1, 1, 2, 3}; - std::vector indices_offsets = {0, 2, 5, 9}; - std::vector indptr_offsets = {0, 3, 7}; + std::vector> indptr_values = { + {0, 2, 3}, {0, 1, 3, 4}, {0, 2, 4, 5, 8}}; + std::vector> indices_values = { + {1, 2}, {1, 2, 2}, {1, 1, 2, 2}, {2, 3, 1, 3, 1, 1, 2, 3}}; + std::vector> indptr_buffers(3); + std::vector> indices_buffers(4); std::vector axis_order = {0, 1, 2, 3}; std::vector sparse_tensor_shape({3, 3, 3, 4}); - std::vector indptr_shape({12}); - std::vector indices_shape({17}); + std::vector indices_shapes({2, 3, 4, 8}); std::vector dim_names({"a", "b", "c", "d"}); + std::shared_ptr data_buffer = Buffer::Wrap(data_values); + for (int64_t i = 0; i < static_cast(indptr_values.size()); ++i) + indptr_buffers[i] = Buffer::Wrap(indptr_values[i]); + for (int64_t i = 0; i < static_cast(indices_values.size()); ++i) + indices_buffers[i] = Buffer::Wrap(indices_values[i]); + std::vector dense_values = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; - - std::shared_ptr data_buffer = Buffer::Wrap(data_values); - std::shared_ptr indptr_buffer = Buffer::Wrap(indptr_values); - std::shared_ptr indices_buffer = Buffer::Wrap(indices_values); - std::shared_ptr dense_data = Buffer::Wrap(dense_values); - - std::shared_ptr si = - 
this->MakeSparseCSFIndex(indptr_values, indices_values, indptr_offsets, - indices_offsets, indptr_shape, indices_shape, axis_order); - std::shared_ptr st = this->MakeSparseTensor(si, data_values); - - ASSERT_EQ(8, st->non_zero_length()); - - std::shared_ptr dt; - ASSERT_OK(st->ToTensor(&dt)); + auto dense_data = Buffer::Wrap(dense_values); Tensor tensor(int64(), dense_data, sparse_tensor_shape, {}); - ASSERT_TRUE(tensor.Equals(*dt)); - std::shared_ptr si2 = - arrow::internal::checked_pointer_cast( - this->sparse_tensor_from_dense_->sparse_index()); - - ASSERT_EQ(si->indices()->type(), si2->indices()->type()); - ASSERT_TRUE(si->indptr()->Equals(*si2->indptr())); - ASSERT_TRUE(si->indices()->Equals(*si2->indices())); - ASSERT_TRUE(si->indptr_offsets() == si2->indptr_offsets()); - ASSERT_TRUE(si->indices_offsets() == si2->indices_offsets()); - ASSERT_TRUE(si->indices_offsets() == si2->indices_offsets()); - ASSERT_TRUE(si->axis_order() == si2->axis_order()); - - ASSERT_TRUE(si->Equals(*si2)); - ASSERT_TRUE(st->data()->Equals(*this->sparse_tensor_from_dense_->data())); - // ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_)); + std::shared_ptr sparse_index; + ASSERT_OK_AND_ASSIGN( + sparse_index, + SparseCSFIndex::Make(TypeTraits::type_singleton(), indices_shapes, + axis_order, indptr_buffers, indices_buffers)); + std::shared_ptr sparse_tensor = std::make_shared( + sparse_index, int64(), data_buffer, sparse_tensor_shape, dim_names); + + std::shared_ptr dense_tensor; + ASSERT_OK(sparse_tensor->ToTensor(&dense_tensor)); + ASSERT_TRUE(tensor.Equals(*dense_tensor)); } -REGISTER_TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType, ToTensor); +REGISTER_TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType, + TestSparseTensorFromTensor, TestSparseTensorToTensor); INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestSparseCSFTensorForIndexValueType, Int8Type); INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestSparseCSFTensorForIndexValueType, UInt8Type); -// 
INSTANTIATE_TYPED_TEST_CASE_P(TestInt16, TestSparseCSFTensorForIndexValueType, -// Int16Type); INSTANTIATE_TYPED_TEST_CASE_P(TestUInt16, -// TestSparseCSFTensorForIndexValueType,UInt16Type); -// INSTANTIATE_TYPED_TEST_CASE_P(TestInt32, TestSparseCSFTensorForIndexValueType, -// Int32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt16, TestSparseCSFTensorForIndexValueType, Int16Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt16, TestSparseCSFTensorForIndexValueType, + UInt16Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt32, TestSparseCSFTensorForIndexValueType, Int32Type); INSTANTIATE_TYPED_TEST_CASE_P(TestUInt32, TestSparseCSFTensorForIndexValueType, UInt32Type); INSTANTIATE_TYPED_TEST_CASE_P(TestInt64, TestSparseCSFTensorForIndexValueType, Int64Type); diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index 12e9f870376..56acdfc01e5 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -145,41 +145,29 @@ table SparseTensorIndexCSF { /// / \ / \ \ /|\ /// 2 3 1 3 1 1 2 3 - /// The type of values in indptrBuffer + /// The type of values in indptrBuffers indptrType: Int; - /// indptrBuffer stores the sparsity structure. - /// For example, the indptrBuffer for the above X is: - /// - /// indptrBuffer(X) = [0, 2, 3, 0, 1, 3, 4, 0, 2, 4, 5, 8]. - /// - indptrBuffer: Buffer; - - /// indptrOffsets stores per dimension offset in indptrBuffer. - /// For example, the indptrOffsets for the above X is: + /// indptrBuffers stores the sparsity structure. + /// Position in the indptrBuffers vector signifies the dimension. + /// For example, the indptrBuffers for the above X is: /// - /// indptrOffsets(X) = [0, 3, 7]. + /// indptrBuffer(X) = [[0, 2, 3], [0, 1, 3, 4], [0, 2, 4, 5, 8]]. 
/// - indptrOffsets: [int]; + indptrBuffers: [Buffer]; - /// The type of values in indicesBuffer + /// The type of values in indicesBuffers indicesType: Int; - /// indicesBuffer stores the label of each node, - /// For example, the indicesBuffer for the above X is: - /// - /// indicesBuffer(X) = [1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 3, 1, 3, 1, 1, 2, 3]. - /// - indicesBuffer: Buffer; - - /// indicesOffsets stores per dimension offset in indicesOffsets. - /// For example, the indicesBuffer for the above X is: + /// indicesBuffers stores the label of each node. + /// Position in the indicesBuffers vector signifies the dimension. + /// For example, the indicesBuffers for the above X is: /// - /// indicesOffsets(X) = [0, 2, 5, 9]. + /// indicesBuffer(X) = [[1, 2], [1, 2, 2], [1, 1, 2, 2], [2, 3, 1, 3, 1, 1, 2, 3]]. /// - indicesOffsets: [int]; + indicesBuffers: [Buffer]; - /// axisOrder stores the sequence in which dimensions were traversed. + /// axisOrder stores the sequence in which dimensions were traversed to produce the prefix tree. /// For example, the axisOrder for the above X is: /// /// axisOrder(X) = [0, 1, 2, 3]. From bd0d8c2f80608ec56b0f7fedb74c80236ac989be Mon Sep 17 00:00:00 2001 From: Rok Date: Fri, 13 Dec 2019 17:37:10 +0100 Subject: [PATCH 08/18] Dense to sparse CSF conversion now in order of dimension size. 
--- cpp/src/arrow/sparse_tensor.cc | 12 ++++++------ format/SparseTensor.fbs | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index c917523a95a..9c265f97e8a 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -17,6 +17,7 @@ #include "arrow/sparse_tensor.h" +#include #include #include #include @@ -454,9 +455,7 @@ class SparseTensorConverter std::vector counts(ndim); std::fill_n(counts.begin(), ndim, static_cast(0)); - - std::vector axis_order(ndim); - for (int64_t i = 0; i < ndim; ++i) axis_order[i] = i; + std::vector axis_order = internal::ArgSort(tensor_.shape()); std::vector> indptr_buffer_builders(ndim - 1); std::vector> indices_buffer_builders(ndim); @@ -464,8 +463,9 @@ class SparseTensorConverter for (int64_t row = 0; row < nonzero_count; ++row) { bool tree_split = false; for (int64_t column = 0; column < ndim; ++column) { - bool change = coords->Value({row, column}) != - coords->Value({row - 1, column}); + int64_t dimension = axis_order[column]; + bool change = coords->Value({row, dimension}) != + coords->Value({row - 1, dimension}); if (tree_split || change || row == 0) { if (row > 1) tree_split = true; @@ -475,7 +475,7 @@ class SparseTensorConverter static_cast(counts[column + 1]))); RETURN_NOT_OK( indices_buffer_builders[column].Append(static_cast( - coords->Value({row, column})))); + coords->Value({row, dimension})))); ++counts[column]; } } diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index 56acdfc01e5..e3e8df11d44 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -172,7 +172,7 @@ table SparseTensorIndexCSF { /// /// axisOrder(X) = [0, 1, 2, 3]. /// - axisOrder: [long]; + axisOrder: [Int]; } union SparseTensorIndex { From 6ceb406b6486e8f91c694a2102a8a851a62b1aa7 Mon Sep 17 00:00:00 2001 From: Rok Date: Tue, 17 Dec 2019 04:03:21 +0100 Subject: [PATCH 09/18] Implementing review feedback. 
--- cpp/src/arrow/sparse_tensor.cc | 25 +++++++++++++++++++------ cpp/src/arrow/sparse_tensor.h | 1 + 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 9c265f97e8a..2a4f5e792e4 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -442,6 +442,13 @@ class SparseTensorConverter using c_index_value_type = typename IndexValueType::c_type; RETURN_NOT_OK(CheckMaximumValue(std::numeric_limits::max())); + const int64_t ndim = tensor_.ndim(); + if (ndim < 2) { + // LCOV_EXCL_START: The following invalid causes program failure. + return Status::Invalid("Invalid tensor dimension"); + // LCOV_EXCL_STOP + } + std::shared_ptr sparse_coo_tensor; ARROW_ASSIGN_OR_RAISE(sparse_coo_tensor, SparseCOOTensor::Make(tensor_)); std::shared_ptr coords = @@ -449,8 +456,10 @@ class SparseTensorConverter sparse_coo_tensor->sparse_index()) ->indices(); + // TODO(rok): Coords should be sorted with axis_order priority to improve compression. + // ARROW-4221 would help here as well. 
+ // Convert SparseCOOTensor to long CSF buffers - const int64_t ndim = tensor_.ndim(); const int64_t nonzero_count = sparse_coo_tensor->non_zero_length(); std::vector counts(ndim); @@ -939,18 +948,18 @@ inline Status CheckSparseCSFIndexValidity(const std::shared_ptr& indpt const std::vector& indices_shape, const int64_t axis_order_size) { if (!is_integer(indptr_type->id())) { - return Status::Invalid("Type of SparseCSFIndex indptr must be integer"); + return Status::TypeError("Type of SparseCSFIndex indptr must be integer"); } if (!is_integer(indices_type->id())) { - return Status::Invalid("Type of SparseCSFIndex indices must be integer"); + return Status::TypeError("Type of SparseCSFIndex indices must be integer"); } if (num_indptrs + 1 != num_indices) { return Status::Invalid( - "SparseCSFIndex length indices must be equal to length inptrs plus one."); + "Length of indices must be equal to length of inptrs + 1 for SparseCSFIndex."); } if (axis_order_size != num_indices) { return Status::Invalid( - "SparseCSFIndex length of indices must be equal number of dimensions."); + "Length of indices must be equal number of dimensions for SparseCSFIndex."); } return Status::OK(); } @@ -970,11 +979,15 @@ Result> SparseCSFIndex::Make( for (int64_t i = 0; i < ndim - 1; ++i) indptr[i] = std::make_shared(indptr_type, indptr_data[i], std::vector({indices_shapes[i] + 1})); - for (int64_t i = 0; i < ndim; ++i) indices[i] = std::make_shared(indices_type, indices_data[i], std::vector({indices_shapes[i]})); + ARROW_CHECK(CheckSparseCSFIndexValidity(indptr_type, indices_type, indptr.size(), + indices.size(), indptr.back()->shape(), + indices.back()->shape(), axis_order.size()) + .ok()); + return std::make_shared(indptr, indices, axis_order); } diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index b75c42204f0..64e730b78d3 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -347,6 +347,7 @@ class ARROW_EXPORT SparseCSCIndex class 
ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase { public: static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSF; + static constexpr char const* kTypeName = "SparseCSFIndex"; /// \brief Make SparseCSFIndex from raw properties static Result> Make( From 4f2bf00ddc5b2feaf5810df51d39157859f30193 Mon Sep 17 00:00:00 2001 From: Rok Date: Tue, 17 Dec 2019 22:06:50 +0100 Subject: [PATCH 10/18] Work on CSF index tests. --- cpp/src/arrow/sparse_tensor.cc | 26 +++++---- cpp/src/arrow/sparse_tensor_test.cc | 91 ++++++++++++++++------------- 2 files changed, 67 insertions(+), 50 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 2a4f5e792e4..404cff5a841 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -443,6 +443,8 @@ class SparseTensorConverter RETURN_NOT_OK(CheckMaximumValue(std::numeric_limits::max())); const int64_t ndim = tensor_.ndim(); + std::vector axis_order = internal::ArgSort(tensor_.shape()); + if (ndim < 2) { // LCOV_EXCL_START: The following invalid causes program failure. 
return Status::Invalid("Invalid tensor dimension"); @@ -464,8 +466,6 @@ class SparseTensorConverter std::vector counts(ndim); std::fill_n(counts.begin(), ndim, static_cast(0)); - std::vector axis_order = internal::ArgSort(tensor_.shape()); - std::vector> indptr_buffer_builders(ndim - 1); std::vector> indices_buffer_builders(ndim); @@ -477,7 +477,7 @@ class SparseTensorConverter coords->Value({row - 1, dimension}); if (tree_split || change || row == 0) { - if (row > 1) tree_split = true; + if (row > 1 || change) tree_split = true; if (column < ndim - 1) RETURN_NOT_OK(indptr_buffer_builders[column].Append( @@ -648,19 +648,18 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor, } template -void assign_values(int64_t dimension_index, int64_t offset, int64_t first_ptr, - int64_t last_ptr, const SparseCSFIndex* sparse_index, - const int64_t* raw_data, const std::vector strides, +void assign_values(int64_t dimension, int64_t offset, int64_t first_ptr, int64_t last_ptr, + const SparseCSFIndex* sparse_index, const int64_t* raw_data, + const std::vector strides, const std::vector axis_order, TYPE* out) { - auto dimension = axis_order[dimension_index]; int64_t ndim = axis_order.size(); - if (dimension == 0 && ndim > 1) last_ptr = sparse_index->indptr()[0]->size() - 1; for (int64_t i = first_ptr; i < last_ptr; ++i) { int64_t tmp_offset = offset + sparse_index->indices()[dimension]->Value({i}) * - strides[dimension]; - if (dimension_index < ndim - 1) + strides[axis_order[dimension]]; + + if (dimension < ndim - 1) assign_values( dimension + 1, tmp_offset, sparse_index->indptr()[dimension]->Value({i}), @@ -756,8 +755,13 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t case SparseTensorFormat::CSF: { const auto& sparse_index = internal::checked_cast(*sparse_tensor->sparse_index()); + int64_t last_ptr_index = sparse_index.indptr()[0]->size() - 1; + int64_t first_ptr = sparse_index.indptr()[0]->Value({0}); + int64_t last_ptr = + 
sparse_index.indptr()[0]->Value({last_ptr_index}); + assign_values( - 0, 0, 0, 0, &sparse_index, + 0, 0, first_ptr, last_ptr, &sparse_index, reinterpret_cast(sparse_tensor->raw_data()), strides, sparse_index.axis_order(), values); *out = std::make_shared(sparse_tensor->type(), values_buffer, diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 314f1fea213..a5723aa2adb 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -914,24 +914,15 @@ template class TestSparseCSFTensorBase : public ::testing::Test { public: void SetUp() { - shape_ = {3, 3, 3, 4}; + shape_ = {4, 3, 5, 2}; dim_names_ = {"a", "b", "c", "d"}; - // COO representation: - // X[1, 1, 1, 2] := 1 - // X[1, 1, 1, 3] := 2 - // X[1, 2, 1, 1] := 3 - // X[1, 2, 1, 3] := 4 - // X[1, 2, 2, 1] := 5 - // X[2, 2, 2, 1] := 6 - // X[2, 2, 2, 2] := 7 - // X[2, 2, 2, 3] := 8 - std::vector dense_values = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; + 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 7, 0, 8}; auto dense_data = Buffer::Wrap(dense_values); NumericTensor dense_tensor(dense_data, shape_, {}, dim_names_); ASSERT_OK_AND_ASSIGN(sparse_tensor_from_dense_, @@ -949,11 +940,12 @@ class TestSparseCSFTensor : public TestSparseCSFTensorBase {}; TEST_F(TestSparseCSFTensor, CreationFromTensor) { std::vector values = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; - std::vector shape({3, 3, 3, 4}); + 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 7, 0, 8}; + std::vector shape({4, 3, 5, 2}); std::vector dim_names({"a", "b", "c", "d"}); std::shared_ptr buffer = Buffer::Wrap(values); Tensor tensor(int64(), buffer, shape, {}, dim_names); @@ -961,8 +953,29 @@ TEST_F(TestSparseCSFTensor, CreationFromTensor) { std::shared_ptr st; ASSERT_OK_AND_ASSIGN(st, SparseCSFTensor::Make(tensor)); + std::vector> indptr_values = { + {0, 1, 4, 6}, {0, 1, 2, 3, 4, 5, 6}, {0, 1, 2, 3, 5, 6, 8}}; + std::vector> indices_values = { + {1, 0, 1}, {0, 0, 1, 0, 1, 2}, {0, 0, 0, 1, 3, 3}, {0, 1, 0, 0, 3, 4, 3, 4}}; + std::vector> indptr_buffers(3); + std::vector> indices_buffers(4); + std::vector axis_order = {3, 1, 0, 2}; + std::vector indices_shapes = {3, 6, 6, 8}; + + for (int64_t i = 0; i < static_cast(indptr_values.size()); ++i) + indptr_buffers[i] = Buffer::Wrap(indptr_values[i]); + for (int64_t i = 0; i < static_cast(indices_values.size()); ++i) + indices_buffers[i] = Buffer::Wrap(indices_values[i]); + + std::shared_ptr sparse_index; + ASSERT_OK_AND_ASSIGN(sparse_index, + SparseCSFIndex::Make(tensor.type(), indices_shapes, axis_order, + indptr_buffers, indices_buffers)); + + const auto& si = internal::checked_cast(*st->sparse_index()); ASSERT_EQ(8, st->non_zero_length()); ASSERT_TRUE(st->is_mutable()); + ASSERT_TRUE(si.Equals(*sparse_index)); 
ASSERT_EQ(dim_names, st->dim_names()); ASSERT_EQ("a", st->dim_name(0)); @@ -1015,14 +1028,14 @@ TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestSparseTensorFromTensor) { std::vector data_values = {1, 2, 3, 4, 5, 6, 7, 8}; std::vector> indptr_values = { - {0, 2, 3}, {0, 1, 3, 4}, {0, 2, 4, 5, 8}}; + {0, 1, 4, 6}, {0, 1, 2, 3, 4, 5, 6}, {0, 1, 2, 3, 5, 6, 8}}; std::vector> indices_values = { - {1, 2}, {1, 2, 2}, {1, 1, 2, 2}, {2, 3, 1, 3, 1, 1, 2, 3}}; + {1, 0, 1}, {0, 0, 1, 0, 1, 2}, {0, 0, 0, 1, 3, 3}, {0, 1, 0, 0, 3, 4, 3, 4}}; std::vector> indptr_buffers(3); std::vector> indices_buffers(4); - std::vector axis_order = {0, 1, 2, 3}; - std::vector sparse_tensor_shape({3, 3, 3, 4}); - std::vector indices_shapes({2, 3, 4, 8}); + std::vector axis_order = {3, 1, 0, 2}; + std::vector sparse_tensor_shape({4, 3, 5, 2}); + std::vector indices_shapes = {3, 6, 6, 8}; std::vector dim_names({"a", "b", "c", "d"}); std::shared_ptr data_buffer = Buffer::Wrap(data_values); @@ -1048,14 +1061,14 @@ TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestSparseTensorToTensor) { std::vector data_values = {1, 2, 3, 4, 5, 6, 7, 8}; std::vector> indptr_values = { - {0, 2, 3}, {0, 1, 3, 4}, {0, 2, 4, 5, 8}}; + {0, 1, 4, 6}, {0, 1, 2, 3, 4, 5, 6}, {0, 1, 2, 3, 5, 6, 8}}; std::vector> indices_values = { - {1, 2}, {1, 2, 2}, {1, 1, 2, 2}, {2, 3, 1, 3, 1, 1, 2, 3}}; + {1, 0, 1}, {0, 0, 1, 0, 1, 2}, {0, 0, 0, 1, 3, 3}, {0, 1, 0, 0, 3, 4, 3, 4}}; std::vector> indptr_buffers(3); std::vector> indices_buffers(4); - std::vector axis_order = {0, 1, 2, 3}; - std::vector sparse_tensor_shape({3, 3, 3, 4}); - std::vector indices_shapes({2, 3, 4, 8}); + std::vector axis_order = {3, 1, 0, 2}; + std::vector indices_shapes = {3, 6, 6, 8}; + std::vector sparse_tensor_shape({4, 3, 5, 2}); std::vector dim_names({"a", "b", "c", "d"}); std::shared_ptr data_buffer = Buffer::Wrap(data_values); @@ -1065,10 +1078,11 @@ TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestSparseTensorToTensor) { 
indices_buffers[i] = Buffer::Wrap(indices_values[i]); std::vector dense_values = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; + 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 7, 0, 8}; auto dense_data = Buffer::Wrap(dense_values); Tensor tensor(int64(), dense_data, sparse_tensor_shape, {}); @@ -1078,11 +1092,11 @@ TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestSparseTensorToTensor) { SparseCSFIndex::Make(TypeTraits::type_singleton(), indices_shapes, axis_order, indptr_buffers, indices_buffers)); std::shared_ptr sparse_tensor = std::make_shared( - sparse_index, int64(), data_buffer, sparse_tensor_shape, dim_names); + sparse_index, tensor.type(), data_buffer, sparse_tensor_shape, dim_names); - std::shared_ptr dense_tensor; - ASSERT_OK(sparse_tensor->ToTensor(&dense_tensor)); - ASSERT_TRUE(tensor.Equals(*dense_tensor)); + // std::shared_ptr dense_tensor; + // ASSERT_OK(sparse_tensor->ToTensor(&dense_tensor)); + // ASSERT_TRUE(tensor.Equals(*dense_tensor)); } REGISTER_TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType, @@ -1099,5 +1113,4 @@ INSTANTIATE_TYPED_TEST_CASE_P(TestUInt32, TestSparseCSFTensorForIndexValueType, INSTANTIATE_TYPED_TEST_CASE_P(TestInt64, TestSparseCSFTensorForIndexValueType, Int64Type); INSTANTIATE_TYPED_TEST_CASE_P(TestUInt64, TestSparseCSFTensorForIndexValueType, UInt64Type); - } // namespace arrow From 24a831f3eab637260064c91f87e48b7cbd3148d2 
Mon Sep 17 00:00:00 2001 From: Rok Date: Wed, 18 Dec 2019 11:11:45 +0100 Subject: [PATCH 11/18] Style. --- cpp/src/arrow/compare.cc | 2 ++ cpp/src/arrow/sparse_tensor.cc | 2 -- cpp/src/arrow/sparse_tensor_test.cc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 6e521a32c03..d2322009ea8 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -1194,6 +1194,7 @@ inline bool SparseTensorEqualsImplDispatch(const SparseTensorImpl&>(right); return SparseTensorEqualsImpl::Compare(left, right_csc); + } case SparseTensorFormat::CSF: { const auto& right_csf = @@ -1236,6 +1237,7 @@ bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right) { case SparseTensorFormat::CSC: { const auto& left_csc = checked_cast&>(left); return SparseTensorEqualsImplDispatch(left_csc, right); + } case SparseTensorFormat::CSF: { const auto& left_csf = checked_cast&>(left); diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 404cff5a841..4f73f7940db 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -735,8 +735,6 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t internal::checked_cast(*sparse_tensor->sparse_index()); const std::shared_ptr indptr = sparse_index.indptr(); const std::shared_ptr indices = sparse_index.indices(); - const auto raw_data = - reinterpret_cast(sparse_tensor->raw_data()); int64_t offset; for (int64_t j = 0; j < indptr->size() - 1; ++j) { diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index a5723aa2adb..de0a793ede9 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -933,7 +933,7 @@ class TestSparseCSFTensorBase : public ::testing::Test { protected: std::vector shape_; std::vector dim_names_; - std::shared_ptr sparse_tensor_from_dense_; + std::shared_ptr sparse_tensor_from_dense_; }; class 
TestSparseCSFTensor : public TestSparseCSFTensorBase {}; From d9ff47e67c3e0147c0a5b142a459cf0ade82da88 Mon Sep 17 00:00:00 2001 From: Rok Date: Fri, 10 Jan 2020 00:05:00 +0100 Subject: [PATCH 12/18] Further work and implementing review feedback. --- cpp/src/arrow/sparse_tensor.cc | 127 ++++++----- cpp/src/arrow/sparse_tensor_test.cc | 322 +++++++++++++++++++--------- 2 files changed, 290 insertions(+), 159 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 4f73f7940db..c8f08c453ff 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -444,56 +444,70 @@ class SparseTensorConverter const int64_t ndim = tensor_.ndim(); std::vector axis_order = internal::ArgSort(tensor_.shape()); + int64_t nonzero_count = -1; + RETURN_NOT_OK(tensor_.CountNonZero(&nonzero_count)); - if (ndim < 2) { - // LCOV_EXCL_START: The following invalid causes program failure. - return Status::Invalid("Invalid tensor dimension"); - // LCOV_EXCL_STOP - } + std::shared_ptr values_buffer; + RETURN_NOT_OK( + AllocateBuffer(pool_, sizeof(value_type) * nonzero_count, &values_buffer)); + value_type* values = reinterpret_cast(values_buffer->mutable_data()); - std::shared_ptr sparse_coo_tensor; - ARROW_ASSIGN_OR_RAISE(sparse_coo_tensor, SparseCOOTensor::Make(tensor_)); - std::shared_ptr coords = - arrow::internal::checked_pointer_cast( - sparse_coo_tensor->sparse_index()) - ->indices(); + std::vector counts(ndim, 0); + std::vector coord(ndim, 0); + std::vector previous_coord(ndim, -1); + std::vector> indptr_buffer_builders(ndim - 1); + std::vector> indices_buffer_builders(ndim); - // TODO(rok): Coords should be sorted with axis_order priority to improve compression. - // ARROW-4221 would help here as well. 
+ if (ndim <= 1) { + return Status::NotImplemented("TODO for ndim <= 1"); + } else { + const std::vector& shape = tensor_.shape(); + for (int64_t n = tensor_.size(); n > 0; n--) { + const value_type x = tensor_.Value(coord); - // Convert SparseCOOTensor to long CSF buffers - const int64_t nonzero_count = sparse_coo_tensor->non_zero_length(); + if (tensor_.Value(coord) != 0) { + bool tree_split = false; + *values++ = x; - std::vector counts(ndim); - std::fill_n(counts.begin(), ndim, static_cast(0)); - std::vector> indptr_buffer_builders(ndim - 1); - std::vector> indices_buffer_builders(ndim); + for (int64_t i = 0; i < ndim; ++i) { + int64_t dimension = axis_order[i]; + bool change = coord[dimension] != previous_coord[dimension]; + + if (tree_split || change) { + if (change) tree_split = true; + + if (i < ndim - 1) + RETURN_NOT_OK(indptr_buffer_builders[i].Append( + static_cast(counts[dimension + 1]))); + RETURN_NOT_OK(indices_buffer_builders[i].Append( + static_cast(coord[dimension]))); + ++counts[dimension]; + } + } + previous_coord = coord; + } - for (int64_t row = 0; row < nonzero_count; ++row) { - bool tree_split = false; - for (int64_t column = 0; column < ndim; ++column) { - int64_t dimension = axis_order[column]; - bool change = coords->Value({row, dimension}) != - coords->Value({row - 1, dimension}); - - if (tree_split || change || row == 0) { - if (row > 1 || change) tree_split = true; - - if (column < ndim - 1) - RETURN_NOT_OK(indptr_buffer_builders[column].Append( - static_cast(counts[column + 1]))); - RETURN_NOT_OK( - indices_buffer_builders[column].Append(static_cast( - coords->Value({row, dimension})))); - ++counts[column]; + // increment index + ++coord[ndim - 1]; + if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) { + int64_t d = ndim - 1; + while (d > 0 && coord[d] == shape[d]) { + coord[d] = 0; + ++coord[d - 1]; + --d; + } } } } + for (int64_t column = 0; column < ndim - 1; ++column) { RETURN_NOT_OK(indptr_buffer_builders[column].Append( 
static_cast(counts[column + 1]))); } + // make results + data = values_buffer; + std::vector> indptr_buffers(ndim - 1); std::vector> indices_buffers(ndim); std::vector indptr_shapes(counts.begin(), counts.end() - 1); @@ -509,7 +523,6 @@ class SparseTensorConverter ARROW_ASSIGN_OR_RAISE( sparse_index, SparseCSFIndex::Make(index_value_type_, indices_shapes, axis_order, indptr_buffers, indices_buffers)); - data = sparse_coo_tensor->data(); return Status::OK(); } @@ -647,11 +660,14 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor, } } +namespace { + template -void assign_values(int64_t dimension, int64_t offset, int64_t first_ptr, int64_t last_ptr, - const SparseCSFIndex* sparse_index, const int64_t* raw_data, - const std::vector strides, - const std::vector axis_order, TYPE* out) { +void ExpandSparseCSFTensorValues(int64_t dimension, int64_t offset, int64_t first_ptr, + int64_t last_ptr, const SparseCSFIndex* sparse_index, + const int64_t* raw_data, + const std::vector strides, + const std::vector axis_order, TYPE* out) { int64_t ndim = axis_order.size(); for (int64_t i = first_ptr; i < last_ptr; ++i) { @@ -660,7 +676,7 @@ void assign_values(int64_t dimension, int64_t offset, int64_t first_ptr, int64_t strides[axis_order[dimension]]; if (dimension < ndim - 1) - assign_values( + ExpandSparseCSFTensorValues( dimension + 1, tmp_offset, sparse_index->indptr()[dimension]->Value({i}), sparse_index->indptr()[dimension]->Value({i + 1}), sparse_index, @@ -670,6 +686,8 @@ void assign_values(int64_t dimension, int64_t offset, int64_t first_ptr, int64_t } } +} // namespace + template Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor, std::shared_ptr* out) { @@ -753,13 +771,9 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t case SparseTensorFormat::CSF: { const auto& sparse_index = internal::checked_cast(*sparse_tensor->sparse_index()); - int64_t last_ptr_index = sparse_index.indptr()[0]->size() - 1; - 
int64_t first_ptr = sparse_index.indptr()[0]->Value({0}); - int64_t last_ptr = - sparse_index.indptr()[0]->Value({last_ptr_index}); - assign_values( - 0, 0, first_ptr, last_ptr, &sparse_index, + ExpandSparseCSFTensorValues( + 0, 0, 0, sparse_index.indptr()[0]->size() - 1, &sparse_index, reinterpret_cast(sparse_tensor->raw_data()), strides, sparse_index.axis_order(), values); *out = std::make_shared(sparse_tensor->type(), values_buffer, @@ -985,10 +999,9 @@ Result> SparseCSFIndex::Make( indices[i] = std::make_shared(indices_type, indices_data[i], std::vector({indices_shapes[i]})); - ARROW_CHECK(CheckSparseCSFIndexValidity(indptr_type, indices_type, indptr.size(), - indices.size(), indptr.back()->shape(), - indices.back()->shape(), axis_order.size()) - .ok()); + RETURN_NOT_OK(CheckSparseCSFIndexValidity(indptr_type, indices_type, indptr.size(), + indices.size(), indptr.back()->shape(), + indices.back()->shape(), axis_order.size())); return std::make_shared(indptr, indices, axis_order); } @@ -997,15 +1010,13 @@ Result> SparseCSFIndex::Make( SparseCSFIndex::SparseCSFIndex(std::vector>& indptr, std::vector>& indices, const std::vector& axis_order) - : SparseIndexBase(indices.back()->shape()[0]), + : SparseIndexBase(indices.back()->size()), indptr_(indptr), indices_(indices), axis_order_(axis_order) { - ARROW_CHECK(CheckSparseCSFIndexValidity(indptr_.front()->type(), - indices_.front()->type(), indptr_.size(), - indices_.size(), indptr_.back()->shape(), - indices_.back()->shape(), axis_order_.size()) - .ok()); + ARROW_CHECK_OK(CheckSparseCSFIndexValidity( + indptr_.front()->type(), indices_.front()->type(), indptr_.size(), indices_.size(), + indptr_.back()->shape(), indices_.back()->shape(), axis_order_.size())); } std::string SparseCSFIndex::ToString() const { return std::string("SparseCSFIndex"); } diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index de0a793ede9..6c9a64e61a1 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ 
b/cpp/src/arrow/sparse_tensor_test.cc @@ -26,6 +26,7 @@ #include +#include #include "arrow/sparse_tensor.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" @@ -914,15 +915,24 @@ template class TestSparseCSFTensorBase : public ::testing::Test { public: void SetUp() { - shape_ = {4, 3, 5, 2}; + shape_ = {3, 3, 3, 4}; dim_names_ = {"a", "b", "c", "d"}; + // COO representation: + // X[1, 1, 1, 2] := 1 + // X[1, 1, 1, 4] := 2 + // X[1, 2, 1, 1] := 3 + // X[1, 2, 1, 3] := 4 + // X[1, 2, 2, 1] := 5 + // X[2, 2, 2, 1] := 6 + // X[2, 2, 2, 2] := 7 + // X[2, 2, 2, 3] := 8 + std::vector dense_values = { - 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 7, 0, 8}; + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; auto dense_data = Buffer::Wrap(dense_values); NumericTensor dense_tensor(dense_data, shape_, {}, dim_names_); ASSERT_OK_AND_ASSIGN(sparse_tensor_from_dense_, @@ -938,51 +948,63 @@ class TestSparseCSFTensorBase : public ::testing::Test { class TestSparseCSFTensor : public TestSparseCSFTensorBase {}; -TEST_F(TestSparseCSFTensor, CreationFromTensor) { - std::vector values = { - 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 7, 0, 8}; - std::vector shape({4, 3, 5, 2}); - std::vector dim_names({"a", "b", "c", "d"}); +TEST_F(TestSparseCSFTensor, CreateFromBuffers1) { + std::vector> indptr_values = {{0, 2, 3, 5}}; + std::vector> indices_values = {{0, 1, 3}, {0, 3, 1, 3, 5}}; + std::vector indices_shapes({3, 5}); + std::vector axis_order = {0, 1}; + std::vector dim_names({"a", "b"}); + std::vector data_values = {1, 3, 2, 4, 5}; + std::vector values = {1, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 5}; + std::vector shape({4, 6}); + + std::vector> indptr_buffers; + std::vector> indices_buffers; + std::shared_ptr data_buffer = Buffer::Wrap(data_values); std::shared_ptr buffer = Buffer::Wrap(values); - Tensor tensor(int64(), buffer, shape, {}, dim_names); - - std::shared_ptr st; - ASSERT_OK_AND_ASSIGN(st, SparseCSFTensor::Make(tensor)); + for (auto& indptr : indptr_values) indptr_buffers.push_back(Buffer::Wrap(indptr)); + for (auto& indices : indices_values) indices_buffers.push_back(Buffer::Wrap(indices)); - std::vector> indptr_values = { - {0, 1, 4, 6}, {0, 1, 2, 3, 4, 5, 6}, {0, 1, 2, 3, 5, 6, 8}}; - std::vector> indices_values = { - {1, 0, 1}, {0, 0, 1, 0, 1, 2}, {0, 0, 0, 1, 3, 3}, {0, 1, 0, 0, 3, 4, 3, 4}}; - std::vector> indptr_buffers(3); - std::vector> indices_buffers(4); - std::vector axis_order = {3, 1, 0, 2}; - std::vector indices_shapes = {3, 6, 6, 8}; - - for (int64_t i = 0; i < static_cast(indptr_values.size()); ++i) - indptr_buffers[i] = Buffer::Wrap(indptr_values[i]); - for (int64_t i = 0; i < static_cast(indices_values.size()); ++i) - indices_buffers[i] = Buffer::Wrap(indices_values[i]); + Tensor tensor(int64(), buffer, shape, {}, this->dim_names_); std::shared_ptr sparse_index; ASSERT_OK_AND_ASSIGN(sparse_index, - SparseCSFIndex::Make(tensor.type(), indices_shapes, axis_order, - indptr_buffers, indices_buffers)); + SparseCSFIndex::Make(tensor.type(), tensor.type(), indices_shapes, + 
axis_order, indptr_buffers, indices_buffers)); + std::shared_ptr st = std::make_shared( + sparse_index, int64(), data_buffer, shape, dim_names); + std::shared_ptr dt; + ASSERT_OK(st->ToTensor(&dt)); + ASSERT_TRUE(tensor.Equals(*dt)); +} - const auto& si = internal::checked_cast(*st->sparse_index()); - ASSERT_EQ(8, st->non_zero_length()); - ASSERT_TRUE(st->is_mutable()); - ASSERT_TRUE(si.Equals(*sparse_index)); +TEST_F(TestSparseCSFTensor, CreateFromBuffers2) { + std::vector> indptr_values = {{0, 1, 2, 4, 5}}; + std::vector> indices_values = {{0, 1, 3, 5}, {0, 1, 0, 3, 3}}; + std::vector indices_shapes({4, 5}); + std::vector axis_order = {1, 0}; + std::vector dim_names({"a", "b"}); + std::vector data_values = {1, 2, 3, 4, 5}; + std::vector values = {1, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 5}; + std::vector shape({4, 6}); + + std::vector> indptr_buffers; + std::vector> indices_buffers; + std::shared_ptr data_buffer = Buffer::Wrap(data_values); + std::shared_ptr buffer = Buffer::Wrap(values); + for (auto& indptr : indptr_values) indptr_buffers.push_back(Buffer::Wrap(indptr)); + for (auto& indices : indices_values) indices_buffers.push_back(Buffer::Wrap(indices)); - ASSERT_EQ(dim_names, st->dim_names()); - ASSERT_EQ("a", st->dim_name(0)); - ASSERT_EQ("b", st->dim_name(1)); - ASSERT_EQ("c", st->dim_name(2)); - ASSERT_EQ("d", st->dim_name(3)); + Tensor tensor(int64(), buffer, shape, {}, this->dim_names_); + std::shared_ptr sparse_index; + ASSERT_OK_AND_ASSIGN(sparse_index, + SparseCSFIndex::Make(tensor.type(), tensor.type(), indices_shapes, + axis_order, indptr_buffers, indices_buffers)); + std::shared_ptr st = std::make_shared( + sparse_index, int64(), data_buffer, shape, dim_names); std::shared_ptr dt; ASSERT_OK(st->ToTensor(&dt)); ASSERT_TRUE(tensor.Equals(*dt)); @@ -1022,85 +1044,183 @@ class TestSparseCSFTensorForIndexValueType TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType); -TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, 
TestSparseTensorFromTensor) { +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestBufferToSparseTensor) { using IndexValueType = TypeParam; using c_index_value_type = typename IndexValueType::c_type; - std::vector data_values = {1, 2, 3, 4, 5, 6, 7, 8}; std::vector> indptr_values = { - {0, 1, 4, 6}, {0, 1, 2, 3, 4, 5, 6}, {0, 1, 2, 3, 5, 6, 8}}; + {0, 2, 3}, {0, 1, 3, 4}, {0, 2, 4, 5, 8}}; std::vector> indices_values = { - {1, 0, 1}, {0, 0, 1, 0, 1, 2}, {0, 0, 0, 1, 3, 3}, {0, 1, 0, 0, 3, 4, 3, 4}}; - std::vector> indptr_buffers(3); - std::vector> indices_buffers(4); - std::vector axis_order = {3, 1, 0, 2}; - std::vector sparse_tensor_shape({4, 3, 5, 2}); - std::vector indices_shapes = {3, 6, 6, 8}; - std::vector dim_names({"a", "b", "c", "d"}); - - std::shared_ptr data_buffer = Buffer::Wrap(data_values); - for (int64_t i = 0; i < static_cast(indptr_values.size()); ++i) - indptr_buffers[i] = Buffer::Wrap(indptr_values[i]); - for (int64_t i = 0; i < static_cast(indices_values.size()); ++i) - indices_buffers[i] = Buffer::Wrap(indices_values[i]); - - std::shared_ptr sparse_index; + {1, 2}, {1, 2, 2}, {1, 1, 2, 2}, {2, 3, 1, 3, 1, 1, 2, 3}}; + std::vector indices_shapes = {2, 3, 4, 8}; + std::vector axis_order = {0, 1, 2, 3}; + std::vector sparse_values = {1, 2, 3, 4, 5, 6, 7, 8}; + std::vector shape = {3, 3, 3, 4}; + std::vector dim_names = {"a", "b", "c", "d"}; + + std::shared_ptr data_buffer = Buffer::Wrap(sparse_values); + std::vector> indptr_buffers; + std::vector> indices_buffers; + for (auto& indptr : indptr_values) indptr_buffers.push_back(Buffer::Wrap(indptr)); + for (auto& indices : indices_values) indices_buffers.push_back(Buffer::Wrap(indices)); + + std::shared_ptr si; ASSERT_OK_AND_ASSIGN( - sparse_index, + si, SparseCSFIndex::Make(TypeTraits::type_singleton(), indices_shapes, axis_order, indptr_buffers, indices_buffers)); - std::shared_ptr sparse_tensor = std::make_shared( - sparse_index, int64(), data_buffer, sparse_tensor_shape, dim_names); + 
std::shared_ptr st = + std::make_shared(si, int64(), data_buffer, shape, dim_names); - ASSERT_TRUE(sparse_tensor->Equals(*this->sparse_tensor_from_dense_)); + ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_)); } -TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestSparseTensorToTensor) { +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestTensorToSparseTensor) { using IndexValueType = TypeParam; - using c_index_value_type = typename IndexValueType::c_type; + std::vector shape = {3, 3, 3, 4}; + std::vector values = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; - std::vector data_values = {1, 2, 3, 4, 5, 6, 7, 8}; - std::vector> indptr_values = { - {0, 1, 4, 6}, {0, 1, 2, 3, 4, 5, 6}, {0, 1, 2, 3, 5, 6, 8}}; - std::vector> indices_values = { - {1, 0, 1}, {0, 0, 1, 0, 1, 2}, {0, 0, 0, 1, 3, 3}, {0, 1, 0, 0, 3, 4, 3, 4}}; - std::vector> indptr_buffers(3); - std::vector> indices_buffers(4); - std::vector axis_order = {3, 1, 0, 2}; - std::vector indices_shapes = {3, 6, 6, 8}; - std::vector sparse_tensor_shape({4, 3, 5, 2}); - std::vector dim_names({"a", "b", "c", "d"}); + std::shared_ptr buffer = Buffer::Wrap(values); + Tensor tensor(int64(), buffer, shape, {}, this->dim_names_); - std::shared_ptr data_buffer = Buffer::Wrap(data_values); - for (int64_t i = 0; i < static_cast(indptr_values.size()); ++i) - indptr_buffers[i] = Buffer::Wrap(indptr_values[i]); - for (int64_t i = 0; i < static_cast(indices_values.size()); ++i) - indices_buffers[i] = Buffer::Wrap(indices_values[i]); + std::shared_ptr sparse_tensor; + ASSERT_OK_AND_ASSIGN( + sparse_tensor, + SparseCSFTensor::Make(tensor, TypeTraits::type_singleton())); + ASSERT_EQ(8, sparse_tensor->non_zero_length()); + 
ASSERT_TRUE(sparse_tensor->is_mutable()); + ASSERT_TRUE(sparse_tensor->Equals(*this->sparse_tensor_from_dense_)); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestSparseTensorToTensor) { + std::vector shape = {3, 3, 3, 4}; std::vector dense_values = { - 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 7, 0, 8}; + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; auto dense_data = Buffer::Wrap(dense_values); - Tensor tensor(int64(), dense_data, sparse_tensor_shape, {}); + Tensor tensor(int64(), dense_data, shape, {}); - std::shared_ptr sparse_index; - ASSERT_OK_AND_ASSIGN( - sparse_index, - SparseCSFIndex::Make(TypeTraits::type_singleton(), indices_shapes, - axis_order, indptr_buffers, indices_buffers)); - std::shared_ptr sparse_tensor = std::make_shared( - sparse_index, tensor.type(), data_buffer, sparse_tensor_shape, dim_names); + std::shared_ptr dense_tensor; + ASSERT_OK(this->sparse_tensor_from_dense_->ToTensor(&dense_tensor)); + ASSERT_TRUE(tensor.Equals(*dense_tensor)); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, CreateFromBuffers) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + std::vector> indptr_values_1 = { + {0, 2, 3}, {0, 1, 3, 4}, {0, 2, 4, 5, 8}}; + std::vector> indices_values_1 = { + {1, 2}, {1, 2, 2}, {1, 1, 2, 2}, {2, 3, 1, 3, 1, 1, 2, 3}}; + std::vector indices_shapes_1 = {2, 3, 4, 8}; + 
std::vector axis_order_1 = {0, 1, 2, 3}; + std::vector sparse_values_1 = {1, 2, 3, 4, 5, 6, 7, 8}; + std::vector shape_1 = {3, 3, 3, 4}; + std::vector dim_names_1 = {"a", "b", "c", "d"}; + + std::vector> indptr_values_2 = { + {0, 2, 4, 6}, {0, 1, 2, 3, 4, 6, 7}, {0, 2, 3, 4, 5, 6, 7, 8}}; + std::vector> indices_values_2 = { + {1, 2, 3}, {1, 2, 1, 2, 1, 2}, {2, 2, 1, 2, 1, 2, 2}, {1, 2, 2, 1, 2, 1, 1, 2}}; + std::vector indices_shapes_2 = {3, 6, 7, 8}; + std::vector axis_order_2 = {3, 0, 1, 2}; + std::vector sparse_values_2 = {3, 5, 6, 1, 7, 2, 4, 8}; + std::vector shape_2 = {5, 5, 5, 4}; + std::vector dim_names_2 = {"d", "a", "b", "c"}; + + std::vector> indptr_buffers_1; + std::vector> indices_buffers_1; + for (auto& indptr : indptr_values_1) indptr_buffers_1.push_back(Buffer::Wrap(indptr)); + for (auto& indices : indices_values_1) + indices_buffers_1.push_back(Buffer::Wrap(indices)); + + std::vector> indptr_buffers_2; + std::vector> indices_buffers_2; + for (auto& indptr : indptr_values_2) indptr_buffers_2.push_back(Buffer::Wrap(indptr)); + for (auto& indices : indices_values_2) + indices_buffers_2.push_back(Buffer::Wrap(indices)); + + std::vector dense_values_1 = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; + + std::vector dense_values_2 = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 
0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + std::shared_ptr dense_buffer_1 = Buffer::Wrap(dense_values_1); + std::shared_ptr dense_buffer_2 = Buffer::Wrap(dense_values_2); + Tensor tensor_1(int64(), dense_buffer_1, shape_1, {}, dim_names_1); + Tensor tensor_2(int64(), dense_buffer_2, shape_2, {}, dim_names_2); + std::shared_ptr sparse_buffer_1 = Buffer::Wrap(sparse_values_1); + std::shared_ptr sparse_buffer_2 = Buffer::Wrap(sparse_values_2); + + std::shared_ptr si_1; + std::shared_ptr si_2; + std::shared_ptr si_3; - // std::shared_ptr dense_tensor; - // ASSERT_OK(sparse_tensor->ToTensor(&dense_tensor)); - // ASSERT_TRUE(tensor.Equals(*dense_tensor)); + ASSERT_OK_AND_ASSIGN( + si_1, + SparseCSFIndex::Make(TypeTraits::type_singleton(), indices_shapes_1, + axis_order_1, indptr_buffers_1, indices_buffers_1)); + ASSERT_OK_AND_ASSIGN( + si_2, + SparseCSFIndex::Make(TypeTraits::type_singleton(), indices_shapes_2, + axis_order_2, 
indptr_buffers_2, indices_buffers_2)); + ASSERT_OK_AND_ASSIGN( + si_3, + SparseCSFIndex::Make(TypeTraits::type_singleton(), indices_shapes_2, + axis_order_2, indptr_buffers_2, indices_buffers_2)); + + std::shared_ptr st_1 = std::make_shared( + si_1, int64(), sparse_buffer_1, shape_1, dim_names_1); + std::shared_ptr st_2 = std::make_shared( + si_2, int64(), sparse_buffer_2, shape_1, dim_names_2); + std::shared_ptr st_3 = std::make_shared( + si_3, int64(), sparse_buffer_2, shape_2, dim_names_2); + + std::shared_ptr dt_1; + std::shared_ptr dt_2; + std::shared_ptr dt_3; + ASSERT_OK(st_1->ToTensor(&dt_1)); + ASSERT_OK(st_2->ToTensor(&dt_2)); + ASSERT_OK(st_3->ToTensor(&dt_3)); + + ASSERT_TRUE(dt_1->Equals(*dt_2)); + ASSERT_FALSE(dt_1->Equals(*dt_3)); + ASSERT_TRUE(tensor_1.Equals(*dt_1)); + ASSERT_TRUE(tensor_2.Equals(*dt_3)); } -REGISTER_TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType, - TestSparseTensorFromTensor, TestSparseTensorToTensor); +REGISTER_TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType, TestBufferToSparseTensor, + TestTensorToSparseTensor, TestSparseTensorToTensor, + CreateFromBuffers); INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestSparseCSFTensorForIndexValueType, Int8Type); INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestSparseCSFTensorForIndexValueType, UInt8Type); From 3291abc907de160919e4aa3f4ce97551c0c24fc4 Mon Sep 17 00:00:00 2001 From: Rok Date: Tue, 28 Jan 2020 15:12:13 +0100 Subject: [PATCH 13/18] Marking indptrBuffers, indicesBuffers and axisOrder required. --- format/SparseTensor.fbs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index e3e8df11d44..e637e5a3f1c 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -146,7 +146,7 @@ table SparseTensorIndexCSF { /// 2 3 1 3 1 1 2 3 /// The type of values in indptrBuffers - indptrType: Int; + indptrType: Int (required); /// indptrBuffers stores the sparsity structure. 
/// Position in the indptrBuffers vector signifies the dimension. @@ -154,10 +154,10 @@ table SparseTensorIndexCSF { /// /// indptrBuffer(X) = [[0, 2, 3], [0, 1, 3, 4], [0, 2, 4, 5, 8]]. /// - indptrBuffers: [Buffer]; + indptrBuffers: [Buffer] (required); /// The type of values in indicesBuffers - indicesType: Int; + indicesType: Int (required); /// indicesBuffers stores the label of each node. /// Position in the indicesBuffers vector signifies the dimension. @@ -165,14 +165,14 @@ table SparseTensorIndexCSF { /// /// indicesBuffer(X) = [[1, 2], [1, 2, 2], [1, 1, 2, 2], [2, 3, 1, 3, 1, 1, 2, 3]]. /// - indicesBuffers: [Buffer]; + indicesBuffers: [Buffer] (required); /// axisOrder stores the sequence in which dimensions were traversed to produce the prefix tree. /// For example, the axisOrder for the above X is: /// /// axisOrder(X) = [0, 1, 2, 3]. /// - axisOrder: [Int]; + axisOrder: [Int] (required); } union SparseTensorIndex { From 28d38cb5e356be429f6521a88f579592feb07628 Mon Sep 17 00:00:00 2001 From: Rok Date: Tue, 28 Jan 2020 15:22:45 +0100 Subject: [PATCH 14/18] Removing backslashes from comments. --- format/SparseTensor.fbs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index e637e5a3f1c..7ed302141ab 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -142,8 +142,8 @@ table SparseTensorIndexCSF { /// 1 2 2 /// / / \ | /// 1 1 2 2 - /// / \ / \ \ /|\ - /// 2 3 1 3 1 1 2 3 + /// /| /| | /| | + /// 2 3 1 3 1 1 2 3 /// The type of values in indptrBuffers indptrType: Int (required); From 6f4f4a8f99961662de1cb369982be21c8fc00326 Mon Sep 17 00:00:00 2001 From: Rok Date: Thu, 30 Jan 2020 17:11:36 +0100 Subject: [PATCH 15/18] Implementing feedback review. 
--- cpp/src/arrow/sparse_tensor.cc | 62 +++-- cpp/src/arrow/sparse_tensor_test.cc | 404 +++++++++++----------------- format/SparseTensor.fbs | 64 +---- 3 files changed, 196 insertions(+), 334 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index c8f08c453ff..83e07baeb5a 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -17,7 +17,6 @@ #include "arrow/sparse_tensor.h" -#include #include #include #include @@ -28,6 +27,7 @@ #include "arrow/compare.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" +#include "arrow/util/sort.h" #include "arrow/visitor_inline.h" namespace arrow { @@ -130,7 +130,6 @@ class SparseTensorConverter *indices++ = static_cast(coord[i]); } } - // increment index ++coord[ndim - 1]; if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) { @@ -443,6 +442,8 @@ class SparseTensorConverter RETURN_NOT_OK(CheckMaximumValue(std::numeric_limits::max())); const int64_t ndim = tensor_.ndim(); + // Axis order as ascending order of dimension size is a good heuristic but is not + // necessarily optimal. 
std::vector axis_order = internal::ArgSort(tensor_.shape()); int64_t nonzero_count = -1; RETURN_NOT_OK(tensor_.CountNonZero(&nonzero_count)); @@ -465,7 +466,7 @@ class SparseTensorConverter for (int64_t n = tensor_.size(); n > 0; n--) { const value_type x = tensor_.Value(coord); - if (tensor_.Value(coord) != 0) { + if (x != 0) { bool tree_split = false; *values++ = x; @@ -476,24 +477,25 @@ class SparseTensorConverter if (tree_split || change) { if (change) tree_split = true; - if (i < ndim - 1) + if (i < ndim - 1) { RETURN_NOT_OK(indptr_buffer_builders[i].Append( - static_cast(counts[dimension + 1]))); + static_cast(counts[i + 1]))); + } RETURN_NOT_OK(indices_buffer_builders[i].Append( static_cast(coord[dimension]))); - ++counts[dimension]; + ++counts[i]; } } previous_coord = coord; } - // increment index - ++coord[ndim - 1]; - if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) { + int64_t last_axis = axis_order[ndim - 1]; + ++coord[last_axis]; + if (n > 1 && coord[last_axis] == shape[last_axis]) { int64_t d = ndim - 1; - while (d > 0 && coord[d] == shape[d]) { - coord[d] = 0; - ++coord[d - 1]; + while (d > 0 && coord[axis_order[d]] == shape[axis_order[d]]) { + coord[axis_order[d]] = 0; + ++coord[axis_order[d - 1]]; --d; } } @@ -513,12 +515,13 @@ class SparseTensorConverter std::vector indptr_shapes(counts.begin(), counts.end() - 1); std::vector indices_shapes = counts; - for (int64_t column = 0; column < ndim; ++column) + for (int64_t column = 0; column < ndim; ++column) { RETURN_NOT_OK( indices_buffer_builders[column].Finish(&indices_buffers[column], true)); - - for (int64_t column = 0; column < ndim - 1; ++column) + } + for (int64_t column = 0; column < ndim - 1; ++column) { RETURN_NOT_OK(indptr_buffer_builders[column].Finish(&indptr_buffers[column], true)); + } ARROW_ASSIGN_OR_RAISE( sparse_index, SparseCSFIndex::Make(index_value_type_, indices_shapes, axis_order, @@ -665,8 +668,7 @@ namespace { template void ExpandSparseCSFTensorValues(int64_t dimension, int64_t 
offset, int64_t first_ptr, int64_t last_ptr, const SparseCSFIndex* sparse_index, - const int64_t* raw_data, - const std::vector strides, + const TYPE* raw_data, const std::vector strides, const std::vector axis_order, TYPE* out) { int64_t ndim = axis_order.size(); @@ -675,14 +677,15 @@ void ExpandSparseCSFTensorValues(int64_t dimension, int64_t offset, int64_t firs offset + sparse_index->indices()[dimension]->Value({i}) * strides[axis_order[dimension]]; - if (dimension < ndim - 1) + if (dimension < ndim - 1) { ExpandSparseCSFTensorValues( dimension + 1, tmp_offset, sparse_index->indptr()[dimension]->Value({i}), sparse_index->indptr()[dimension]->Value({i + 1}), sparse_index, raw_data, strides, axis_order, out); - else - out[tmp_offset] = static_cast(raw_data[i]); + } else { + out[tmp_offset] = raw_data[i]; + } } } @@ -703,8 +706,10 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t std::fill_n(values, sparse_tensor->size(), static_cast(0)); std::vector strides(sparse_tensor->ndim(), 1); - for (int i = sparse_tensor->ndim() - 1; i > 0; --i) + for (int i = sparse_tensor->ndim() - 1; i > 0; --i) { strides[i - 1] *= strides[i] * sparse_tensor->shape()[i]; + } + std::vector empty_strides; const auto raw_data = reinterpret_cast(sparse_tensor->raw_data()); @@ -724,7 +729,8 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t values[offset] = raw_data[i]; } *out = std::make_shared(sparse_tensor->type(), values_buffer, - sparse_tensor->shape()); + sparse_tensor->shape(), empty_strides, + sparse_tensor->dim_names()); return Status::OK(); } @@ -744,7 +750,8 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t } } *out = std::make_shared(sparse_tensor->type(), values_buffer, - sparse_tensor->shape()); + sparse_tensor->shape(), empty_strides, + sparse_tensor->dim_names()); return Status::OK(); } @@ -764,7 +771,8 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const 
SparseTensor* sparse_t } } *out = std::make_shared(sparse_tensor->type(), values_buffer, - sparse_tensor->shape()); + sparse_tensor->shape(), empty_strides, + sparse_tensor->dim_names()); return Status::OK(); } @@ -773,11 +781,11 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t internal::checked_cast(*sparse_tensor->sparse_index()); ExpandSparseCSFTensorValues( - 0, 0, 0, sparse_index.indptr()[0]->size() - 1, &sparse_index, - reinterpret_cast(sparse_tensor->raw_data()), strides, + 0, 0, 0, sparse_index.indptr()[0]->size() - 1, &sparse_index, raw_data, strides, sparse_index.axis_order(), values); *out = std::make_shared(sparse_tensor->type(), values_buffer, - sparse_tensor->shape()); + sparse_tensor->shape(), empty_strides, + sparse_tensor->dim_names()); return Status::OK(); } } diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 6c9a64e61a1..2b5186acda2 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -915,29 +915,25 @@ template class TestSparseCSFTensorBase : public ::testing::Test { public: void SetUp() { - shape_ = {3, 3, 3, 4}; dim_names_ = {"a", "b", "c", "d"}; - - // COO representation: - // X[1, 1, 1, 2] := 1 - // X[1, 1, 1, 4] := 2 - // X[1, 2, 1, 1] := 3 - // X[1, 2, 1, 3] := 4 - // X[1, 2, 2, 1] := 5 - // X[2, 2, 2, 1] := 6 - // X[2, 2, 2, 2] := 7 - // X[2, 2, 2, 3] := 8 - - std::vector dense_values = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; - auto dense_data = Buffer::Wrap(dense_values); - NumericTensor dense_tensor(dense_data, shape_, {}, dim_names_); - ASSERT_OK_AND_ASSIGN(sparse_tensor_from_dense_, - SparseCSFTensor::Make( - dense_tensor, 
TypeTraits::type_singleton())); + shape_ = {2, 3, 4, 5}; + int16_t dense_values[2][3][4][5] = {}; // zero-initialized + + dense_values[0][0][0][1] = 1; + dense_values[0][0][0][2] = 2; + dense_values[0][1][0][0] = 3; + dense_values[0][1][0][2] = 4; + dense_values[0][1][1][0] = 5; + dense_values[1][1][1][0] = 6; + dense_values[1][1][1][1] = 7; + dense_values[1][1][1][2] = 8; + + auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values)); + Tensor dense_tensor_(int16(), dense_buffer, shape_, {}, dim_names_); + ASSERT_OK_AND_ASSIGN( + sparse_tensor_from_dense_, + SparseCSFTensor::Make(dense_tensor_, + TypeTraits::type_singleton())); } protected: @@ -948,279 +944,199 @@ class TestSparseCSFTensorBase : public ::testing::Test { class TestSparseCSFTensor : public TestSparseCSFTensorBase {}; -TEST_F(TestSparseCSFTensor, CreateFromBuffers1) { - std::vector> indptr_values = {{0, 2, 3, 5}}; - std::vector> indices_values = {{0, 1, 3}, {0, 3, 1, 3, 5}}; - std::vector indices_shapes({3, 5}); - std::vector axis_order = {0, 1}; - std::vector dim_names({"a", "b"}); - std::vector data_values = {1, 3, 2, 4, 5}; - std::vector values = {1, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 5}; - std::vector shape({4, 6}); - - std::vector> indptr_buffers; - std::vector> indices_buffers; - std::shared_ptr data_buffer = Buffer::Wrap(data_values); - std::shared_ptr buffer = Buffer::Wrap(values); - for (auto& indptr : indptr_values) indptr_buffers.push_back(Buffer::Wrap(indptr)); - for (auto& indices : indices_values) indices_buffers.push_back(Buffer::Wrap(indices)); - - Tensor tensor(int64(), buffer, shape, {}, this->dim_names_); - - std::shared_ptr sparse_index; - ASSERT_OK_AND_ASSIGN(sparse_index, - SparseCSFIndex::Make(tensor.type(), tensor.type(), indices_shapes, - axis_order, indptr_buffers, indices_buffers)); - std::shared_ptr st = std::make_shared( - sparse_index, int64(), data_buffer, shape, dim_names); - std::shared_ptr dt; - ASSERT_OK(st->ToTensor(&dt)); 
- ASSERT_TRUE(tensor.Equals(*dt)); -} - -TEST_F(TestSparseCSFTensor, CreateFromBuffers2) { - std::vector> indptr_values = {{0, 1, 2, 4, 5}}; - std::vector> indices_values = {{0, 1, 3, 5}, {0, 1, 0, 3, 3}}; - std::vector indices_shapes({4, 5}); - std::vector axis_order = {1, 0}; - std::vector dim_names({"a", "b"}); - std::vector data_values = {1, 2, 3, 4, 5}; - std::vector values = {1, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 5}; - std::vector shape({4, 6}); - - std::vector> indptr_buffers; - std::vector> indices_buffers; - std::shared_ptr data_buffer = Buffer::Wrap(data_values); - std::shared_ptr buffer = Buffer::Wrap(values); - for (auto& indptr : indptr_values) indptr_buffers.push_back(Buffer::Wrap(indptr)); - for (auto& indices : indices_values) indices_buffers.push_back(Buffer::Wrap(indices)); - - Tensor tensor(int64(), buffer, shape, {}, this->dim_names_); - - std::shared_ptr sparse_index; - ASSERT_OK_AND_ASSIGN(sparse_index, - SparseCSFIndex::Make(tensor.type(), tensor.type(), indices_shapes, - axis_order, indptr_buffers, indices_buffers)); - std::shared_ptr st = std::make_shared( - sparse_index, int64(), data_buffer, shape, dim_names); - std::shared_ptr dt; - ASSERT_OK(st->ToTensor(&dt)); - ASSERT_TRUE(tensor.Equals(*dt)); -} - template class TestSparseCSFTensorForIndexValueType : public TestSparseCSFTensorBase { protected: std::shared_ptr MakeSparseCSFIndex( - std::vector& indptr_values, - std::vector& indices_values, - const std::vector& indptr_offsets, - const std::vector& indices_offsets, - const std::vector& indptr_shape, const std::vector& indices_shape, - const std::vector& axis_order) const { - auto indptr_data = Buffer::Wrap(indptr_values); - auto indices_data = Buffer::Wrap(indices_values); - auto indptr = - std::make_shared>(indptr_data, indptr_shape); - auto indices = - std::make_shared>(indices_data, indices_shape); - return std::make_shared(indptr, indices, indptr_offsets, - indices_offsets, axis_order); + const 
std::vector axis_order, + std::vector>& indptr_values, + std::vector>& indices_values) const { + int64_t ndim = axis_order.size(); + std::vector> indptr(ndim - 1); + std::vector> indices(ndim); + + for (int64_t i = 0; i < ndim - 1; ++i) { + indptr[i] = std::make_shared( + TypeTraits::type_singleton(), Buffer::Wrap(indptr_values[i]), + std::vector({static_cast(indptr_values[i].size())})); + } + for (int64_t i = 0; i < ndim; ++i) { + indices[i] = std::make_shared( + TypeTraits::type_singleton(), Buffer::Wrap(indices_values[i]), + std::vector({static_cast(indices_values[i].size())})); + } + return std::make_shared(indptr, indices, axis_order); } template std::shared_ptr MakeSparseTensor( - const std::shared_ptr& si, - std::vector& sparse_values) const { - auto data = Buffer::Wrap(sparse_values); - return std::make_shared(si, - CTypeTraits::type_singleton(), - data, this->shape_, this->dim_names_); + const std::shared_ptr& si, std::vector& sparse_values, + const std::vector shape, const std::vector dim_names) const { + auto data_buffer = Buffer::Wrap(sparse_values); + return std::make_shared( + si, CTypeTraits::type_singleton(), data_buffer, shape, dim_names); } }; TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType); -TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestBufferToSparseTensor) { +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestCreateSparseTensor) { using IndexValueType = TypeParam; using c_index_value_type = typename IndexValueType::c_type; + std::vector shape = {2, 3, 4, 5}; + std::vector dim_names = {"a", "b", "c", "d"}; + std::vector axis_order = {0, 1, 2, 3}; + std::vector sparse_values = {1, 2, 3, 4, 5, 6, 7, 8}; std::vector> indptr_values = { {0, 2, 3}, {0, 1, 3, 4}, {0, 2, 4, 5, 8}}; std::vector> indices_values = { - {1, 2}, {1, 2, 2}, {1, 1, 2, 2}, {2, 3, 1, 3, 1, 1, 2, 3}}; - std::vector indices_shapes = {2, 3, 4, 8}; - std::vector axis_order = {0, 1, 2, 3}; - std::vector sparse_values = {1, 2, 3, 4, 5, 6, 7, 8}; - std::vector shape 
= {3, 3, 3, 4}; - std::vector dim_names = {"a", "b", "c", "d"}; + {0, 1}, {0, 1, 1}, {0, 0, 1, 1}, {1, 2, 0, 2, 0, 0, 1, 2}}; - std::shared_ptr data_buffer = Buffer::Wrap(sparse_values); - std::vector> indptr_buffers; - std::vector> indices_buffers; - for (auto& indptr : indptr_values) indptr_buffers.push_back(Buffer::Wrap(indptr)); - for (auto& indices : indices_values) indices_buffers.push_back(Buffer::Wrap(indices)); - - std::shared_ptr si; - ASSERT_OK_AND_ASSIGN( - si, - SparseCSFIndex::Make(TypeTraits::type_singleton(), indices_shapes, - axis_order, indptr_buffers, indices_buffers)); - std::shared_ptr st = - std::make_shared(si, int64(), data_buffer, shape, dim_names); + auto si = this->MakeSparseCSFIndex(axis_order, indptr_values, indices_values); + auto st = this->MakeSparseTensor(si, sparse_values, shape, dim_names); ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_)); } TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestTensorToSparseTensor) { using IndexValueType = TypeParam; - std::vector shape = {3, 3, 3, 4}; - std::vector values = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; - - std::shared_ptr buffer = Buffer::Wrap(values); - Tensor tensor(int64(), buffer, shape, {}, this->dim_names_); + std::vector shape = {2, 3, 4, 5}; + int16_t dense_values[2][3][4][5] = {}; // zero-initialized + dense_values[0][0][0][1] = 1; + dense_values[0][0][0][2] = 2; + dense_values[0][1][0][0] = 3; + dense_values[0][1][0][2] = 4; + dense_values[0][1][1][0] = 5; + dense_values[1][1][1][0] = 6; + dense_values[1][1][1][1] = 7; + dense_values[1][1][1][2] = 8; + auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values)); + Tensor dense_tensor(int16(), dense_buffer, shape, {}, 
this->dim_names_); std::shared_ptr sparse_tensor; ASSERT_OK_AND_ASSIGN( sparse_tensor, - SparseCSFTensor::Make(tensor, TypeTraits::type_singleton())); + SparseCSFTensor::Make(dense_tensor, TypeTraits::type_singleton())); ASSERT_EQ(8, sparse_tensor->non_zero_length()); ASSERT_TRUE(sparse_tensor->is_mutable()); ASSERT_TRUE(sparse_tensor->Equals(*this->sparse_tensor_from_dense_)); + ASSERT_EQ(sparse_tensor->dim_names(), dense_tensor.dim_names()); } TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestSparseTensorToTensor) { - std::vector shape = {3, 3, 3, 4}; - std::vector dense_values = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; - auto dense_data = Buffer::Wrap(dense_values); - Tensor tensor(int64(), dense_data, shape, {}); + std::vector shape = {2, 3, 4, 5}; + int16_t dense_values[2][3][4][5] = {}; // zero-initialized + dense_values[0][0][0][1] = 1; + dense_values[0][0][0][2] = 2; + dense_values[0][1][0][0] = 3; + dense_values[0][1][0][2] = 4; + dense_values[0][1][1][0] = 5; + dense_values[1][1][1][0] = 6; + dense_values[1][1][1][1] = 7; + dense_values[1][1][1][2] = 8; + auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values)); + Tensor dense_tensor(int16(), dense_buffer, shape, {}, this->dim_names_); - std::shared_ptr dense_tensor; - ASSERT_OK(this->sparse_tensor_from_dense_->ToTensor(&dense_tensor)); - ASSERT_TRUE(tensor.Equals(*dense_tensor)); + std::shared_ptr dt; + ASSERT_OK(this->sparse_tensor_from_dense_->ToTensor(&dt)); + ASSERT_TRUE(dense_tensor.Equals(*dt)); + ASSERT_EQ(dense_tensor.dim_names(), dt->dim_names()); } -TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, CreateFromBuffers) { +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestRoundTrip) { 
using IndexValueType = TypeParam; - using c_index_value_type = typename IndexValueType::c_type; - - std::vector> indptr_values_1 = { - {0, 2, 3}, {0, 1, 3, 4}, {0, 2, 4, 5, 8}}; - std::vector> indices_values_1 = { - {1, 2}, {1, 2, 2}, {1, 1, 2, 2}, {2, 3, 1, 3, 1, 1, 2, 3}}; - std::vector indices_shapes_1 = {2, 3, 4, 8}; - std::vector axis_order_1 = {0, 1, 2, 3}; - std::vector sparse_values_1 = {1, 2, 3, 4, 5, 6, 7, 8}; - std::vector shape_1 = {3, 3, 3, 4}; - std::vector dim_names_1 = {"a", "b", "c", "d"}; - - std::vector> indptr_values_2 = { - {0, 2, 4, 6}, {0, 1, 2, 3, 4, 6, 7}, {0, 2, 3, 4, 5, 6, 7, 8}}; - std::vector> indices_values_2 = { - {1, 2, 3}, {1, 2, 1, 2, 1, 2}, {2, 2, 1, 2, 1, 2, 2}, {1, 2, 2, 1, 2, 1, 1, 2}}; - std::vector indices_shapes_2 = {3, 6, 7, 8}; - std::vector axis_order_2 = {3, 0, 1, 2}; - std::vector sparse_values_2 = {3, 5, 6, 1, 7, 2, 4, 8}; - std::vector shape_2 = {5, 5, 5, 4}; - std::vector dim_names_2 = {"d", "a", "b", "c"}; - - std::vector> indptr_buffers_1; - std::vector> indices_buffers_1; - for (auto& indptr : indptr_values_1) indptr_buffers_1.push_back(Buffer::Wrap(indptr)); - for (auto& indices : indices_values_1) - indices_buffers_1.push_back(Buffer::Wrap(indices)); - - std::vector> indptr_buffers_2; - std::vector> indices_buffers_2; - for (auto& indptr : indptr_values_2) indptr_buffers_2.push_back(Buffer::Wrap(indptr)); - for (auto& indices : indices_values_2) - indices_buffers_2.push_back(Buffer::Wrap(indices)); - - std::vector dense_values_1 = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8}; - - std::vector dense_values_2 = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - - std::shared_ptr dense_buffer_1 = Buffer::Wrap(dense_values_1); - std::shared_ptr dense_buffer_2 = Buffer::Wrap(dense_values_2); - Tensor tensor_1(int64(), dense_buffer_1, shape_1, {}, dim_names_1); - Tensor tensor_2(int64(), dense_buffer_2, shape_2, {}, dim_names_2); - std::shared_ptr sparse_buffer_1 = Buffer::Wrap(sparse_values_1); - std::shared_ptr sparse_buffer_2 = Buffer::Wrap(sparse_values_2); - - std::shared_ptr si_1; - std::shared_ptr si_2; - std::shared_ptr si_3; + std::shared_ptr dt; + ASSERT_OK(this->sparse_tensor_from_dense_->ToTensor(&dt)); + std::shared_ptr 
st; ASSERT_OK_AND_ASSIGN( - si_1, - SparseCSFIndex::Make(TypeTraits::type_singleton(), indices_shapes_1, - axis_order_1, indptr_buffers_1, indices_buffers_1)); - ASSERT_OK_AND_ASSIGN( - si_2, - SparseCSFIndex::Make(TypeTraits::type_singleton(), indices_shapes_2, - axis_order_2, indptr_buffers_2, indices_buffers_2)); - ASSERT_OK_AND_ASSIGN( - si_3, - SparseCSFIndex::Make(TypeTraits::type_singleton(), indices_shapes_2, - axis_order_2, indptr_buffers_2, indices_buffers_2)); - - std::shared_ptr st_1 = std::make_shared( - si_1, int64(), sparse_buffer_1, shape_1, dim_names_1); - std::shared_ptr st_2 = std::make_shared( - si_2, int64(), sparse_buffer_2, shape_1, dim_names_2); - std::shared_ptr st_3 = std::make_shared( - si_3, int64(), sparse_buffer_2, shape_2, dim_names_2); - - std::shared_ptr dt_1; - std::shared_ptr dt_2; - std::shared_ptr dt_3; + st, SparseCSFTensor::Make(*dt, TypeTraits::type_singleton())); + + ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_)); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestAlternativeAxisOrder) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + std::vector dense_values = {1, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 5}; + std::vector shape = {4, 6}; + std::vector dim_names = {"a", "b"}; + std::shared_ptr dense_buffer = Buffer::Wrap(dense_values); + Tensor tensor(int16(), dense_buffer, shape, {}, dim_names); + + // Axis order 1 + std::vector axis_order_1 = {0, 1}; + std::vector sparse_values_1 = {1, 3, 2, 4, 5}; + std::vector> indptr_values_1 = {{0, 2, 3, 5}}; + std::vector> indices_values_1 = {{0, 1, 3}, + {0, 3, 1, 3, 5}}; + auto si_1 = this->MakeSparseCSFIndex(axis_order_1, indptr_values_1, indices_values_1); + auto st_1 = this->MakeSparseTensor(si_1, sparse_values_1, shape, dim_names); + + // Axis order 2 + std::vector axis_order_2 = {1, 0}; + std::vector sparse_values_2 = {1, 2, 3, 4, 5}; + std::vector> indptr_values_2 = {{0, 1, 2, 
4, 5}}; + std::vector> indices_values_2 = {{0, 1, 3, 5}, + {0, 1, 0, 3, 3}}; + auto si_2 = this->MakeSparseCSFIndex(axis_order_2, indptr_values_2, indices_values_2); + auto st_2 = this->MakeSparseTensor(si_2, sparse_values_2, shape, dim_names); + + std::shared_ptr dt_1, dt_2; ASSERT_OK(st_1->ToTensor(&dt_1)); ASSERT_OK(st_2->ToTensor(&dt_2)); - ASSERT_OK(st_3->ToTensor(&dt_3)); + ASSERT_FALSE(st_1->Equals(*st_2)); ASSERT_TRUE(dt_1->Equals(*dt_2)); - ASSERT_FALSE(dt_1->Equals(*dt_3)); - ASSERT_TRUE(tensor_1.Equals(*dt_1)); - ASSERT_TRUE(tensor_2.Equals(*dt_3)); + ASSERT_TRUE(dt_1->Equals(tensor)); +} + +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestNonAscendingShape) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + std::vector shape = {5, 2, 3, 4}; + int16_t dense_values[5][2][3][4] = {}; // zero-initialized + dense_values[0][0][0][1] = 1; + dense_values[0][0][0][2] = 2; + dense_values[0][1][0][0] = 3; + dense_values[0][1][0][2] = 4; + dense_values[0][1][1][0] = 5; + dense_values[1][1][1][0] = 6; + dense_values[1][1][1][1] = 7; + dense_values[1][1][1][2] = 8; + auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values)); + Tensor dense_tensor(int16(), dense_buffer, shape, {}, this->dim_names_); + + std::shared_ptr sparse_tensor; + ASSERT_OK_AND_ASSIGN( + sparse_tensor, + SparseCSFTensor::Make(dense_tensor, TypeTraits::type_singleton())); + + std::vector> indptr_values = { + {0, 1, 3}, {0, 2, 4, 7}, {0, 1, 2, 3, 4, 6, 7, 8}}; + std::vector> indices_values = { + {0, 1}, {0, 0, 1}, {1, 2, 0, 2, 0, 1, 2}, {0, 0, 0, 0, 0, 1, 1, 1}}; + std::vector axis_order = {1, 2, 3, 0}; + std::vector sparse_values = {1, 2, 3, 4, 5, 6, 7, 8}; + auto si = this->MakeSparseCSFIndex(axis_order, indptr_values, indices_values); + auto st = this->MakeSparseTensor(si, sparse_values, shape, this->dim_names_); + + std::shared_ptr dt; + ASSERT_OK(st->ToTensor(&dt)); + ASSERT_TRUE(dt->Equals(dense_tensor)); + 
ASSERT_TRUE(st->Equals(*sparse_tensor)); } -REGISTER_TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType, TestBufferToSparseTensor, +REGISTER_TYPED_TEST_CASE_P(TestSparseCSFTensorForIndexValueType, TestCreateSparseTensor, TestTensorToSparseTensor, TestSparseTensorToTensor, - CreateFromBuffers); + TestAlternativeAxisOrder, TestNonAscendingShape, + TestRoundTrip); INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestSparseCSFTensorForIndexValueType, Int8Type); INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestSparseCSFTensorForIndexValueType, UInt8Type); diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index 7ed302141ab..9c8ddae0b7c 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -114,71 +114,9 @@ table SparseMatrixIndexCSX { indicesBuffer: Buffer (required); } -/// Compressed Sparse Fiber (CSF) sparse tensor index. -table SparseTensorIndexCSF { - /// CSF is a generalization of compressed sparse row (CSR) index. - /// See [smith2017knl]: http://shaden.io/pub-files/smith2017knl.pdf - /// - /// CSF recursively compresses each mode of the tensor into a set - /// of prefix trees. Each path from a root to leaf forms one tensor - /// non-zero index. CSF is implemented with two buffers and three arrays. - /// - /// For example, let X be a 3x3x3x4 tensor, and it has the following - /// 8 non-zero values: - /// - /// X[1, 1, 1, 2] := 1 - /// X[1, 1, 1, 3] := 2 - /// X[1, 2, 1, 1] := 3 - /// X[1, 2, 1, 3] := 4 - /// X[1, 2, 2, 1] := 5 - /// X[2, 2, 2, 1] := 6 - /// X[2, 2, 2, 2] := 7 - /// X[2, 2, 2, 3] := 8 - /// - /// As a prefix tree this would be represented be: - /// - /// 1 2 - /// / \ | - /// 1 2 2 - /// / / \ | - /// 1 1 2 2 - /// /| /| | /| | - /// 2 3 1 3 1 1 2 3 - - /// The type of values in indptrBuffers - indptrType: Int (required); - - /// indptrBuffers stores the sparsity structure. - /// Position in the indptrBuffers vector signifies the dimension. 
- /// For example, the indptrBuffers for the above X is: - /// - /// indptrBuffer(X) = [[0, 2, 3], [0, 1, 3, 4], [0, 2, 4, 5, 8]]. - /// - indptrBuffers: [Buffer] (required); - - /// The type of values in indicesBuffers - indicesType: Int (required); - - /// indicesBuffers stores the label of each node. - /// Position in the indicesBuffers vector signifies the dimension. - /// For example, the indicesBuffers for the above X is: - /// - /// indicesBuffer(X) = [[1, 2], [1, 2, 2], [1, 1, 2, 2], [2, 3, 1, 3, 1, 1, 2, 3]]. - /// - indicesBuffers: [Buffer] (required); - - /// axisOrder stores the sequence in which dimensions were traversed to produce the prefix tree. - /// For example, the axisOrder for the above X is: - /// - /// axisOrder(X) = [0, 1, 2, 3]. - /// - axisOrder: [Int] (required); -} - union SparseTensorIndex { SparseTensorIndexCOO, - SparseMatrixIndexCSX, - SparseTensorIndexCSF + SparseMatrixIndexCSX } table SparseTensor { From 11b81bb044bc99864977a7596e44fdb77e8642de Mon Sep 17 00:00:00 2001 From: Rok Date: Mon, 3 Feb 2020 14:40:05 +0100 Subject: [PATCH 16/18] Factoring out index incrementing for dense to COO and CSF indices. 
--- cpp/src/arrow/sparse_tensor.cc | 54 ++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 83e07baeb5a..609cd2321d6 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -57,6 +57,37 @@ class SparseTensorConverter { Status Convert() { return Status::Invalid("Unsupported sparse index"); } }; +// ---------------------------------------------------------------------- +// IncrementIndex for SparseCOOIndex and SparseCSFIndex + +void IncrementIndex(std::vector& coord, const std::vector shape) { + const int64_t ndim = shape.size(); + ++coord[ndim - 1]; + if (coord[ndim - 1] == shape[ndim - 1]) { + int64_t d = ndim - 1; + while (d > 0 && coord[d] == shape[d]) { + coord[d] = 0; + ++coord[d - 1]; + --d; + } + } +} + +void IncrementIndex(std::vector& coord, const std::vector shape, + std::vector axis_order) { + const int64_t ndim = shape.size(); + const int64_t last_axis = axis_order[ndim - 1]; + ++coord[last_axis]; + if (coord[last_axis] == shape[last_axis]) { + int64_t d = ndim - 1; + while (d > 0 && coord[axis_order[d]] == shape[axis_order[d]]) { + coord[axis_order[d]] = 0; + ++coord[axis_order[d - 1]]; + --d; + } + } +} + // ---------------------------------------------------------------------- // SparseTensorConverter for SparseCOOIndex @@ -130,15 +161,8 @@ class SparseTensorConverter *indices++ = static_cast(coord[i]); } } - // increment index - ++coord[ndim - 1]; - if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) { - int64_t d = ndim - 1; - while (d > 0 && coord[d] == shape[d]) { - coord[d] = 0; - ++coord[d - 1]; - --d; - } + if (n > 1) { + IncrementIndex(coord, shape); } } } @@ -488,16 +512,8 @@ class SparseTensorConverter } previous_coord = coord; } - // increment index - int64_t last_axis = axis_order[ndim - 1]; - ++coord[last_axis]; - if (n > 1 && coord[last_axis] == shape[last_axis]) { - int64_t d = ndim - 1; - while (d 
> 0 && coord[axis_order[d]] == shape[axis_order[d]]) { - coord[axis_order[d]] = 0; - ++coord[axis_order[d - 1]]; - --d; - } + if (n > 1) { + IncrementIndex(coord, shape, axis_order); } } } From 1b922f6ae8a719b0050ec3674dd32085e5b48e3e Mon Sep 17 00:00:00 2001 From: Rok Date: Tue, 4 Feb 2020 22:24:41 +0100 Subject: [PATCH 17/18] Implementing review feedback. --- cpp/src/arrow/python/serialize.cc | 5 ++- cpp/src/arrow/sparse_tensor.cc | 60 ++++++++++++++++------------- cpp/src/arrow/sparse_tensor.h | 12 ++---- cpp/src/arrow/sparse_tensor_test.cc | 34 ++++------------ 4 files changed, 47 insertions(+), 64 deletions(-) diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index 88d763b7877..06c85648591 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -664,11 +664,12 @@ Status CountSparseTensors( case SparseTensorFormat::CSR: ++num_csr; break; - case SparseTensorFormat::CSC: - // TODO(mrkn): support csc case SparseTensorFormat::CSF: ++num_csf; break; + case SparseTensorFormat::CSC: + // TODO(mrkn): support csc + break; } } diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 609cd2321d6..0a6a91ab9d4 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -60,7 +60,8 @@ class SparseTensorConverter { // ---------------------------------------------------------------------- // IncrementIndex for SparseCOOIndex and SparseCSFIndex -void IncrementIndex(std::vector& coord, const std::vector shape) { +inline void IncrementIndex(std::vector& coord, + const std::vector& shape) { const int64_t ndim = shape.size(); ++coord[ndim - 1]; if (coord[ndim - 1] == shape[ndim - 1]) { @@ -73,8 +74,8 @@ void IncrementIndex(std::vector& coord, const std::vector shap } } -void IncrementIndex(std::vector& coord, const std::vector shape, - std::vector axis_order) { +inline void IncrementIndex(std::vector& coord, const std::vector& shape, + const std::vector& axis_order) 
{ const int64_t ndim = shape.size(); const int64_t last_axis = axis_order[ndim - 1]; ++coord[last_axis]; @@ -161,9 +162,7 @@ class SparseTensorConverter *indices++ = static_cast(coord[i]); } } - if (n > 1) { - IncrementIndex(coord, shape); - } + IncrementIndex(coord, shape); } } @@ -496,11 +495,9 @@ class SparseTensorConverter for (int64_t i = 0; i < ndim; ++i) { int64_t dimension = axis_order[i]; - bool change = coord[dimension] != previous_coord[dimension]; - - if (tree_split || change) { - if (change) tree_split = true; + tree_split = tree_split || (coord[dimension] != previous_coord[dimension]); + if (tree_split) { if (i < ndim - 1) { RETURN_NOT_OK(indptr_buffer_builders[i].Append( static_cast(counts[i + 1]))); @@ -512,9 +509,7 @@ class SparseTensorConverter } previous_coord = coord; } - if (n > 1) { - IncrementIndex(coord, shape, axis_order); - } + IncrementIndex(coord, shape, axis_order); } } @@ -682,25 +677,26 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor, namespace { template -void ExpandSparseCSFTensorValues(int64_t dimension, int64_t offset, int64_t first_ptr, - int64_t last_ptr, const SparseCSFIndex* sparse_index, - const TYPE* raw_data, const std::vector strides, - const std::vector axis_order, TYPE* out) { +void ExpandSparseCSFTensorValues(int64_t dimension, int64_t dense_offset, + int64_t first_ptr, int64_t last_ptr, + const SparseCSFIndex& sparse_index, const TYPE* raw_data, + const std::vector& strides, + const std::vector& axis_order, TYPE* out) { int64_t ndim = axis_order.size(); for (int64_t i = first_ptr; i < last_ptr; ++i) { - int64_t tmp_offset = - offset + sparse_index->indices()[dimension]->Value({i}) * - strides[axis_order[dimension]]; + int64_t tmp_dense_offset = + dense_offset + sparse_index.indices()[dimension]->Value({i}) * + strides[axis_order[dimension]]; if (dimension < ndim - 1) { ExpandSparseCSFTensorValues( - dimension + 1, tmp_offset, - sparse_index->indptr()[dimension]->Value({i}), - 
sparse_index->indptr()[dimension]->Value({i + 1}), sparse_index, + dimension + 1, tmp_dense_offset, + sparse_index.indptr()[dimension]->Value({i}), + sparse_index.indptr()[dimension]->Value({i + 1}), sparse_index, raw_data, strides, axis_order, out); } else { - out[tmp_offset] = raw_data[i]; + out[tmp_dense_offset] = raw_data[i]; } } } @@ -797,7 +793,7 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t internal::checked_cast(*sparse_tensor->sparse_index()); ExpandSparseCSFTensorValues( - 0, 0, 0, sparse_index.indptr()[0]->size() - 1, &sparse_index, raw_data, strides, + 0, 0, 0, sparse_index.indptr()[0]->size() - 1, sparse_index, raw_data, strides, sparse_index.axis_order(), values); *out = std::make_shared(sparse_tensor->type(), values_buffer, sparse_tensor->shape(), empty_strides, @@ -995,11 +991,11 @@ inline Status CheckSparseCSFIndexValidity(const std::shared_ptr& indpt } if (num_indptrs + 1 != num_indices) { return Status::Invalid( - "Length of indices must be equal to length of inptrs + 1 for SparseCSFIndex."); + "Length of indices must be equal to length of indptrs + 1 for SparseCSFIndex."); } if (axis_order_size != num_indices) { return Status::Invalid( - "Length of indices must be equal number of dimensions for SparseCSFIndex."); + "Length of indices must be equal to number of dimensions for SparseCSFIndex."); } return Status::OK(); } @@ -1045,6 +1041,16 @@ SparseCSFIndex::SparseCSFIndex(std::vector>& indptr, std::string SparseCSFIndex::ToString() const { return std::string("SparseCSFIndex"); } +bool SparseCSFIndex::Equals(const SparseCSFIndex& other) const { + for (int64_t i = 0; i < static_cast(indices().size()); ++i) { + if (!indices()[i]->Equals(*other.indices()[i])) return false; + } + for (int64_t i = 0; i < static_cast(indptr().size()); ++i) { + if (!indptr()[i]->Equals(*other.indptr()[i])) return false; + } + return axis_order() == other.axis_order(); +} + // 
---------------------------------------------------------------------- // SparseTensor diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 64e730b78d3..4071f31c5ca 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -372,10 +372,10 @@ class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase>& indices, const std::vector& axis_order); - /// \brief Return a 1D tensor of indptr vector + /// \brief Return a 1D vector of indptr tensors const std::vector>& indptr() const { return indptr_; } - /// \brief Return a 1D tensor of indices vector + /// \brief Return a 1D vector of indices tensors const std::vector>& indices() const { return indices_; } /// \brief Return a 1D vector specifying the order of axes @@ -385,13 +385,7 @@ class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase(indices().size()); ++i) - if (!indices()[i]->Equals(*other.indices()[i])) return false; - for (int64_t i = 0; i < static_cast(indptr().size()); ++i) - if (!indptr()[i]->Equals(*other.indptr()[i])) return false; - return axis_order() == other.axis_order(); - } + bool Equals(const SparseCSFIndex& other) const; protected: std::vector> indptr_; diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 2b5186acda2..d6e88324934 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -26,11 +26,11 @@ #include -#include #include "arrow/sparse_tensor.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" #include "arrow/type.h" +#include "arrow/util/sort.h" namespace arrow { @@ -949,7 +949,7 @@ class TestSparseCSFTensorForIndexValueType : public TestSparseCSFTensorBase { protected: std::shared_ptr MakeSparseCSFIndex( - const std::vector axis_order, + const std::vector& axis_order, std::vector>& indptr_values, std::vector>& indices_values) const { int64_t ndim = axis_order.size(); @@ -972,7 +972,8 @@ class 
TestSparseCSFTensorForIndexValueType template std::shared_ptr MakeSparseTensor( const std::shared_ptr& si, std::vector& sparse_values, - const std::vector shape, const std::vector dim_names) const { + const std::vector& shape, + const std::vector& dim_names) const { auto data_buffer = Buffer::Wrap(sparse_values); return std::make_shared( si, CTypeTraits::type_singleton(), data_buffer, shape, dim_names); @@ -1001,29 +1002,10 @@ TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestCreateSparseTensor) { } TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestTensorToSparseTensor) { - using IndexValueType = TypeParam; - std::vector shape = {2, 3, 4, 5}; - int16_t dense_values[2][3][4][5] = {}; // zero-initialized - dense_values[0][0][0][1] = 1; - dense_values[0][0][0][2] = 2; - dense_values[0][1][0][0] = 3; - dense_values[0][1][0][2] = 4; - dense_values[0][1][1][0] = 5; - dense_values[1][1][1][0] = 6; - dense_values[1][1][1][1] = 7; - dense_values[1][1][1][2] = 8; - auto dense_buffer = Buffer::Wrap(dense_values, sizeof(dense_values)); - Tensor dense_tensor(int16(), dense_buffer, shape, {}, this->dim_names_); - - std::shared_ptr sparse_tensor; - ASSERT_OK_AND_ASSIGN( - sparse_tensor, - SparseCSFTensor::Make(dense_tensor, TypeTraits::type_singleton())); - - ASSERT_EQ(8, sparse_tensor->non_zero_length()); - ASSERT_TRUE(sparse_tensor->is_mutable()); - ASSERT_TRUE(sparse_tensor->Equals(*this->sparse_tensor_from_dense_)); - ASSERT_EQ(sparse_tensor->dim_names(), dense_tensor.dim_names()); + std::vector dim_names = {"a", "b", "c", "d"}; + ASSERT_EQ(8, this->sparse_tensor_from_dense_->non_zero_length()); + ASSERT_TRUE(this->sparse_tensor_from_dense_->is_mutable()); + ASSERT_EQ(dim_names, this->sparse_tensor_from_dense_->dim_names()); } TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestSparseTensorToTensor) { From 9ca93ab60da5ddfe8ab2d83615cc905b55658c47 Mon Sep 17 00:00:00 2001 From: Rok Date: Wed, 5 Feb 2020 14:07:56 +0100 Subject: [PATCH 18/18] Implementing review 
feedback. --- cpp/src/arrow/sparse_tensor.cc | 8 ++++---- cpp/src/arrow/sparse_tensor.h | 12 ++++++------ cpp/src/arrow/sparse_tensor_test.cc | 5 +++-- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 0a6a91ab9d4..549223c0798 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -1006,8 +1006,8 @@ Result> SparseCSFIndex::Make( const std::shared_ptr& indptr_type, const std::shared_ptr& indices_type, const std::vector& indices_shapes, const std::vector& axis_order, - std::vector> indptr_data, - std::vector> indices_data) { + const std::vector>& indptr_data, + const std::vector>& indices_data) { int64_t ndim = axis_order.size(); std::vector> indptr(ndim - 1); std::vector> indices(ndim); @@ -1027,8 +1027,8 @@ Result> SparseCSFIndex::Make( } // Constructor with two index vectors -SparseCSFIndex::SparseCSFIndex(std::vector>& indptr, - std::vector>& indices, +SparseCSFIndex::SparseCSFIndex(const std::vector>& indptr, + const std::vector>& indices, const std::vector& axis_order) : SparseIndexBase(indices.back()->size()), indptr_(indptr), diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 4071f31c5ca..33a53761e14 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -354,22 +354,22 @@ class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase& indptr_type, const std::shared_ptr& indices_type, const std::vector& indices_shapes, const std::vector& axis_order, - std::vector> indptr_data, - std::vector> indices_data); + const std::vector>& indptr_data, + const std::vector>& indices_data); /// \brief Make SparseCSFIndex from raw properties static Result> Make( const std::shared_ptr& indices_type, const std::vector& indices_shapes, const std::vector& axis_order, - std::vector> indptr_data, - std::vector> indices_data) { + const std::vector>& indptr_data, + const std::vector>& indices_data) { return 
Make(indices_type, indices_type, indices_shapes, axis_order, indptr_data, indices_data); } /// \brief Construct SparseCSFIndex from two index vectors - explicit SparseCSFIndex(std::vector>& indptr, - std::vector>& indices, + explicit SparseCSFIndex(const std::vector>& indptr, + const std::vector>& indices, const std::vector& axis_order); /// \brief Return a 1D vector of indptr tensors diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index d6e88324934..45cb8dcc8f3 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -950,8 +950,9 @@ class TestSparseCSFTensorForIndexValueType protected: std::shared_ptr MakeSparseCSFIndex( const std::vector& axis_order, - std::vector>& indptr_values, - std::vector>& indices_values) const { + const std::vector>& indptr_values, + const std::vector>& indices_values) + const { int64_t ndim = axis_order.size(); std::vector> indptr(ndim - 1); std::vector> indices(ndim);