From 308a0f35ee8681af35ee5e732b7388dda12690b5 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Tue, 26 Nov 2019 18:22:13 +0900 Subject: [PATCH 01/15] Replace SparseMatrixIndesCSR with SparseMatrixIndexCSX --- format/SparseTensor.fbs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index 96d954d1edf..c1b0491454b 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -64,8 +64,13 @@ table SparseTensorIndexCOO { indicesBuffer: Buffer; } -/// Compressed Sparse Row format, that is matrix-specific. -table SparseMatrixIndexCSR { +enum SparseMatrixCompressedAxis: short { Row, Column } + +/// Compressed Sparse format, that is matrix-specific. +table SparseMatrixIndexCSX { + /// Which axis, row or column, is compressed + compressedAxis: SparseMatrixCompressedAxis; + /// The type of values in indptrBuffer indptrType: Int; @@ -110,7 +115,7 @@ table SparseMatrixIndexCSR { union SparseTensorIndex { SparseTensorIndexCOO, - SparseMatrixIndexCSR + SparseMatrixIndexCSX, } table SparseTensor { From 6f85b4cef07dd3344dd7e14020f19bcd52479f52 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Tue, 3 Dec 2019 15:33:21 +0900 Subject: [PATCH 02/15] Update flatbuffers generated files --- cpp/src/generated/SparseTensor_generated.h | 103 ++++++++++++++------- 1 file changed, 72 insertions(+), 31 deletions(-) diff --git a/cpp/src/generated/SparseTensor_generated.h b/cpp/src/generated/SparseTensor_generated.h index 17c04ca6848..cb590b302d4 100644 --- a/cpp/src/generated/SparseTensor_generated.h +++ b/cpp/src/generated/SparseTensor_generated.h @@ -16,23 +16,53 @@ namespace flatbuf { struct SparseTensorIndexCOO; -struct SparseMatrixIndexCSR; +struct SparseMatrixIndexCSX; struct SparseTensor; +enum SparseMatrixCompressedAxis { + SparseMatrixCompressedAxis_Row = 0, + SparseMatrixCompressedAxis_Column = 1, + SparseMatrixCompressedAxis_MIN = SparseMatrixCompressedAxis_Row, + SparseMatrixCompressedAxis_MAX = SparseMatrixCompressedAxis_Column +}; + +inline const SparseMatrixCompressedAxis (&EnumValuesSparseMatrixCompressedAxis())[2] { + static const SparseMatrixCompressedAxis values[] = { + SparseMatrixCompressedAxis_Row, + SparseMatrixCompressedAxis_Column + }; + return values; +} + +inline const char * const *EnumNamesSparseMatrixCompressedAxis() { + static const char * const names[3] = { + "Row", + "Column", + nullptr + }; + return names; +} + +inline const char *EnumNameSparseMatrixCompressedAxis(SparseMatrixCompressedAxis e) { + if (e < SparseMatrixCompressedAxis_Row || e > SparseMatrixCompressedAxis_Column) return ""; + const size_t index = static_cast(e); + return EnumNamesSparseMatrixCompressedAxis()[index]; +} + enum SparseTensorIndex { SparseTensorIndex_NONE = 0, SparseTensorIndex_SparseTensorIndexCOO = 1, - SparseTensorIndex_SparseMatrixIndexCSR = 2, + SparseTensorIndex_SparseMatrixIndexCSX = 2, SparseTensorIndex_MIN = SparseTensorIndex_NONE, - SparseTensorIndex_MAX = SparseTensorIndex_SparseMatrixIndexCSR + SparseTensorIndex_MAX = SparseTensorIndex_SparseMatrixIndexCSX }; inline const SparseTensorIndex (&EnumValuesSparseTensorIndex())[3] { static const SparseTensorIndex values[] = { SparseTensorIndex_NONE, SparseTensorIndex_SparseTensorIndexCOO, - SparseTensorIndex_SparseMatrixIndexCSR + SparseTensorIndex_SparseMatrixIndexCSX }; return values; } @@ -41,14 +71,14 @@ inline const char * const *EnumNamesSparseTensorIndex() { static const char * const names[4] = { "NONE", "SparseTensorIndexCOO", - "SparseMatrixIndexCSR", + "SparseMatrixIndexCSX", nullptr }; return names; } inline const char *EnumNameSparseTensorIndex(SparseTensorIndex e) { - if (e < SparseTensorIndex_NONE || e > SparseTensorIndex_SparseMatrixIndexCSR) return ""; + if (e < SparseTensorIndex_NONE || e > SparseTensorIndex_SparseMatrixIndexCSX) return ""; const size_t index = static_cast(e); return EnumNamesSparseTensorIndex()[index]; } @@ -61,8 +91,8 @@ template<> struct SparseTensorIndexTraits struct SparseTensorIndexTraits { - static const SparseTensorIndex enum_value = SparseTensorIndex_SparseMatrixIndexCSR; +template<> struct SparseTensorIndexTraits { + static const SparseTensorIndex enum_value = SparseTensorIndex_SparseMatrixIndexCSX; }; bool VerifySparseTensorIndex(flatbuffers::Verifier &verifier, const void *obj, SparseTensorIndex type); @@ -176,14 +206,19 @@ inline flatbuffers::Offset CreateSparseTensorIndexCOODirec indicesBuffer); } -/// Compressed Sparse Row format, that is matrix-specific. -struct SparseMatrixIndexCSR FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { +/// Compressed Sparse format, that is matrix-specific. +struct SparseMatrixIndexCSX FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_INDPTRTYPE = 4, - VT_INDPTRBUFFER = 6, - VT_INDICESTYPE = 8, - VT_INDICESBUFFER = 10 + VT_COMPRESSEDAXIS = 4, + VT_INDPTRTYPE = 6, + VT_INDPTRBUFFER = 8, + VT_INDICESTYPE = 10, + VT_INDICESBUFFER = 12 }; + /// Which axis, row or column, is compressed + org::apache::arrow::flatbuf::SparseMatrixCompressedAxis compressedAxis() const { + return static_cast(GetField(VT_COMPRESSEDAXIS, 0)); + } /// The type of values in indptrBuffer const org::apache::arrow::flatbuf::Int *indptrType() const { return GetPointer(VT_INDPTRTYPE); @@ -231,6 +266,7 @@ struct SparseMatrixIndexCSR FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && + VerifyField(verifier, VT_COMPRESSEDAXIS) && VerifyOffset(verifier, VT_INDPTRTYPE) && verifier.VerifyTable(indptrType()) && VerifyField(verifier, VT_INDPTRBUFFER) && @@ -241,44 +277,49 @@ struct SparseMatrixIndexCSR FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table } }; -struct SparseMatrixIndexCSRBuilder { +struct SparseMatrixIndexCSXBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; + void add_compressedAxis(org::apache::arrow::flatbuf::SparseMatrixCompressedAxis compressedAxis) { + fbb_.AddElement(SparseMatrixIndexCSX::VT_COMPRESSEDAXIS, static_cast(compressedAxis), 0); + } void add_indptrType(flatbuffers::Offset indptrType) { - fbb_.AddOffset(SparseMatrixIndexCSR::VT_INDPTRTYPE, indptrType); + fbb_.AddOffset(SparseMatrixIndexCSX::VT_INDPTRTYPE, indptrType); } void add_indptrBuffer(const org::apache::arrow::flatbuf::Buffer *indptrBuffer) { - fbb_.AddStruct(SparseMatrixIndexCSR::VT_INDPTRBUFFER, indptrBuffer); + fbb_.AddStruct(SparseMatrixIndexCSX::VT_INDPTRBUFFER, indptrBuffer); } void add_indicesType(flatbuffers::Offset indicesType) { - fbb_.AddOffset(SparseMatrixIndexCSR::VT_INDICESTYPE, indicesType); + fbb_.AddOffset(SparseMatrixIndexCSX::VT_INDICESTYPE, indicesType); } void add_indicesBuffer(const org::apache::arrow::flatbuf::Buffer *indicesBuffer) { - fbb_.AddStruct(SparseMatrixIndexCSR::VT_INDICESBUFFER, indicesBuffer); + fbb_.AddStruct(SparseMatrixIndexCSX::VT_INDICESBUFFER, indicesBuffer); } - explicit SparseMatrixIndexCSRBuilder(flatbuffers::FlatBufferBuilder &_fbb) + explicit SparseMatrixIndexCSXBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); } - SparseMatrixIndexCSRBuilder &operator=(const SparseMatrixIndexCSRBuilder &); - flatbuffers::Offset Finish() { + SparseMatrixIndexCSXBuilder &operator=(const SparseMatrixIndexCSXBuilder &); + flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); + auto o = flatbuffers::Offset(end); return o; } }; -inline flatbuffers::Offset CreateSparseMatrixIndexCSR( +inline flatbuffers::Offset CreateSparseMatrixIndexCSX( flatbuffers::FlatBufferBuilder &_fbb, + org::apache::arrow::flatbuf::SparseMatrixCompressedAxis compressedAxis = org::apache::arrow::flatbuf::SparseMatrixCompressedAxis_Row, flatbuffers::Offset indptrType = 0, const org::apache::arrow::flatbuf::Buffer *indptrBuffer = 0, flatbuffers::Offset indicesType = 0, const org::apache::arrow::flatbuf::Buffer *indicesBuffer = 0) { - SparseMatrixIndexCSRBuilder builder_(_fbb); + SparseMatrixIndexCSXBuilder builder_(_fbb); builder_.add_indicesBuffer(indicesBuffer); builder_.add_indicesType(indicesType); builder_.add_indptrBuffer(indptrBuffer); builder_.add_indptrType(indptrType); + builder_.add_compressedAxis(compressedAxis); return builder_.Finish(); } @@ -384,8 +425,8 @@ struct SparseTensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const org::apache::arrow::flatbuf::SparseTensorIndexCOO *sparseIndex_as_SparseTensorIndexCOO() const { return sparseIndex_type() == org::apache::arrow::flatbuf::SparseTensorIndex_SparseTensorIndexCOO ? static_cast(sparseIndex()) : nullptr; } - const org::apache::arrow::flatbuf::SparseMatrixIndexCSR *sparseIndex_as_SparseMatrixIndexCSR() const { - return sparseIndex_type() == org::apache::arrow::flatbuf::SparseTensorIndex_SparseMatrixIndexCSR ? static_cast(sparseIndex()) : nullptr; + const org::apache::arrow::flatbuf::SparseMatrixIndexCSX *sparseIndex_as_SparseMatrixIndexCSX() const { + return sparseIndex_type() == org::apache::arrow::flatbuf::SparseTensorIndex_SparseMatrixIndexCSX ? static_cast(sparseIndex()) : nullptr; } /// The location and size of the tensor's data const org::apache::arrow::flatbuf::Buffer *data() const { @@ -496,8 +537,8 @@ template<> inline const org::apache::arrow::flatbuf::SparseTensorIndexCOO *Spars return sparseIndex_as_SparseTensorIndexCOO(); } -template<> inline const org::apache::arrow::flatbuf::SparseMatrixIndexCSR *SparseTensor::sparseIndex_as() const { - return sparseIndex_as_SparseMatrixIndexCSR(); +template<> inline const org::apache::arrow::flatbuf::SparseMatrixIndexCSX *SparseTensor::sparseIndex_as() const { + return sparseIndex_as_SparseMatrixIndexCSX(); } struct SparseTensorBuilder { @@ -586,8 +627,8 @@ inline bool VerifySparseTensorIndex(flatbuffers::Verifier &verifier, const void auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } - case SparseTensorIndex_SparseMatrixIndexCSR: { - auto ptr = reinterpret_cast(obj); + case SparseTensorIndex_SparseMatrixIndexCSX: { + auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } default: return true; From 60e6b2a1f4ec34ab85fcb65195c868045ebd40b4 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 27 Nov 2019 17:06:37 +0900 Subject: [PATCH 03/15] Introduce internal::SparseCSXIndex --- cpp/src/arrow/ipc/metadata_internal.cc | 27 +++-- cpp/src/arrow/ipc/metadata_internal.h | 2 +- cpp/src/arrow/ipc/reader.cc | 57 +++++++---- cpp/src/arrow/python/serialize.cc | 3 + cpp/src/arrow/sparse_tensor.cc | 68 +++++-------- cpp/src/arrow/sparse_tensor.h | 133 +++++++++++++++++++------ 6 files changed, 185 insertions(+), 105 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index e9dbf6a0404..960b029c924 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -992,7 +992,7 @@ Status MakeSparseMatrixIndexCSR(FBB& fbb, const SparseCSRIndex& sparse_index, const std::vector& buffers, flatbuf::SparseTensorIndex* fb_sparse_index_type, Offset* fb_sparse_index, size_t* num_buffers) { - *fb_sparse_index_type = flatbuf::SparseTensorIndex_SparseMatrixIndexCSR; + *fb_sparse_index_type = flatbuf::SparseTensorIndex_SparseMatrixIndexCSX; // We assume that the value type of indptr tensor is an integer. const auto& indptr_value_type = @@ -1012,8 +1012,9 @@ Status MakeSparseMatrixIndexCSR(FBB& fbb, const SparseCSRIndex& sparse_index, const BufferMetadata& indices_metadata = buffers[1]; flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length); - *fb_sparse_index = flatbuf::CreateSparseMatrixIndexCSR(fbb, indptr_type_offset, &indptr, - indices_type_offset, &indices) + *fb_sparse_index = flatbuf::CreateSparseMatrixIndexCSX( + fbb, flatbuf::SparseMatrixCompressedAxis_Row, indptr_type_offset, + &indptr, indices_type_offset, &indices) .Union(); *num_buffers = 2; return Status::OK(); @@ -1225,7 +1226,7 @@ Status GetSparseCOOIndexMetadata(const flatbuf::SparseTensorIndexCOO* sparse_ind return IntFromFlatbuffer(sparse_index->indicesType(), indices_type); } -Status GetSparseCSRIndexMetadata(const flatbuf::SparseMatrixIndexCSR* sparse_index, +Status GetSparseCSXIndexMetadata(const flatbuf::SparseMatrixIndexCSX* sparse_index, std::shared_ptr* indptr_type, std::shared_ptr* indices_type) { RETURN_NOT_OK(IntFromFlatbuffer(sparse_index->indptrType(), indptr_type)); @@ -1276,9 +1277,21 @@ Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr *sparse_tensor_format_id = SparseTensorFormat::COO; break; - case flatbuf::SparseTensorIndex_SparseMatrixIndexCSR: - *sparse_tensor_format_id = SparseTensorFormat::CSR; - break; + case flatbuf::SparseTensorIndex_SparseMatrixIndexCSX: { + auto cs = sparse_tensor->sparseIndex_as_SparseMatrixIndexCSX(); + switch (cs->compressedAxis()) { + case flatbuf::SparseMatrixCompressedAxis_Row: + *sparse_tensor_format_id = SparseTensorFormat::CSR; + break; + + case flatbuf::SparseMatrixCompressedAxis_Column: + *sparse_tensor_format_id = SparseTensorFormat::CSC; + break; + + default: + return Status::Invalid("Invalid value of SparseMatrixCompressedAxis"); + } + } break; default: return Status::Invalid("Unrecognized sparse index type"); diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h index 82678bd4ee1..1f4b083fb13 100644 --- a/cpp/src/arrow/ipc/metadata_internal.h +++ b/cpp/src/arrow/ipc/metadata_internal.h @@ -112,7 +112,7 @@ Status GetSparseCOOIndexMetadata(const flatbuf::SparseTensorIndexCOO* sparse_ind std::shared_ptr* indices_type); // EXPERIMENTAL: Extracting metadata of a SparseCSRIndex from the message -Status GetSparseCSRIndexMetadata(const flatbuf::SparseMatrixIndexCSR* sparse_index, +Status GetSparseCSXIndexMetadata(const flatbuf::SparseMatrixIndexCSX* sparse_index, std::shared_ptr* indptr_type, std::shared_ptr* indices_type); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 6fa387ceb29..eb482e08412 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -859,9 +859,9 @@ Status ReadTensor(const Message& message, std::shared_ptr* out) { namespace { -Status ReadSparseCOOIndex(const flatbuf::SparseTensor* sparse_tensor, - const std::vector& shape, int64_t non_zero_length, - io::RandomAccessFile* file, std::shared_ptr* out) { +Result> ReadSparseCOOIndex( + const flatbuf::SparseTensor* sparse_tensor, const std::vector& shape, + int64_t non_zero_length, io::RandomAccessFile* file) { auto* sparse_index = sparse_tensor->sparseIndex_as_SparseTensorIndexCOO(); std::shared_ptr indices_type; @@ -877,19 +877,18 @@ Status ReadSparseCOOIndex(const flatbuf::SparseTensor* sparse_tensor, // Assume indices_strides is a 2-length array. strides.push_back(indices_strides->Get(0)); strides.push_back(indices_strides->Get(1)); - *out = std::make_shared( + return std::make_shared( std::make_shared(indices_type, indices_data, indices_shape, strides)); - return Status::OK(); } -Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor, - const std::vector& shape, int64_t non_zero_length, - io::RandomAccessFile* file, std::shared_ptr* out) { - auto* sparse_index = sparse_tensor->sparseIndex_as_SparseMatrixIndexCSR(); +Result> ReadSparseCSXIndex( + const flatbuf::SparseTensor* sparse_tensor, const std::vector& shape, + int64_t non_zero_length, io::RandomAccessFile* file) { + auto* sparse_index = sparse_tensor->sparseIndex_as_SparseMatrixIndexCSX(); std::shared_ptr indptr_type, indices_type; RETURN_NOT_OK( - internal::GetSparseCSRIndexMetadata(sparse_index, &indptr_type, &indices_type)); + internal::GetSparseCSXIndexMetadata(sparse_index, &indptr_type, &indices_type)); auto* indptr_buffer = sparse_index->indptrBuffer(); ARROW_ASSIGN_OR_RAISE(auto indptr_data, @@ -902,10 +901,18 @@ Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor, std::vector indptr_shape({shape[0] + 1}); std::vector indices_shape({non_zero_length}); - *out = std::make_shared( - std::make_shared(indptr_type, indptr_data, indptr_shape), - std::make_shared(indices_type, indices_data, indices_shape)); - return Status::OK(); + switch (sparse_index->compressedAxis()) { + case flatbuf::SparseMatrixCompressedAxis_Row: + return std::make_shared( + std::make_shared(indptr_type, indptr_data, indptr_shape), + std::make_shared(indices_type, indices_data, indices_shape)); + + case flatbuf::SparseMatrixCompressedAxis_Column: + return Status::NotImplemented("CSC sparse index is not supported"); + + default: + return Status::Invalid("Invalid value of SparseMatrixCompressedAxis"); + } } Status MakeSparseTensorWithSparseCOOIndex( @@ -1039,8 +1046,8 @@ Status ReadSparseTensorPayload(const IpcPayload& payload, std::shared_ptr sparse_index; std::shared_ptr indptr_type; std::shared_ptr indices_type; - RETURN_NOT_OK(internal::GetSparseCSRIndexMetadata( - sparse_tensor->sparseIndex_as_SparseMatrixIndexCSR(), &indptr_type, + RETURN_NOT_OK(internal::GetSparseCSXIndexMetadata( + sparse_tensor->sparseIndex_as_SparseMatrixIndexCSX(), &indptr_type, &indices_type)); ARROW_CHECK_EQ(indptr_type, indices_type); ARROW_ASSIGN_OR_RAISE( @@ -1078,19 +1085,29 @@ Status ReadSparseTensor(const Buffer& metadata, io::RandomAccessFile* file, std::shared_ptr sparse_index; switch (sparse_tensor_format_id) { case SparseTensorFormat::COO: - RETURN_NOT_OK( - ReadSparseCOOIndex(sparse_tensor, shape, non_zero_length, file, &sparse_index)); + RETURN_NOT_OK(ReadSparseCOOIndex(sparse_tensor, shape, non_zero_length, file) + .Value(&sparse_index)); return MakeSparseTensorWithSparseCOOIndex( type, shape, dim_names, checked_pointer_cast(sparse_index), non_zero_length, data, out); case SparseTensorFormat::CSR: - RETURN_NOT_OK( - ReadSparseCSRIndex(sparse_tensor, shape, non_zero_length, file, &sparse_index)); + RETURN_NOT_OK(ReadSparseCSXIndex(sparse_tensor, shape, non_zero_length, file) + .Value(&sparse_index)); return MakeSparseTensorWithSparseCSRIndex( type, shape, dim_names, checked_pointer_cast(sparse_index), non_zero_length, data, out); + case SparseTensorFormat::CSC: + return Status::NotImplemented("CSC sparse index is not supported"); // TODO: + + // RETURN_NOT_OK( + // ReadSparseCSXIndex(sparse_tensor, shape, non_zero_length, + // file).Value(&sparse_index)); + // return MakeSparseTensorWithSparseCSCIndex( + // type, shape, dim_names, checked_pointer_cast(sparse_index), + // non_zero_length, data, out); + default: return Status::Invalid("Unsupported sparse index format"); } diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index eeb85bd0075..09a322b1060 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -663,6 +663,9 @@ Status CountSparseTensors( case SparseTensorFormat::CSR: ++num_csr; break; + case SparseTensorFormat::CSC: + // TODO(mrkn): support csc + break; } } diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 6a9a120a74a..4756e7b998a 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -337,7 +337,7 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor, pool); RETURN_NOT_OK(converter.Convert()); - *out_sparse_index = converter.sparse_index; + *out_sparse_index = checked_pointer_cast(converter.sparse_index); *out_data = converter.data; return Status::OK(); } @@ -446,6 +446,10 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t sparse_tensor->shape()); return Status::OK(); } + + case SparseTensorFormat::CSC: { + return Status::NotImplemented("CSC format is not implemented yet"); + } } return Status::NotImplemented("Unsupported SparseIndex format type"); } @@ -564,64 +568,40 @@ SparseCOOIndex::SparseCOOIndex(const std::shared_ptr& coords) std::string SparseCOOIndex::ToString() const { return std::string("SparseCOOIndex"); } // ---------------------------------------------------------------------- -// SparseCSRIndex +// SparseCSXIndex -namespace { +namespace internal { -inline Status CheckSparseCSRIndexValidity(const std::shared_ptr& indptr_type, - const std::shared_ptr& indices_type, - const std::vector& indptr_shape, - const std::vector& indices_shape) { +Status ValidateSparseCSXIndex(const std::shared_ptr& indptr_type, + const std::shared_ptr& indices_type, + const std::vector& indptr_shape, + const std::vector& indices_shape, + char const* type_name) { if (!is_integer(indptr_type->id())) { - return Status::Invalid("Type of SparseCSRIndex indptr must be integer"); + return Status::Invalid("Type of ", type_name, " indptr must be integer"); } if (indptr_shape.size() != 1) { - return Status::Invalid("SparseCSRIndex indptr must be a vector"); + return Status::Invalid(type_name, " indptr must be a vector"); } if (!is_integer(indices_type->id())) { - return Status::Invalid("Type of SparseCSRIndex indices must be integer"); + return Status::Invalid("Type of ", type_name, " indices must be integer"); } if (indices_shape.size() != 1) { - return Status::Invalid("SparseCSRIndex indices must be a vector"); + return Status::Invalid(type_name, " indices must be a vector"); } return Status::OK(); } -} // namespace - -Result> SparseCSRIndex::Make( - const std::shared_ptr& indptr_type, - const std::shared_ptr& indices_type, - const std::vector& indptr_shape, const std::vector& indices_shape, - std::shared_ptr indptr_data, std::shared_ptr indices_data) { - RETURN_NOT_OK(CheckSparseCSRIndexValidity(indptr_type, indices_type, indptr_shape, - indices_shape)); - return std::make_shared( - std::make_shared(indptr_type, indptr_data, indptr_shape), - std::make_shared(indices_type, indices_data, indices_shape)); +void CheckSparseCSXIndexValidity(const std::shared_ptr& indptr_type, + const std::shared_ptr& indices_type, + const std::vector& indptr_shape, + const std::vector& indices_shape, + char const* type_name) { + ARROW_CHECK_OK(ValidateSparseCSXIndex(indptr_type, indices_type, indptr_shape, + indices_shape, type_name)); } -Result> SparseCSRIndex::Make( - const std::shared_ptr& indptr_type, - const std::shared_ptr& indices_type, const std::vector& shape, - int64_t non_zero_length, std::shared_ptr indptr_data, - std::shared_ptr indices_data) { - std::vector indptr_shape({shape[0] + 1}); - std::vector indices_shape({non_zero_length}); - return Make(indptr_type, indices_type, indptr_shape, indices_shape, indptr_data, - indices_data); -} - -// Constructor with two index vectors -SparseCSRIndex::SparseCSRIndex(const std::shared_ptr& indptr, - const std::shared_ptr& indices) - : SparseIndexBase(indices->shape()[0]), indptr_(indptr), indices_(indices) { - ARROW_CHECK(CheckSparseCSRIndexValidity(indptr_->type(), indices_->type(), - indptr_->shape(), indices_->shape()) - .ok()); -} - -std::string SparseCSRIndex::ToString() const { return std::string("SparseCSRIndex"); } +} // namespace internal // ---------------------------------------------------------------------- // SparseTensor diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 5194f2fec3a..2cf85973cd1 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -36,7 +37,9 @@ struct SparseTensorFormat { /// Coordinate list (COO) format. COO, /// Compressed sparse row (CSR) format. - CSR + CSR, + /// Compressed sparse column (CSC) format. + CSC, }; }; @@ -136,34 +139,57 @@ class ARROW_EXPORT SparseCOOIndex : public internal::SparseIndexBase coords_; }; -// ---------------------------------------------------------------------- -// SparseCSRIndex class +namespace internal { -/// \brief EXPERIMENTAL: The index data for a CSR sparse matrix -/// -/// A CSR sparse index manages the location of its non-zero values by two -/// vectors. -/// -/// The first vector, called indptr, represents the range of the rows; the i-th -/// row spans from indptr[i] to indptr[i+1] in the corresponding value vector. -/// So the length of an indptr vector is the number of rows + 1. -/// -/// The other vector, called indices, represents the column indices of the -/// corresponding non-zero values. So the length of an indices vector is same -/// as the number of non-zero-values. -class ARROW_EXPORT SparseCSRIndex : public internal::SparseIndexBase { - public: - static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSR; +struct SparseMatrixCompressedAxis { + /// EXPERIMENTAL: The axis to be compressed + enum type { + /// The value for CSR matrix + ROW, + /// The value for CSC matrix + COLUMN + }; +}; - /// \brief Make SparseCSRIndex from raw properties - static Result> Make( +ARROW_EXPORT +Status ValidateSparseCSXIndex(const std::shared_ptr& indptr_type, + const std::shared_ptr& indices_type, + const std::vector& indptr_shape, + const std::vector& indices_shape, + char const* type_name); + +ARROW_EXPORT +void CheckSparseCSXIndexValidity(const std::shared_ptr& indptr_type, + const std::shared_ptr& indices_type, + const std::vector& indptr_shape, + const std::vector& indices_shape, + char const* type_name); + +template +class ARROW_EXPORT SparseCSXIndex : public SparseIndexBase { + public: + static constexpr SparseTensorFormat::type format_id = std::conditional< + COMPRESSED_AXIS == SparseMatrixCompressedAxis::ROW, + std::integral_constant, + std::integral_constant>::type::value; + + /// \brief Make a subclass of SparseCSXIndex from raw properties + static Result> Make( const std::shared_ptr& indptr_type, const std::shared_ptr& indices_type, const std::vector& indptr_shape, const std::vector& indices_shape, - std::shared_ptr indptr_data, std::shared_ptr indices_data); + std::shared_ptr indptr_data, std::shared_ptr indices_data) { + ARROW_RETURN_NOT_OK(ValidateSparseCSXIndex(indptr_type, indices_type, indptr_shape, + indices_shape, + SparseIndexType::TYPE_NAME)); + return std::make_shared( + std::make_shared(indptr_type, indptr_data, indptr_shape), + std::make_shared(indices_type, indices_data, indices_shape)); + } - /// \brief Make SparseCSRIndex from raw properties - static Result> Make( + /// \brief Make a subclass of SparseCSRIndex from raw properties + static Result> Make( const std::shared_ptr& indices_type, const std::vector& indptr_shape, const std::vector& indices_shape, std::shared_ptr indptr_data, std::shared_ptr indices_data) { @@ -171,15 +197,22 @@ class ARROW_EXPORT SparseCSRIndex : public internal::SparseIndexBase> Make( + /// \brief Make a subclass of SparseCSXIndex from sparse tensor's shape properties and + /// data + static Result> Make( const std::shared_ptr& indptr_type, const std::shared_ptr& indices_type, const std::vector& shape, int64_t non_zero_length, std::shared_ptr indptr_data, - std::shared_ptr indices_data); + std::shared_ptr indices_data) { + std::vector indptr_shape({shape[0] + 1}); + std::vector indices_shape({non_zero_length}); + return Make(indptr_type, indices_type, indptr_shape, indices_shape, indptr_data, + indices_data); + } - /// \brief Make SparseCSRIndex from sparse tensor's shape properties and data - static Result> Make( + /// \brief Make a subclass of SparseCSXIndex from sparse tensor's shape properties and + /// data + static Result> Make( const std::shared_ptr& indices_type, const std::vector& shape, int64_t non_zero_length, std::shared_ptr indptr_data, std::shared_ptr indices_data) { @@ -188,8 +221,14 @@ class ARROW_EXPORT SparseCSRIndex : public internal::SparseIndexBase& indptr, - const std::shared_ptr& indices); + explicit SparseCSXIndex(const std::shared_ptr& indptr, + const std::shared_ptr& indices) + : SparseIndexBase(indices->shape()[0]), + indptr_(indptr), + indices_(indices) { + CheckSparseCSXIndexValidity(indptr_->type(), indices_->type(), indptr_->shape(), + indices_->shape(), SparseIndexType::TYPE_NAME); + } /// \brief Return a 1D tensor of indptr vector const std::shared_ptr& indptr() const { return indptr_; } @@ -198,10 +237,12 @@ class ARROW_EXPORT SparseCSRIndex : public internal::SparseIndexBase& indices() const { return indices_; } /// \brief Return a string representation of the sparse index - std::string ToString() const override; + std::string ToString() const override { + return std::string(SparseIndexType::TYPE_NAME); + } /// \brief Return whether the CSR indices are equal - bool Equals(const SparseCSRIndex& other) const { + bool Equals(const SparseIndexType& other) const { return indptr()->Equals(*other.indptr()) && indices()->Equals(*other.indices()); } @@ -220,7 +261,7 @@ class ARROW_EXPORT SparseCSRIndex : public internal::SparseIndexBase indices_; }; +} // namespace internal + +// ---------------------------------------------------------------------- +// SparseCSRIndex class + +/// \brief EXPERIMENTAL: The index data for a CSR sparse matrix +/// +/// A CSR sparse index manages the location of its non-zero values by two +/// vectors. +/// +/// The first vector, called indptr, represents the range of the rows; the i-th +/// row spans from indptr[i] to indptr[i+1] in the corresponding value vector. +/// So the length of an indptr vector is the number of rows + 1. +/// +/// The other vector, called indices, represents the column indices of the +/// corresponding non-zero values. So the length of an indices vector is same +/// as the number of non-zero-values. +class ARROW_EXPORT SparseCSRIndex + : public internal::SparseCSXIndex { + public: + constexpr static char const* TYPE_NAME = "SparseCSRIndex"; + + using SparseCSXIndex::SparseCSXIndex; +}; + // ---------------------------------------------------------------------- // SparseTensor class From 4ce999f885dd40863e052e71f16eb7492d7510d6 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Mon, 2 Dec 2019 15:18:24 +0900 Subject: [PATCH 04/15] Add SparseCSCIndex and SparseCSCMatrix --- cpp/src/arrow/compare.cc | 12 ++ cpp/src/arrow/sparse_tensor.cc | 151 +++++++++++++++++++++ cpp/src/arrow/sparse_tensor.h | 31 ++++- cpp/src/arrow/sparse_tensor_test.cc | 199 +++++++++++++++++++++++++++- 4 files changed, 388 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index b030d434d72..f7431f80f5f 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -1189,6 +1189,13 @@ inline bool SparseTensorEqualsImplDispatch(const SparseTensorImpl&>(right); + return SparseTensorEqualsImpl::Compare(left, + right_csc); + } + default: return false; } @@ -1220,6 +1227,11 @@ bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right) { return SparseTensorEqualsImplDispatch(left_csr, right); } + case SparseTensorFormat::CSC: { + const auto& left_csc = checked_cast&>(left); + return SparseTensorEqualsImplDispatch(left_csc, right); + } + default: return false; } diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 4756e7b998a..21694a71706 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -301,6 +301,127 @@ class SparseTensorConverter inline Status CheckMaximumValue(const uint64_t) const { return Status::OK(); } }; +// ---------------------------------------------------------------------- +// SparseTensorConverter for SparseCSCIndex + +template +class SparseTensorConverter + : private SparseTensorConverterBase { + public: + using BaseClass = SparseTensorConverterBase; + using NumericTensorType = typename BaseClass::NumericTensorType; + using value_type = typename BaseClass::value_type; + + SparseTensorConverter(const NumericTensorType& tensor, + const std::shared_ptr& index_value_type, + MemoryPool* pool) + : BaseClass(tensor, index_value_type, pool) {} + + template + Status Convert() { + using c_index_value_type = typename IndexValueType::c_type; + RETURN_NOT_OK(CheckMaximumValue(std::numeric_limits::max())); + const int64_t indices_elsize = sizeof(c_index_value_type); + + const int64_t ndim = tensor_.ndim(); + if (ndim > 2) { + // LCOV_EXCL_START: The following invalid causes program failure. + return Status::Invalid("Invalid tensor dimension"); + // LCOV_EXCL_STOP + } + + const int64_t nr = tensor_.shape()[0]; + const int64_t nc = tensor_.shape()[1]; + int64_t nonzero_count = -1; + RETURN_NOT_OK(tensor_.CountNonZero(&nonzero_count)); + + std::shared_ptr indptr_buffer; + std::shared_ptr indices_buffer; + + std::shared_ptr values_buffer; + RETURN_NOT_OK( + AllocateBuffer(pool_, sizeof(value_type) * nonzero_count, &values_buffer)); + value_type* values = reinterpret_cast(values_buffer->mutable_data()); + + if (ndim <= 1) { + return Status::NotImplemented("TODO for ndim <= 1"); + } else { + RETURN_NOT_OK(AllocateBuffer(pool_, indices_elsize * (nc + 1), &indptr_buffer)); + auto* indptr = reinterpret_cast(indptr_buffer->mutable_data()); + + RETURN_NOT_OK( + AllocateBuffer(pool_, indices_elsize * nonzero_count, &indices_buffer)); + auto* indices = + reinterpret_cast(indices_buffer->mutable_data()); + + c_index_value_type k = 0; + *indptr++ = 0; + for (int64_t j = 0; j < nc; ++j) { + for (int64_t i = 0; i < nr; ++i) { + const value_type x = tensor_.Value({i, j}); + if (x != 0) { + *values++ = x; + *indices++ = static_cast(i); + k++; + } + } + *indptr++ = k; + } + } + + std::vector indptr_shape({nc + 1}); + std::shared_ptr indptr_tensor = + std::make_shared(index_value_type_, indptr_buffer, indptr_shape); + + std::vector indices_shape({nonzero_count}); + std::shared_ptr indices_tensor = + std::make_shared(index_value_type_, indices_buffer, indices_shape); + + sparse_index = std::make_shared(indptr_tensor, indices_tensor); + data = values_buffer; + + return Status::OK(); + } + +#define CALL_TYPE_SPECIFIC_CONVERT(TYPE_CLASS) \ + case TYPE_CLASS##Type::type_id: \ + return Convert(); + + Status Convert() { + switch (index_value_type_->id()) { + ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_TYPE_SPECIFIC_CONVERT); + // LCOV_EXCL_START: The following invalid causes program failure. + default: + return Status::Invalid("Unsupported SparseTensor index value type"); + // LCOV_EXCL_STOP + } + } + +#undef CALL_TYPE_SPECIFIC_CONVERT + + std::shared_ptr sparse_index; + std::shared_ptr data; + + private: + using BaseClass::index_value_type_; + using BaseClass::pool_; + using BaseClass::tensor_; + + template + inline Status CheckMaximumValue(const c_value_type type_max) const { + if (static_cast(type_max) < tensor_.shape()[1]) { + // LCOV_EXCL_START: The following invalid causes program failure. + return Status::Invalid("The bit width of the index value type is too small"); + // LCOV_EXCL_STOP + } + return Status::OK(); + } + + inline Status CheckMaximumValue(const int64_t) const { return Status::OK(); } + + inline Status CheckMaximumValue(const uint64_t) const { return Status::OK(); } +}; + // ---------------------------------------------------------------------- // Instantiate templates @@ -319,6 +440,7 @@ class SparseTensorConverter INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCOOIndex); INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSRIndex); +INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSCIndex); } // namespace @@ -378,6 +500,9 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor, case SparseTensorFormat::CSR: return MakeSparseTensorFromTensor(tensor, index_value_type, pool, out_sparse_index, out_data); + case SparseTensorFormat::CSC: + return MakeSparseTensorFromTensor(tensor, index_value_type, pool, + out_sparse_index, out_data); // LCOV_EXCL_START: ignore program failure default: return Status::Invalid("Invalid sparse tensor format"); @@ -448,6 +573,25 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t } case SparseTensorFormat::CSC: { + const auto& sparse_index = + internal::checked_cast(*sparse_tensor->sparse_index()); + const std::shared_ptr indptr = sparse_index.indptr(); + const std::shared_ptr indices = sparse_index.indices(); + const auto raw_data = + reinterpret_cast(sparse_tensor->raw_data()); + + int64_t offset; + for (int64_t j = 0; j < indptr->size() - 1; ++j) { + const int64_t start = indptr->Value({j}); + const int64_t stop = indptr->Value({j + 1}); + for (int64_t i = start; i < stop; ++i) { + offset = j + indices->Value({i}) * sparse_tensor->shape()[1]; + values[offset] = raw_data[i]; + } + } + *out = std::make_shared(sparse_tensor->type(), values_buffer, + sparse_tensor->shape()); + return Status::OK(); return Status::NotImplemented("CSC format is not implemented yet"); } } @@ -478,6 +622,13 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t const std::shared_ptr indices = sparse_index.indices(); type = indices->type(); break; + } + case SparseTensorFormat::CSC: { + const auto& sparse_index = + internal::checked_cast(*sparse_tensor->sparse_index()); + const std::shared_ptr indices = sparse_index.indices(); + type = indices->type(); + break; } // LCOV_EXCL_START: ignore program failure default: diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 2cf85973cd1..1be6196c48e 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -166,7 +166,7 @@ void CheckSparseCSXIndexValidity(const std::shared_ptr& indptr_type, char const* type_name); template -class ARROW_EXPORT SparseCSXIndex : public SparseIndexBase { +class SparseCSXIndex : public SparseIndexBase { public: static constexpr SparseTensorFormat::type format_id = std::conditional< COMPRESSED_AXIS == SparseMatrixCompressedAxis::ROW, @@ -292,6 +292,32 @@ class ARROW_EXPORT SparseCSRIndex public: constexpr static char const* TYPE_NAME = "SparseCSRIndex"; + using SparseCSXIndex::Make; + using SparseCSXIndex::SparseCSXIndex; +}; + +// ---------------------------------------------------------------------- +// SparseCSCIndex class + +/// \brief EXPERIMENTAL: The index data for a CSC sparse matrix +/// +/// A CSC sparse index manages the location of its non-zero values by two +/// vectors. +/// +/// The first vector, called indptr, represents the range of the column; the i-th +/// column spans from indptr[i] to indptr[i+1] in the corresponding value vector. +/// So the length of an indptr vector is the number of columns + 1. +/// +/// The other vector, called indices, represents the row indices of the +/// corresponding non-zero values. So the length of an indices vector is same +/// as the number of non-zero-values. +class ARROW_EXPORT SparseCSCIndex + : public internal::SparseCSXIndex { + public: + constexpr static char const* TYPE_NAME = "SparseCSCIndex"; + + using SparseCSXIndex::Make; using SparseCSXIndex::SparseCSXIndex; }; @@ -486,6 +512,9 @@ using SparseCOOTensor = SparseTensorImpl; /// \brief EXPERIMENTAL: Type alias for CSR sparse matrix using SparseCSRMatrix = SparseTensorImpl; +/// \brief EXPERIMENTAL: Type alias for CSC sparse matrix +using SparseCSCMatrix = SparseTensorImpl; + } // namespace arrow #endif // ARROW_SPARSE_TENSOR_H diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 3cc5e882433..e78b9a395a2 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -91,6 +91,7 @@ TEST(TestSparseCSRIndex, Make) { ASSERT_EQ(indptr_data->data(), si->indptr()->raw_data()); ASSERT_EQ(indices_shape, si->indices()->shape()); ASSERT_EQ(indices_data->data(), si->indices()->raw_data()); + ASSERT_EQ(std::string("SparseCSRIndex"), si->ToString()); // Non-integer type auto res = SparseCSRIndex::Make(float32(), indptr_shape, indices_shape, indptr_data, @@ -98,12 +99,43 @@ TEST(TestSparseCSRIndex, Make) { ASSERT_RAISES(Invalid, res); // Non-vector indptr shape - res = SparseCSRIndex::Make(int32(), {1, 2}, indices_shape, indptr_data, indices_data); - ASSERT_RAISES(Invalid, res); + ASSERT_RAISES(Invalid, SparseCSRIndex::Make(int32(), {1, 2}, indices_shape, indptr_data, + indices_data)); // Non-vector indices shape - res = SparseCSRIndex::Make(int32(), indptr_shape, {1, 2}, indptr_data, indices_data); - ASSERT_RAISES(Invalid, res); + ASSERT_RAISES(Invalid, SparseCSRIndex::Make(int32(), indptr_shape, {1, 2}, indptr_data, + indices_data)); +} + +TEST(TestSparseCSCIndex, Make) { + std::vector indptr_values = {0, 2, 4, 6, 8, 10, 12}; + std::vector indices_values = {0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}; + auto indptr_data = Buffer::Wrap(indptr_values); + auto indices_data = Buffer::Wrap(indices_values); + std::vector indptr_shape = {7}; + std::vector indices_shape = {12}; + + // OK + std::shared_ptr si; + ASSERT_OK_AND_ASSIGN(si, SparseCSCIndex::Make(int32(), indptr_shape, indices_shape, + indptr_data, indices_data)); + ASSERT_EQ(indptr_shape, si->indptr()->shape()); + ASSERT_EQ(indptr_data->data(), si->indptr()->raw_data()); + ASSERT_EQ(indices_shape, si->indices()->shape()); + ASSERT_EQ(indices_data->data(), si->indices()->raw_data()); + ASSERT_EQ(std::string("SparseCSCIndex"), si->ToString()); + + // Non-integer type + ASSERT_RAISES(Invalid, SparseCSCIndex::Make(float32(), indptr_shape, indices_shape, + indptr_data, indices_data)); + + // Non-vector indptr shape + ASSERT_RAISES(Invalid, SparseCSCIndex::Make(int32(), {1, 2}, indices_shape, indptr_data, + indices_data)); + + // Non-vector indices shape + ASSERT_RAISES(Invalid, SparseCSCIndex::Make(int32(), indptr_shape, {1, 2}, indptr_data, + indices_data)); } template @@ -712,4 +744,163 @@ INSTANTIATE_TYPED_TEST_CASE_P(TestInt64, TestSparseCSRMatrixForIndexValueType, I INSTANTIATE_TYPED_TEST_CASE_P(TestUInt64, TestSparseCSRMatrixForIndexValueType, UInt64Type); +template +class TestSparseCSCMatrixBase : public ::testing::Test { + public: + void SetUp() { + shape_ = {6, 4}; + dim_names_ = {"foo", "bar"}; + + // Dense representation: + // [ + // 1 0 2 0 + // 0 3 0 4 + // 5 0 6 0 + // 0 11 0 12 + // 13 0 14 0 + // 0 15 0 16 + // ] + std::vector dense_values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + auto dense_data = Buffer::Wrap(dense_values); + NumericTensor dense_tensor(dense_data, shape_, {}, dim_names_); + ASSERT_OK_AND_ASSIGN(sparse_tensor_from_dense_, + SparseCSCMatrix::Make( + dense_tensor, TypeTraits::type_singleton())); + } + + protected: + std::vector shape_; + std::vector dim_names_; + std::shared_ptr sparse_tensor_from_dense_; +}; + +class TestSparseCSCMatrix : public TestSparseCSCMatrixBase {}; + +TEST_F(TestSparseCSCMatrix, CreationFromNumericTensor2D) { + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::shared_ptr buffer = Buffer::Wrap(values); + NumericTensor tensor(buffer, this->shape_); + + std::shared_ptr st1; + ASSERT_OK_AND_ASSIGN(st1, SparseCSCMatrix::Make(tensor)); + + auto st2 = this->sparse_tensor_from_dense_; + + CheckSparseIndexFormatType(SparseTensorFormat::CSC, *st1); + + ASSERT_EQ(12, st1->non_zero_length()); + ASSERT_TRUE(st1->is_mutable()); + + ASSERT_EQ(std::vector({"foo", "bar"}), st2->dim_names()); + ASSERT_EQ("foo", st2->dim_name(0)); + ASSERT_EQ("bar", st2->dim_name(1)); + + ASSERT_EQ(std::vector({}), st1->dim_names()); + ASSERT_EQ("", st1->dim_name(0)); + ASSERT_EQ("", st1->dim_name(1)); + ASSERT_EQ("", st1->dim_name(2)); + + const int64_t* raw_data = reinterpret_cast(st1->raw_data()); + AssertNumericDataEqual(raw_data, {1, 5, 13, 3, 11, 15, 2, 6, 14, 4, 12, 16}); + + auto si = internal::checked_pointer_cast(st1->sparse_index()); + ASSERT_EQ(std::string("SparseCSCIndex"), si->ToString()); + ASSERT_EQ(1, si->indptr()->ndim()); + ASSERT_EQ(1, si->indices()->ndim()); + + const int64_t* indptr_begin = + reinterpret_cast(si->indptr()->raw_data()); + std::vector indptr_values(indptr_begin, + indptr_begin + si->indptr()->shape()[0]); + + ASSERT_EQ(5, indptr_values.size()); + ASSERT_EQ(std::vector({0, 3, 6, 9, 12}), indptr_values); + + const int64_t* indices_begin = + reinterpret_cast(si->indices()->raw_data()); + std::vector indices_values(indices_begin, + indices_begin + si->indices()->shape()[0]); + + ASSERT_EQ(12, indices_values.size()); + ASSERT_EQ(std::vector({0, 2, 4, 1, 3, 5, 0, 2, 4, 1, 3, 5}), indices_values); +} + +TEST_F(TestSparseCSCMatrix, CreationFromNonContiguousTensor) { + std::vector values = {1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 4, 0, + 5, 0, 0, 0, 6, 0, 0, 0, 0, 0, 11, 0, 0, 0, 12, 0, + 13, 0, 0, 0, 14, 0, 0, 0, 0, 0, 15, 0, 0, 0, 16, 0}; + std::vector strides = {64, 16}; + std::shared_ptr buffer = Buffer::Wrap(values); + Tensor tensor(int64(), buffer, this->shape_, strides); + + std::shared_ptr st; + ASSERT_OK_AND_ASSIGN(st, SparseCSCMatrix::Make(tensor)); + + ASSERT_EQ(12, st->non_zero_length()); + ASSERT_TRUE(st->is_mutable()); + + const int64_t* raw_data = reinterpret_cast(st->raw_data()); + AssertNumericDataEqual(raw_data, {1, 5, 13, 3, 11, 15, 2, 6, 14, 4, 12, 16}); + + auto si = internal::checked_pointer_cast(st->sparse_index()); + ASSERT_EQ(1, si->indptr()->ndim()); + ASSERT_EQ(1, si->indices()->ndim()); + + const int64_t* indptr_begin = + reinterpret_cast(si->indptr()->raw_data()); + std::vector indptr_values(indptr_begin, + indptr_begin + si->indptr()->shape()[0]); + + ASSERT_EQ(5, indptr_values.size()); + ASSERT_EQ(std::vector({0, 3, 6, 9, 12}), indptr_values); + + const int64_t* indices_begin = + reinterpret_cast(si->indices()->raw_data()); + std::vector indices_values(indices_begin, + indices_begin + si->indices()->shape()[0]); + + ASSERT_EQ(12, indices_values.size()); + ASSERT_EQ(std::vector({0, 2, 4, 1, 3, 5, 0, 2, 4, 1, 3, 5}), indices_values); + + ASSERT_TRUE(st->Equals(*this->sparse_tensor_from_dense_)); +} + +TEST_F(TestSparseCSCMatrix, TensorEquality) { + std::vector values1 = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::vector values2 = {9, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::shared_ptr buffer1 = Buffer::Wrap(values1); + std::shared_ptr buffer2 = Buffer::Wrap(values2); + NumericTensor tensor1(buffer1, this->shape_); + NumericTensor tensor2(buffer2, this->shape_); + + std::shared_ptr st1, st2; + ASSERT_OK_AND_ASSIGN(st1, SparseCSCMatrix::Make(tensor1)); + ASSERT_OK_AND_ASSIGN(st2, SparseCSCMatrix::Make(tensor2)); + + ASSERT_TRUE(st1->Equals(*this->sparse_tensor_from_dense_)); + ASSERT_FALSE(st1->Equals(*st2)); +} + +TEST_F(TestSparseCSCMatrix, TestToTensor) { + std::vector values = {1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 1, + 0, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1}; + std::vector shape({6, 4}); + std::shared_ptr buffer = Buffer::Wrap(values); + Tensor tensor(int64(), buffer, shape, {}, this->dim_names_); + + std::shared_ptr sparse_tensor; + ASSERT_OK_AND_ASSIGN(sparse_tensor, SparseCSCMatrix::Make(tensor)); + + ASSERT_EQ(7, sparse_tensor->non_zero_length()); + ASSERT_TRUE(sparse_tensor->is_mutable()); + + std::shared_ptr dense_tensor; + ASSERT_OK(sparse_tensor->ToTensor(&dense_tensor)); + ASSERT_TRUE(tensor.Equals(*dense_tensor)); +} + } // namespace arrow From 635f8ffdc2bf5dab7081267961638f78970acc9a Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Fri, 6 Dec 2019 23:45:41 +0900 Subject: [PATCH 05/15] Add IPC support --- cpp/src/arrow/ipc/metadata_internal.cc | 33 ++++++++++--- cpp/src/arrow/ipc/read_write_test.cc | 68 +++++++++++++++++++++++++- cpp/src/arrow/ipc/reader.cc | 60 +++++++++++++---------- cpp/src/arrow/ipc/writer.cc | 11 +++++ 4 files changed, 138 insertions(+), 34 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 960b029c924..f20e5ad8846 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -988,7 +988,21 @@ Status MakeSparseTensorIndexCOO(FBB& fbb, const SparseCOOIndex& sparse_index, return Status::OK(); } -Status MakeSparseMatrixIndexCSR(FBB& fbb, const SparseCSRIndex& sparse_index, +template +struct SparseMatrixCompressedAxis {}; + +template <> +struct SparseMatrixCompressedAxis { + constexpr static const auto value = flatbuf::SparseMatrixCompressedAxis_Row; +}; + +template <> +struct SparseMatrixCompressedAxis { + constexpr static const auto value = flatbuf::SparseMatrixCompressedAxis_Column; +}; + +template +Status MakeSparseMatrixIndexCSX(FBB& fbb, const SparseIndexType& sparse_index, const std::vector& buffers, flatbuf::SparseTensorIndex* fb_sparse_index_type, Offset* fb_sparse_index, size_t* num_buffers) { @@ -1012,10 +1026,11 @@ Status MakeSparseMatrixIndexCSR(FBB& fbb, const SparseCSRIndex& sparse_index, const BufferMetadata& indices_metadata = buffers[1]; flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length); - *fb_sparse_index = flatbuf::CreateSparseMatrixIndexCSX( - fbb, flatbuf::SparseMatrixCompressedAxis_Row, indptr_type_offset, - &indptr, indices_type_offset, &indices) - .Union(); + auto compressedAxis = SparseMatrixCompressedAxis::value; + *fb_sparse_index = + flatbuf::CreateSparseMatrixIndexCSX(fbb, compressedAxis, indptr_type_offset, + &indptr, indices_type_offset, &indices) + .Union(); *num_buffers = 2; return Status::OK(); } @@ -1032,11 +1047,17 @@ Status MakeSparseTensorIndex(FBB& fbb, const SparseIndex& sparse_index, break; case SparseTensorFormat::CSR: - RETURN_NOT_OK(MakeSparseMatrixIndexCSR( + RETURN_NOT_OK(MakeSparseMatrixIndexCSX( fbb, checked_cast(sparse_index), buffers, fb_sparse_index_type, fb_sparse_index, num_buffers)); break; + case SparseTensorFormat::CSC: + RETURN_NOT_OK(MakeSparseMatrixIndexCSX( + fbb, checked_cast(sparse_index), buffers, + fb_sparse_index_type, fb_sparse_index, num_buffers)); + break; + default: std::stringstream ss; ss << "Unsupporoted sparse tensor format:: " << sparse_index.ToString() diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 5c391d3b3bc..61eed0d71d6 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -1144,6 +1144,7 @@ class TestSparseTensorRoundTrip : public ::testing::Test, public IpcTestFixture void CheckSparseTensorRoundTrip(const SparseCOOTensor& sparse_tensor); void CheckSparseTensorRoundTrip(const SparseCSRMatrix& sparse_tensor); + void CheckSparseTensorRoundTrip(const SparseCSCMatrix& sparse_tensor); protected: std::shared_ptr MakeSparseCOOIndex( @@ -1195,6 +1196,7 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( std::shared_ptr result; ASSERT_OK(ReadSparseTensor(mmap_.get(), &result)); + ASSERT_EQ(SparseTensorFormat::COO, result->format_id()); const auto& resulted_sparse_index = checked_cast(*result->sparse_index()); @@ -1233,6 +1235,7 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( std::shared_ptr result; ASSERT_OK(ReadSparseTensor(mmap_.get(), &result)); + ASSERT_EQ(SparseTensorFormat::CSR, result->format_id()); const auto& resulted_sparse_index = checked_cast(*result->sparse_index()); @@ -1242,6 +1245,46 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( ASSERT_TRUE(result->Equals(sparse_tensor)); } +template +void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( + const SparseCSCMatrix& sparse_tensor) { + const auto& type = checked_cast(*sparse_tensor.type()); + const int elem_size = type.bit_width() / 8; + const int index_elem_size = sizeof(typename IndexValueType::c_type); + + int32_t metadata_length; + int64_t body_length; + + ASSERT_OK(mmap_->Seek(0)); + + ASSERT_OK( + WriteSparseTensor(sparse_tensor, mmap_.get(), &metadata_length, &body_length)); + + const auto& sparse_index = + checked_cast(*sparse_tensor.sparse_index()); + const int64_t indptr_length = + BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indptr()->size()); + const int64_t indices_length = + BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indices()->size()); + const int64_t data_length = + BitUtil::RoundUpToMultipleOf8(elem_size * sparse_tensor.non_zero_length()); + const int64_t expected_body_length = indptr_length + indices_length + data_length; + ASSERT_EQ(expected_body_length, body_length); + + ASSERT_OK(mmap_->Seek(0)); + + std::shared_ptr result; + ASSERT_OK(ReadSparseTensor(mmap_.get(), &result)); + ASSERT_EQ(SparseTensorFormat::CSC, result->format_id()); + + const auto& resulted_sparse_index = + checked_cast(*result->sparse_index()); + ASSERT_EQ(resulted_sparse_index.indptr()->data()->size(), indptr_length); + ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length); + ASSERT_EQ(result->data()->size(), data_length); + ASSERT_TRUE(result->Equals(sparse_tensor)); +} + TYPED_TEST_CASE_P(TestSparseTensorRoundTrip); TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCOOIndexRowMajor) { @@ -1360,8 +1403,31 @@ TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCSRIndex) { this->CheckSparseTensorRoundTrip(*st); } +TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCSCIndex) { + using IndexValueType = TypeParam; + + std::string path = "test-write-sparse-csc-matrix"; + constexpr int64_t kBufferSize = 1 << 20; + ASSERT_OK_AND_ASSIGN(this->mmap_, + io::MemoryMapFixture::InitMemoryMap(kBufferSize, path)); + + std::vector shape = {4, 6}; + std::vector dim_names = {"foo", "bar", "baz"}; + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + + auto data = Buffer::Wrap(values); + NumericTensor t(data, shape, {}, dim_names); + std::shared_ptr st; + ASSERT_OK_AND_ASSIGN( + st, SparseCSCMatrix::Make(t, TypeTraits::type_singleton())); + + this->CheckSparseTensorRoundTrip(*st); +} + REGISTER_TYPED_TEST_CASE_P(TestSparseTensorRoundTrip, WithSparseCOOIndexRowMajor, - WithSparseCOOIndexColumnMajor, WithSparseCSRIndex); + WithSparseCOOIndexColumnMajor, WithSparseCSRIndex, + WithSparseCSCIndex); INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestSparseTensorRoundTrip, Int8Type); INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestSparseTensorRoundTrip, UInt8Type); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index eb482e08412..39f6b360828 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -898,18 +898,20 @@ Result> ReadSparseCSXIndex( ARROW_ASSIGN_OR_RAISE(auto indices_data, file->ReadAt(indices_buffer->offset(), indices_buffer->length())); - std::vector indptr_shape({shape[0] + 1}); std::vector indices_shape({non_zero_length}); - switch (sparse_index->compressedAxis()) { - case flatbuf::SparseMatrixCompressedAxis_Row: + case flatbuf::SparseMatrixCompressedAxis_Row: { + std::vector indptr_shape({shape[0] + 1}); return std::make_shared( std::make_shared(indptr_type, indptr_data, indptr_shape), std::make_shared(indices_type, indices_data, indices_shape)); - - case flatbuf::SparseMatrixCompressedAxis_Column: - return Status::NotImplemented("CSC sparse index is not supported"); - + } + case flatbuf::SparseMatrixCompressedAxis_Column: { + std::vector indptr_shape({shape[1] + 1}); + return std::make_shared( + std::make_shared(indptr_type, indptr_data, indptr_shape), + std::make_shared(indices_type, indices_data, indices_shape)); + } default: return Status::Invalid("Invalid value of SparseMatrixCompressedAxis"); } @@ -933,6 +935,15 @@ Status MakeSparseTensorWithSparseCSRIndex( return Status::OK(); } +Status MakeSparseTensorWithSparseCSCIndex( + const std::shared_ptr& type, const std::vector& shape, + const std::vector& dim_names, + const std::shared_ptr& sparse_index, int64_t non_zero_length, + const std::shared_ptr& data, std::shared_ptr* out) { + *out = std::make_shared(sparse_index, type, data, shape, dim_names); + return Status::OK(); +} + Status ReadSparseTensorMetadata(const Buffer& metadata, std::shared_ptr* out_type, std::vector* out_shape, @@ -1041,7 +1052,6 @@ Status ReadSparseTensorPayload(const IpcPayload& payload, non_zero_length, payload.body_buffers[1], out); } - case SparseTensorFormat::CSR: { std::shared_ptr sparse_index; std::shared_ptr indptr_type; @@ -1058,7 +1068,6 @@ Status ReadSparseTensorPayload(const IpcPayload& payload, non_zero_length, payload.body_buffers[2], out); } - default: return Status::Invalid("Unsupported sparse index format"); } @@ -1084,30 +1093,27 @@ Status ReadSparseTensor(const Buffer& metadata, io::RandomAccessFile* file, std::shared_ptr sparse_index; switch (sparse_tensor_format_id) { - case SparseTensorFormat::COO: - RETURN_NOT_OK(ReadSparseCOOIndex(sparse_tensor, shape, non_zero_length, file) - .Value(&sparse_index)); + case SparseTensorFormat::COO: { + ARROW_ASSIGN_OR_RAISE( + sparse_index, ReadSparseCOOIndex(sparse_tensor, shape, non_zero_length, file)); return MakeSparseTensorWithSparseCOOIndex( type, shape, dim_names, checked_pointer_cast(sparse_index), non_zero_length, data, out); - - case SparseTensorFormat::CSR: - RETURN_NOT_OK(ReadSparseCSXIndex(sparse_tensor, shape, non_zero_length, file) - .Value(&sparse_index)); + } + case SparseTensorFormat::CSR: { + ARROW_ASSIGN_OR_RAISE( + sparse_index, ReadSparseCSXIndex(sparse_tensor, shape, non_zero_length, file)); return MakeSparseTensorWithSparseCSRIndex( type, shape, dim_names, checked_pointer_cast(sparse_index), non_zero_length, data, out); - - case SparseTensorFormat::CSC: - return Status::NotImplemented("CSC sparse index is not supported"); // TODO: - - // RETURN_NOT_OK( - // ReadSparseCSXIndex(sparse_tensor, shape, non_zero_length, - // file).Value(&sparse_index)); - // return MakeSparseTensorWithSparseCSCIndex( - // type, shape, dim_names, checked_pointer_cast(sparse_index), - // non_zero_length, data, out); - + } + case SparseTensorFormat::CSC: { + ARROW_ASSIGN_OR_RAISE( + sparse_index, ReadSparseCSXIndex(sparse_tensor, shape, non_zero_length, file)); + return MakeSparseTensorWithSparseCSCIndex( + type, shape, dim_names, checked_pointer_cast(sparse_index), + non_zero_length, data, out); + } default: return Status::Invalid("Unsupported sparse index format"); } diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 6bee1b109e0..88ba56f6a72 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -746,6 +746,11 @@ class SparseTensorSerializer { VisitSparseCSRIndex(checked_cast(sparse_index))); break; + case SparseTensorFormat::CSC: + RETURN_NOT_OK( + VisitSparseCSCIndex(checked_cast(sparse_index))); + break; + default: std::stringstream ss; ss << "Unable to convert type: " << sparse_index.ToString() << std::endl; @@ -798,6 +803,12 @@ class SparseTensorSerializer { return Status::OK(); } + Status VisitSparseCSCIndex(const SparseCSCIndex& sparse_index) { + out_->body_buffers.emplace_back(sparse_index.indptr()->data()); + out_->body_buffers.emplace_back(sparse_index.indices()->data()); + return Status::OK(); + } + IpcPayload* out_; std::vector buffer_meta_; From 89a7ddce745ea91345e9f2a32e75cfc92a64c56e Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 11 Dec 2019 09:48:58 +0900 Subject: [PATCH 06/15] Remove needless return --- cpp/src/arrow/sparse_tensor.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 21694a71706..2bb9e33a78c 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -592,7 +592,6 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t *out = std::make_shared(sparse_tensor->type(), values_buffer, sparse_tensor->shape()); return Status::OK(); - return Status::NotImplemented("CSC format is not implemented yet"); } } return Status::NotImplemented("Unsupported SparseIndex format type"); From 878a08ff8bde5f92215d22297f43f0ae3528c3ab Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 11 Dec 2019 10:03:12 +0900 Subject: [PATCH 07/15] Anonymize some functions --- cpp/src/arrow/ipc/metadata_internal.cc | 112 ++++++++++++------------- 1 file changed, 56 insertions(+), 56 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index f20e5ad8846..5ddaa5eef84 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -906,62 +906,6 @@ static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length, return Status::OK(); } -} // namespace - -Status WriteSchemaMessage(const Schema& schema, DictionaryMemo* dictionary_memo, - std::shared_ptr* out) { - FBB fbb; - flatbuffers::Offset fb_schema; - RETURN_NOT_OK(SchemaToFlatbuffer(fbb, schema, dictionary_memo, &fb_schema)); - return WriteFBMessage(fbb, flatbuf::MessageHeader_Schema, fb_schema.Union(), 0, out); -} - -Status WriteRecordBatchMessage(int64_t length, int64_t body_length, - const std::vector& nodes, - const std::vector& buffers, - std::shared_ptr* out) { - FBB fbb; - RecordBatchOffset record_batch; - RETURN_NOT_OK(MakeRecordBatch(fbb, length, body_length, nodes, buffers, &record_batch)); - return WriteFBMessage(fbb, flatbuf::MessageHeader_RecordBatch, record_batch.Union(), - body_length, out); -} - -Status WriteTensorMessage(const Tensor& tensor, int64_t buffer_start_offset, - std::shared_ptr* out) { - using TensorDimOffset = flatbuffers::Offset; - using TensorOffset = flatbuffers::Offset; - - FBB fbb; - - const auto& type = checked_cast(*tensor.type()); - const int elem_size = type.bit_width() / 8; - - flatbuf::Type fb_type_type; - Offset fb_type; - RETURN_NOT_OK(TensorTypeToFlatbuffer(fbb, *tensor.type(), &fb_type_type, &fb_type)); - - std::vector dims; - for (int i = 0; i < tensor.ndim(); ++i) { - FBString name = fbb.CreateString(tensor.dim_name(i)); - dims.push_back(flatbuf::CreateTensorDim(fbb, tensor.shape()[i], name)); - } - - auto fb_shape = fbb.CreateVector(util::MakeNonNull(dims.data()), dims.size()); - - flatbuffers::Offset> fb_strides; - fb_strides = fbb.CreateVector(util::MakeNonNull(tensor.strides().data()), - tensor.strides().size()); - int64_t body_length = tensor.size() * elem_size; - flatbuf::Buffer buffer(buffer_start_offset, body_length); - - TensorOffset fb_tensor = - flatbuf::CreateTensor(fbb, fb_type_type, fb_type, fb_shape, fb_strides, &buffer); - - return WriteFBMessage(fbb, flatbuf::MessageHeader_Tensor, fb_tensor.Union(), - body_length, out); -} - Status MakeSparseTensorIndexCOO(FBB& fbb, const SparseCOOIndex& sparse_index, const std::vector& buffers, flatbuf::SparseTensorIndex* fb_sparse_index_type, @@ -1104,6 +1048,62 @@ Status MakeSparseTensor(FBB& fbb, const SparseTensor& sparse_tensor, int64_t bod return Status::OK(); } +} // namespace + +Status WriteSchemaMessage(const Schema& schema, DictionaryMemo* dictionary_memo, + std::shared_ptr* out) { + FBB fbb; + flatbuffers::Offset fb_schema; + RETURN_NOT_OK(SchemaToFlatbuffer(fbb, schema, dictionary_memo, &fb_schema)); + return WriteFBMessage(fbb, flatbuf::MessageHeader_Schema, fb_schema.Union(), 0, out); +} + +Status WriteRecordBatchMessage(int64_t length, int64_t body_length, + const std::vector& nodes, + const std::vector& buffers, + std::shared_ptr* out) { + FBB fbb; + RecordBatchOffset record_batch; + RETURN_NOT_OK(MakeRecordBatch(fbb, length, body_length, nodes, buffers, &record_batch)); + return WriteFBMessage(fbb, flatbuf::MessageHeader_RecordBatch, record_batch.Union(), + body_length, out); +} + +Status WriteTensorMessage(const Tensor& tensor, int64_t buffer_start_offset, + std::shared_ptr* out) { + using TensorDimOffset = flatbuffers::Offset; + using TensorOffset = flatbuffers::Offset; + + FBB fbb; + + const auto& type = checked_cast(*tensor.type()); + const int elem_size = type.bit_width() / 8; + + flatbuf::Type fb_type_type; + Offset fb_type; + RETURN_NOT_OK(TensorTypeToFlatbuffer(fbb, *tensor.type(), &fb_type_type, &fb_type)); + + std::vector dims; + for (int i = 0; i < tensor.ndim(); ++i) { + FBString name = fbb.CreateString(tensor.dim_name(i)); + dims.push_back(flatbuf::CreateTensorDim(fbb, tensor.shape()[i], name)); + } + + auto fb_shape = fbb.CreateVector(util::MakeNonNull(dims.data()), dims.size()); + + flatbuffers::Offset> fb_strides; + fb_strides = fbb.CreateVector(util::MakeNonNull(tensor.strides().data()), + tensor.strides().size()); + int64_t body_length = tensor.size() * elem_size; + flatbuf::Buffer buffer(buffer_start_offset, body_length); + + TensorOffset fb_tensor = + flatbuf::CreateTensor(fbb, fb_type_type, fb_type, fb_shape, fb_strides, &buffer); + + return WriteFBMessage(fbb, flatbuf::MessageHeader_Tensor, fb_tensor.Union(), + body_length, out); +} + Status WriteSparseTensorMessage(const SparseTensor& sparse_tensor, int64_t body_length, const std::vector& buffers, std::shared_ptr* out) { From 1c3c541e09fd6cd7bbb850f84dca27c06ea38f95 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 11 Dec 2019 11:10:15 +0900 Subject: [PATCH 08/15] Update comment --- cpp/src/arrow/ipc/metadata_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h index 1f4b083fb13..5d7aba7cc4f 100644 --- a/cpp/src/arrow/ipc/metadata_internal.h +++ b/cpp/src/arrow/ipc/metadata_internal.h @@ -111,7 +111,7 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type Status GetSparseCOOIndexMetadata(const flatbuf::SparseTensorIndexCOO* sparse_index, std::shared_ptr* indices_type); -// EXPERIMENTAL: Extracting metadata of a SparseCSRIndex from the message +// EXPERIMENTAL: Extracting metadata of a SparseCSXIndex from the message Status GetSparseCSXIndexMetadata(const flatbuf::SparseMatrixIndexCSX* sparse_index, std::shared_ptr* indptr_type, std::shared_ptr* indices_type); From 25177c5d44f8b3dab90b75ae64273458717441b7 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Thu, 12 Dec 2019 11:48:05 +0900 Subject: [PATCH 09/15] Use TypeError --- cpp/src/arrow/sparse_tensor.cc | 12 ++++++------ cpp/src/arrow/sparse_tensor_test.cc | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 2bb9e33a78c..421287fa6d7 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -164,7 +164,7 @@ class SparseTensorConverter ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_TYPE_SPECIFIC_CONVERT); // LCOV_EXCL_START: The following invalid causes program failure. default: - return Status::Invalid("Unsupported SparseTensor index value type"); + return Status::TypeError("Unsupported SparseTensor index value type"); // LCOV_EXCL_STOP } } @@ -271,7 +271,7 @@ class SparseTensorConverter ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_TYPE_SPECIFIC_CONVERT); // LCOV_EXCL_START: The following invalid causes program failure. default: - return Status::Invalid("Unsupported SparseTensor index value type"); + return Status::TypeError("Unsupported SparseTensor index value type"); // LCOV_EXCL_STOP } } @@ -392,7 +392,7 @@ class SparseTensorConverter ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_TYPE_SPECIFIC_CONVERT); // LCOV_EXCL_START: The following invalid causes program failure. default: - return Status::Invalid("Unsupported SparseTensor index value type"); + return Status::TypeError("Unsupported SparseTensor index value type"); // LCOV_EXCL_STOP } } @@ -478,7 +478,7 @@ inline Status MakeSparseTensorFromTensor( ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(MAKE_SPARSE_TENSOR_FROM_TENSOR); // LCOV_EXCL_START: ignore program failure default: - return Status::Invalid("Unsupported Tensor value type"); + return Status::TypeError("Unsupported Tensor value type"); // LCOV_EXCL_STOP } } @@ -675,7 +675,7 @@ inline Status CheckSparseCOOIndexValidity(const std::shared_ptr& type, const std::vector& shape, const std::vector& strides) { if (!is_integer(type->id())) { - return Status::Invalid("Type of SparseCOOIndex indices must be integer"); + return Status::TypeError("Type of SparseCOOIndex indices must be integer"); } if (shape.size() != 2) { return Status::Invalid("SparseCOOIndex indices must be a matrix"); @@ -728,7 +728,7 @@ Status ValidateSparseCSXIndex(const std::shared_ptr& indptr_type, const std::vector& indices_shape, char const* type_name) { if (!is_integer(indptr_type->id())) { - return Status::Invalid("Type of ", type_name, " indptr must be integer"); + return Status::TypeError("Type of ", type_name, " indptr must be integer"); } if (indptr_shape.size() != 1) { return Status::Invalid(type_name, " indptr must be a vector"); diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index e78b9a395a2..63cbd50e1dd 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -63,7 +63,7 @@ TEST(TestSparseCOOIndex, Make) { // Non-integer type auto res = SparseCOOIndex::Make(float32(), shape, strides, data); - ASSERT_RAISES(Invalid, res); + ASSERT_RAISES(TypeError, res); // Non-matrix indices res = SparseCOOIndex::Make(int32(), {4, 3, 4}, strides, data); @@ -96,7 +96,7 @@ TEST(TestSparseCSRIndex, Make) { // Non-integer type auto res = SparseCSRIndex::Make(float32(), indptr_shape, indices_shape, indptr_data, indices_data); - ASSERT_RAISES(Invalid, res); + ASSERT_RAISES(TypeError, res); // Non-vector indptr shape ASSERT_RAISES(Invalid, SparseCSRIndex::Make(int32(), {1, 2}, indices_shape, indptr_data, @@ -126,8 +126,8 @@ TEST(TestSparseCSCIndex, Make) { ASSERT_EQ(std::string("SparseCSCIndex"), si->ToString()); // Non-integer type - ASSERT_RAISES(Invalid, SparseCSCIndex::Make(float32(), indptr_shape, indices_shape, - indptr_data, indices_data)); + ASSERT_RAISES(TypeError, SparseCSCIndex::Make(float32(), indptr_shape, indices_shape, + indptr_data, indices_data)); // Non-vector indptr shape ASSERT_RAISES(Invalid, SparseCSCIndex::Make(int32(), {1, 2}, indices_shape, indptr_data, From 269640f59ccc99a6a2066052d0b22db6080fe708 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Thu, 12 Dec 2019 17:21:14 +0900 Subject: [PATCH 10/15] Define format_id in subclasses --- cpp/src/arrow/sparse_tensor.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 1be6196c48e..68df85eb2f0 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -168,12 +168,6 @@ void CheckSparseCSXIndexValidity(const std::shared_ptr& indptr_type, template class SparseCSXIndex : public SparseIndexBase { public: - static constexpr SparseTensorFormat::type format_id = std::conditional< - COMPRESSED_AXIS == SparseMatrixCompressedAxis::ROW, - std::integral_constant, - std::integral_constant>::type::value; - /// \brief Make a subclass of SparseCSXIndex from raw properties static Result> Make( const std::shared_ptr& indptr_type, @@ -290,6 +284,7 @@ class ARROW_EXPORT SparseCSRIndex : public internal::SparseCSXIndex { public: + static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSR; constexpr static char const* TYPE_NAME = "SparseCSRIndex"; using SparseCSXIndex::Make; @@ -315,6 +310,7 @@ class ARROW_EXPORT SparseCSCIndex : public internal::SparseCSXIndex { public: + static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSC; constexpr static char const* TYPE_NAME = "SparseCSCIndex"; using SparseCSXIndex::Make; From 42df6b5ca449db18eb67c3ee502b739652fa442f Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Thu, 12 Dec 2019 17:30:30 +0900 Subject: [PATCH 11/15] Rename TYPE_NAME to kTYPE_NAME --- cpp/src/arrow/sparse_tensor.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 68df85eb2f0..b175809ac20 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -176,7 +176,7 @@ class SparseCSXIndex : public SparseIndexBase { std::shared_ptr indptr_data, std::shared_ptr indices_data) { ARROW_RETURN_NOT_OK(ValidateSparseCSXIndex(indptr_type, indices_type, indptr_shape, indices_shape, - SparseIndexType::TYPE_NAME)); + SparseIndexType::kTypeName)); return std::make_shared( std::make_shared(indptr_type, indptr_data, indptr_shape), std::make_shared(indices_type, indices_data, indices_shape)); @@ -221,7 +221,7 @@ class SparseCSXIndex : public SparseIndexBase { indptr_(indptr), indices_(indices) { CheckSparseCSXIndexValidity(indptr_->type(), indices_->type(), indptr_->shape(), - indices_->shape(), SparseIndexType::TYPE_NAME); + indices_->shape(), SparseIndexType::kTypeName); } /// \brief Return a 1D tensor of indptr vector @@ -232,7 +232,7 @@ class SparseCSXIndex : public SparseIndexBase { /// \brief Return a string representation of the sparse index std::string ToString() const override { - return std::string(SparseIndexType::TYPE_NAME); + return std::string(SparseIndexType::kTypeName); } /// \brief Return whether the CSR indices are equal @@ -285,7 +285,7 @@ class ARROW_EXPORT SparseCSRIndex internal::SparseMatrixCompressedAxis::ROW> { public: static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSR; - constexpr static char const* TYPE_NAME = "SparseCSRIndex"; + static constexpr char const* kTypeName = "SparseCSRIndex"; using SparseCSXIndex::Make; using SparseCSXIndex::SparseCSXIndex; @@ -311,7 +311,7 @@ class ARROW_EXPORT SparseCSCIndex internal::SparseMatrixCompressedAxis::COLUMN> { public: static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSC; - constexpr static char const* TYPE_NAME = "SparseCSCIndex"; + static constexpr char const* kTypeName = "SparseCSCIndex"; using SparseCSXIndex::Make; using SparseCSXIndex::SparseCSXIndex; From 25edf46208fc876ccd3830c6a122626768621c6c Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Thu, 12 Dec 2019 18:31:22 +0900 Subject: [PATCH 12/15] Combine CheckSparseTensorRoundTrip for CSR and CSC --- cpp/src/arrow/ipc/read_write_test.cc | 210 +++++++++++---------------- cpp/src/arrow/sparse_tensor.h | 11 ++ 2 files changed, 97 insertions(+), 124 deletions(-) diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 61eed0d71d6..30651fd828a 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -1142,9 +1142,88 @@ class TestSparseTensorRoundTrip : public ::testing::Test, public IpcTestFixture void SetUp() { IpcTestFixture::SetUp(); } void TearDown() { IpcTestFixture::TearDown(); } - void CheckSparseTensorRoundTrip(const SparseCOOTensor& sparse_tensor); - void CheckSparseTensorRoundTrip(const SparseCSRMatrix& sparse_tensor); - void CheckSparseTensorRoundTrip(const SparseCSCMatrix& sparse_tensor); + void CheckSparseCOOTensorRoundTrip(const SparseCOOTensor& sparse_tensor) { + const auto& type = checked_cast(*sparse_tensor.type()); + const int elem_size = type.bit_width() / 8; + const int index_elem_size = sizeof(typename IndexValueType::c_type); + + int32_t metadata_length; + int64_t body_length; + + ASSERT_OK(mmap_->Seek(0)); + + ASSERT_OK( + WriteSparseTensor(sparse_tensor, mmap_.get(), &metadata_length, &body_length)); + + const auto& sparse_index = + checked_cast(*sparse_tensor.sparse_index()); + const int64_t indices_length = + BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indices()->size()); + const int64_t data_length = + BitUtil::RoundUpToMultipleOf8(elem_size * sparse_tensor.non_zero_length()); + const int64_t expected_body_length = indices_length + data_length; + ASSERT_EQ(expected_body_length, body_length); + + ASSERT_OK(mmap_->Seek(0)); + + std::shared_ptr result; + ASSERT_OK(ReadSparseTensor(mmap_.get(), &result)); + ASSERT_EQ(SparseTensorFormat::COO, result->format_id()); + + const auto& resulted_sparse_index = + checked_cast(*result->sparse_index()); + ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length); + ASSERT_EQ(result->data()->size(), data_length); + ASSERT_TRUE(result->Equals(sparse_tensor)); + } + + template + void CheckSparseCSXMatrixRoundTrip( + const SparseTensorImpl& sparse_tensor) { + static_assert(std::is_same::value || + std::is_same::value, + "SparseIndexType must be either SparseCSRIndex or SparseCSCIndex"); + + const auto& type = checked_cast(*sparse_tensor.type()); + const int elem_size = type.bit_width() / 8; + const int index_elem_size = sizeof(typename IndexValueType::c_type); + + int32_t metadata_length; + int64_t body_length; + + ASSERT_OK(mmap_->Seek(0)); + + ASSERT_OK( + WriteSparseTensor(sparse_tensor, mmap_.get(), &metadata_length, &body_length)); + + const auto& sparse_index = + checked_cast(*sparse_tensor.sparse_index()); + const int64_t indptr_length = + BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indptr()->size()); + const int64_t indices_length = + BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indices()->size()); + const int64_t data_length = + BitUtil::RoundUpToMultipleOf8(elem_size * sparse_tensor.non_zero_length()); + const int64_t expected_body_length = indptr_length + indices_length + data_length; + ASSERT_EQ(expected_body_length, body_length); + + ASSERT_OK(mmap_->Seek(0)); + + std::shared_ptr result; + ASSERT_OK(ReadSparseTensor(mmap_.get(), &result)); + + constexpr auto expected_format_id = + std::is_same::value ? SparseTensorFormat::CSR + : SparseTensorFormat::CSC; + ASSERT_EQ(expected_format_id, result->format_id()); + + const auto& resulted_sparse_index = + checked_cast(*result->sparse_index()); + ASSERT_EQ(resulted_sparse_index.indptr()->data()->size(), indptr_length); + ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length); + ASSERT_EQ(result->data()->size(), data_length); + ASSERT_TRUE(result->Equals(sparse_tensor)); + } protected: std::shared_ptr MakeSparseCOOIndex( @@ -1168,123 +1247,6 @@ class TestSparseTensorRoundTrip : public ::testing::Test, public IpcTestFixture } }; -template -void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( - const SparseCOOTensor& sparse_tensor) { - const auto& type = checked_cast(*sparse_tensor.type()); - const int elem_size = type.bit_width() / 8; - const int index_elem_size = sizeof(typename IndexValueType::c_type); - - int32_t metadata_length; - int64_t body_length; - - ASSERT_OK(mmap_->Seek(0)); - - ASSERT_OK( - WriteSparseTensor(sparse_tensor, mmap_.get(), &metadata_length, &body_length)); - - const auto& sparse_index = - checked_cast(*sparse_tensor.sparse_index()); - const int64_t indices_length = - BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indices()->size()); - const int64_t data_length = - BitUtil::RoundUpToMultipleOf8(elem_size * sparse_tensor.non_zero_length()); - const int64_t expected_body_length = indices_length + data_length; - ASSERT_EQ(expected_body_length, body_length); - - ASSERT_OK(mmap_->Seek(0)); - - std::shared_ptr result; - ASSERT_OK(ReadSparseTensor(mmap_.get(), &result)); - ASSERT_EQ(SparseTensorFormat::COO, result->format_id()); - - const auto& resulted_sparse_index = - checked_cast(*result->sparse_index()); - ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length); - ASSERT_EQ(result->data()->size(), data_length); - ASSERT_TRUE(result->Equals(sparse_tensor)); -} - -template -void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( - const SparseCSRMatrix& sparse_tensor) { - const auto& type = checked_cast(*sparse_tensor.type()); - const int elem_size = type.bit_width() / 8; - const int index_elem_size = sizeof(typename IndexValueType::c_type); - - int32_t metadata_length; - int64_t body_length; - - ASSERT_OK(mmap_->Seek(0)); - - ASSERT_OK( - WriteSparseTensor(sparse_tensor, mmap_.get(), &metadata_length, &body_length)); - - const auto& sparse_index = - checked_cast(*sparse_tensor.sparse_index()); - const int64_t indptr_length = - BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indptr()->size()); - const int64_t indices_length = - BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indices()->size()); - const int64_t data_length = - BitUtil::RoundUpToMultipleOf8(elem_size * sparse_tensor.non_zero_length()); - const int64_t expected_body_length = indptr_length + indices_length + data_length; - ASSERT_EQ(expected_body_length, body_length); - - ASSERT_OK(mmap_->Seek(0)); - - std::shared_ptr result; - ASSERT_OK(ReadSparseTensor(mmap_.get(), &result)); - ASSERT_EQ(SparseTensorFormat::CSR, result->format_id()); - - const auto& resulted_sparse_index = - checked_cast(*result->sparse_index()); - ASSERT_EQ(resulted_sparse_index.indptr()->data()->size(), indptr_length); - ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length); - ASSERT_EQ(result->data()->size(), data_length); - ASSERT_TRUE(result->Equals(sparse_tensor)); -} - -template -void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( - const SparseCSCMatrix& sparse_tensor) { - const auto& type = checked_cast(*sparse_tensor.type()); - const int elem_size = type.bit_width() / 8; - const int index_elem_size = sizeof(typename IndexValueType::c_type); - - int32_t metadata_length; - int64_t body_length; - - ASSERT_OK(mmap_->Seek(0)); - - ASSERT_OK( - WriteSparseTensor(sparse_tensor, mmap_.get(), &metadata_length, &body_length)); - - const auto& sparse_index = - checked_cast(*sparse_tensor.sparse_index()); - const int64_t indptr_length = - BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indptr()->size()); - const int64_t indices_length = - BitUtil::RoundUpToMultipleOf8(index_elem_size * sparse_index.indices()->size()); - const int64_t data_length = - BitUtil::RoundUpToMultipleOf8(elem_size * sparse_tensor.non_zero_length()); - const int64_t expected_body_length = indptr_length + indices_length + data_length; - ASSERT_EQ(expected_body_length, body_length); - - ASSERT_OK(mmap_->Seek(0)); - - std::shared_ptr result; - ASSERT_OK(ReadSparseTensor(mmap_.get(), &result)); - ASSERT_EQ(SparseTensorFormat::CSC, result->format_id()); - - const auto& resulted_sparse_index = - checked_cast(*result->sparse_index()); - ASSERT_EQ(resulted_sparse_index.indptr()->data()->size(), indptr_length); - ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length); - ASSERT_EQ(result->data()->size(), data_length); - ASSERT_TRUE(result->Equals(sparse_tensor)); -} - TYPED_TEST_CASE_P(TestSparseTensorRoundTrip); TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCOOIndexRowMajor) { @@ -1331,7 +1293,7 @@ TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCOOIndexRowMajor) { std::vector values = {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}; auto st = this->MakeSparseCOOTensor(si, values, shape, dim_names); - this->CheckSparseTensorRoundTrip(*st); + this->CheckSparseCOOTensorRoundTrip(*st); } TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCOOIndexColumnMajor) { @@ -1378,7 +1340,7 @@ TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCOOIndexColumnMajor) { std::vector values = {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}; auto st = this->MakeSparseCOOTensor(si, values, shape, dim_names); - this->CheckSparseTensorRoundTrip(*st); + this->CheckSparseCOOTensorRoundTrip(*st); } TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCSRIndex) { @@ -1400,7 +1362,7 @@ TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCSRIndex) { ASSERT_OK_AND_ASSIGN( st, SparseCSRMatrix::Make(t, TypeTraits::type_singleton())); - this->CheckSparseTensorRoundTrip(*st); + this->CheckSparseCSXMatrixRoundTrip(*st); } TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCSCIndex) { @@ -1422,7 +1384,7 @@ TYPED_TEST_P(TestSparseTensorRoundTrip, WithSparseCSCIndex) { ASSERT_OK_AND_ASSIGN( st, SparseCSCMatrix::Make(t, TypeTraits::type_singleton())); - this->CheckSparseTensorRoundTrip(*st); + this->CheckSparseCSXMatrixRoundTrip(*st); } REGISTER_TYPED_TEST_CASE_P(TestSparseTensorRoundTrip, WithSparseCOOIndexRowMajor, diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index b175809ac20..b525c632fd2 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -168,6 +168,8 @@ void CheckSparseCSXIndexValidity(const std::shared_ptr& indptr_type, template class SparseCSXIndex : public SparseIndexBase { public: + static constexpr SparseMatrixCompressedAxis::type kCompressedAxis = COMPRESSED_AXIS; + /// \brief Make a subclass of SparseCSXIndex from raw properties static Result> Make( const std::shared_ptr& indptr_type, @@ -284,9 +286,13 @@ class ARROW_EXPORT SparseCSRIndex : public internal::SparseCSXIndex { public: + using BaseClass = + internal::SparseCSXIndex; + static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSR; static constexpr char const* kTypeName = "SparseCSRIndex"; + using SparseCSXIndex::kCompressedAxis; using SparseCSXIndex::Make; using SparseCSXIndex::SparseCSXIndex; }; @@ -310,9 +316,14 @@ class ARROW_EXPORT SparseCSCIndex : public internal::SparseCSXIndex { public: + using BaseClass = + internal::SparseCSXIndex; + static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSC; static constexpr char const* kTypeName = "SparseCSCIndex"; + using SparseCSXIndex::kCompressedAxis; using SparseCSXIndex::Make; using SparseCSXIndex::SparseCSXIndex; }; From 7f1d7f8e33a478730629b9efc43f0bc693197544 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Fri, 13 Dec 2019 15:02:34 +0900 Subject: [PATCH 13/15] Fix comments --- cpp/src/arrow/sparse_tensor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index b525c632fd2..989d51a3b56 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -184,7 +184,7 @@ class SparseCSXIndex : public SparseIndexBase { std::make_shared(indices_type, indices_data, indices_shape)); } - /// \brief Make a subclass of SparseCSRIndex from raw properties + /// \brief Make a subclass of SparseCSXIndex from raw properties static Result> Make( const std::shared_ptr& indices_type, const std::vector& indptr_shape, const std::vector& indices_shape, @@ -216,7 +216,7 @@ class SparseCSXIndex : public SparseIndexBase { indices_data); } - /// \brief Construct SparseCSRIndex from two index vectors + /// \brief Construct SparseCSXIndex from two index vectors explicit SparseCSXIndex(const std::shared_ptr& indptr, const std::shared_ptr& indices) : SparseIndexBase(indices->shape()[0]), From 28307dbdfa66d695d51159a37a38b0f3f8ac1583 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Fri, 13 Dec 2019 16:09:39 +0900 Subject: [PATCH 14/15] Add shape validation in ReadSparseCSXIndex --- cpp/src/arrow/ipc/reader.cc | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 39f6b360828..f941116051e 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -884,6 +884,10 @@ Result> ReadSparseCOOIndex( Result> ReadSparseCSXIndex( const flatbuf::SparseTensor* sparse_tensor, const std::vector& shape, int64_t non_zero_length, io::RandomAccessFile* file) { + if (shape.size() != 2) { + return Status::Invalid("Invalid shape length for a sparse matrix"); + } + auto* sparse_index = sparse_tensor->sparseIndex_as_SparseMatrixIndexCSX(); std::shared_ptr indptr_type, indices_type; @@ -899,15 +903,34 @@ Result> ReadSparseCSXIndex( file->ReadAt(indices_buffer->offset(), indices_buffer->length())); std::vector indices_shape({non_zero_length}); + const auto indices_minimum_bytes = + indices_shape[0] * checked_pointer_cast(indices_type)->bit_width() / + CHAR_BIT; + if (indices_minimum_bytes > indices_buffer->length()) { + return Status::Invalid("shape is inconsistent to the size of indices buffer"); + } + switch (sparse_index->compressedAxis()) { case flatbuf::SparseMatrixCompressedAxis_Row: { std::vector indptr_shape({shape[0] + 1}); + const int64_t indptr_minimum_bytes = + indptr_shape[0] * + checked_pointer_cast(indptr_type)->bit_width() / CHAR_BIT; + if (indptr_minimum_bytes > indptr_buffer->length()) { + return Status::Invalid("shape is inconsistent to the size of indptr buffer"); + } return std::make_shared( std::make_shared(indptr_type, indptr_data, indptr_shape), std::make_shared(indices_type, indices_data, indices_shape)); } case flatbuf::SparseMatrixCompressedAxis_Column: { std::vector indptr_shape({shape[1] + 1}); + const int64_t indptr_minimum_bytes = + indptr_shape[0] * + checked_pointer_cast(indptr_type)->bit_width() / CHAR_BIT; + if (indptr_minimum_bytes > indptr_buffer->length()) { + return Status::Invalid("shape is inconsistent to the size of indptr buffer"); + } return std::make_shared( std::make_shared(indptr_type, indptr_data, indptr_shape), std::make_shared(indices_type, indices_data, indices_shape)); From 013f6381e69aabccdc32de54f128a9cb60af89c3 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Fri, 13 Dec 2019 17:00:43 +0900 Subject: [PATCH 15/15] Use enum class for SparseMatrixCompressedAxis --- cpp/src/arrow/sparse_tensor.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 989d51a3b56..fe0d192da26 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -141,14 +141,12 @@ class ARROW_EXPORT SparseCOOIndex : public internal::SparseIndexBase& indptr_type, const std::vector& indices_shape, char const* type_name); -template +template class SparseCSXIndex : public SparseIndexBase { public: - static constexpr SparseMatrixCompressedAxis::type kCompressedAxis = COMPRESSED_AXIS; + static constexpr SparseMatrixCompressedAxis kCompressedAxis = COMPRESSED_AXIS; /// \brief Make a subclass of SparseCSXIndex from raw properties static Result> Make(