From c3bc6edfa553a7ef6ea8332a77d5f49b1ed4fc8f Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Tue, 11 Sep 2018 11:51:49 +0900 Subject: [PATCH 01/40] Add tentative SparseTensor format --- format/Message.fbs | 5 ++-- format/SparseTensor.fbs | 52 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 format/SparseTensor.fbs diff --git a/format/Message.fbs b/format/Message.fbs index 830718139d8..d7dcd7647fd 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -17,6 +17,7 @@ include "Schema.fbs"; include "Tensor.fbs"; +include "SparseTensor.fbs"; namespace org.apache.arrow.flatbuf; @@ -87,7 +88,7 @@ table DictionaryBatch { /// which may include experimental metadata types. For maximum compatibility, /// it is best to send data using RecordBatch union MessageHeader { - Schema, DictionaryBatch, RecordBatch, Tensor + Schema, DictionaryBatch, RecordBatch, Tensor, SparseTensor } table Message { @@ -96,4 +97,4 @@ table Message { bodyLength: long; } -root_type Message; \ No newline at end of file +root_type Message; diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs new file mode 100644 index 00000000000..66ec098f41e --- /dev/null +++ b/format/SparseTensor.fbs @@ -0,0 +1,52 @@ +include "Tensor.fbs" + +namespace org.apache.arrow.flatbuf; + +/// Coordinate format. +table SparseTensorIndexCOO { + /// COO's index list are represented as a NxM matrix, + /// where N is the number of non-zero values, + /// and M is the number of dimensions of a sparse tensor. + /// indicesBuffer stores the location and size of this index matrix. + /// The type of index value is long, so the stride for the index matrix is unnecessary. + indicesBuffer: Buffer +}; + +/// Compressed Sparse Row format, that is matrix-specific. +table SparseMatrixIndexCSR { + /// This array represents the range of the rows. + /// The ith row spans from indptr[i] to indptr[i+1] in the data. + /// The length of this array is 1 + (the number of rows). 
+ indptr: [long] + + /// indicesBuffer stores the location and size of the array that + /// contains the column indices of the corresponding non-zero values. + /// The type of index value is long. + indicesBuffer: Buffer +}; + +union SparseTensorIndex { + SparseTensorIndexCOO, + SparseMatrixIndexCSR +}; + +table SparseTensor { + /// The type of data contained in a value cell. + /// Currently only fixed-width value types are supported, + /// no strings or nested types. + type: Type; + + /// The dimensions of the tensor, optionally named. + shape: [TensorDim]; + + /// The number of non-zero values in a sparse tensor. + length: long + + /// Sparse tensor index + sparseIndex: SparseTensorIndex; + + /// The location and size of the tensor's data + data: Buffer; +} + +root_type SparseTensor; From 1f16ffed817ac9fdcf598af81499e563c2db6d02 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 12 Sep 2018 09:22:36 +0900 Subject: [PATCH 02/40] Fix syntax error in SparseTensor.fbs --- format/SparseTensor.fbs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index 66ec098f41e..e4b1a777888 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -1,4 +1,4 @@ -include "Tensor.fbs" +include "Tensor.fbs"; namespace org.apache.arrow.flatbuf; @@ -9,26 +9,26 @@ table SparseTensorIndexCOO { /// and M is the number of dimensions of a sparse tensor. /// indicesBuffer stores the location and size of this index matrix. /// The type of index value is long, so the stride for the index matrix is unnecessary. - indicesBuffer: Buffer -}; + indicesBuffer: Buffer; +} /// Compressed Sparse Row format, that is matrix-specific. table SparseMatrixIndexCSR { /// This array represents the range of the rows. /// The ith row spans from indptr[i] to indptr[i+1] in the data. /// The length of this array is 1 + (the number of rows). 
- indptr: [long] + indptr: [long]; /// indicesBuffer stores the location and size of the array that /// contains the column indices of the corresponding non-zero values. /// The type of index value is long. - indicesBuffer: Buffer -}; + indicesBuffer: Buffer; +} union SparseTensorIndex { SparseTensorIndexCOO, SparseMatrixIndexCSR -}; +} table SparseTensor { /// The type of data contained in a value cell. @@ -40,7 +40,7 @@ table SparseTensor { shape: [TensorDim]; /// The number of non-zero values in a sparse tensor. - length: long + length: long; /// Sparse tensor index sparseIndex: SparseTensorIndex; From aa9b8a4d08e5a1e41a54643efc8ab26cb51f07a2 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 12 Sep 2018 09:36:02 +0900 Subject: [PATCH 03/40] Add SparseTensor.fbs in FBS_SRC --- cpp/src/arrow/ipc/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 422e72e2eda..07e333b6edd 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -64,6 +64,7 @@ set(FBS_SRC ${CMAKE_SOURCE_DIR}/../format/File.fbs ${CMAKE_SOURCE_DIR}/../format/Schema.fbs ${CMAKE_SOURCE_DIR}/../format/Tensor.fbs + ${CMAKE_SOURCE_DIR}/../format/SparseTensor.fbs ${CMAKE_CURRENT_SOURCE_DIR}/feather.fbs) foreach(FIL ${FBS_SRC}) From 866b2c13ae6967c87a5db719ec0c829453a879c5 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 12 Sep 2018 09:58:31 +0900 Subject: [PATCH 04/40] Add header comments in SparseTensor.fbs --- format/SparseTensor.fbs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index e4b1a777888..af2b68558ef 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -1,3 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// EXPERIMENTAL: Metadata for n-dimensional sparse tensors, that contains +/// only non-zero values. Arrow implementations in general are not required +/// to implement this type + include "Tensor.fbs"; namespace org.apache.arrow.flatbuf; From d7e653f174bce7d622856de821a4d33a4a634ccb Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Mon, 17 Sep 2018 11:07:42 +0900 Subject: [PATCH 05/40] Add an example of COO format in comment --- format/SparseTensor.fbs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index af2b68558ef..3e57126f389 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -30,6 +30,26 @@ table SparseTensorIndexCOO { /// and M is the number of dimensions of a sparse tensor. /// indicesBuffer stores the location and size of this index matrix. /// The type of index value is long, so the stride for the index matrix is unnecessary. 
+ /// + /// For example, let X be a 2x3x4x5 tensor, and it has the following 6 non-zero values: + /// + /// X[0, 1, 2, 0] := 1 + /// X[1, 1, 2, 3] := 2 + /// X[0, 2, 1, 0] := 3 + /// X[0, 1, 3, 0] := 4 + /// X[0, 1, 2, 1] := 5 + /// X[1, 2, 0, 4] := 6 + /// + /// In COO format, the index matrix of X is the following 10x4 matrix: + /// + /// [[0, 1, 2, 0], + /// [0, 1, 2, 1], + /// [0, 1, 3, 0], + /// [0, 2, 1, 0], + /// [1, 1, 2, 3], + /// [1, 2, 0, 4]] + /// + /// Note that the indices are sorted in lexcographical order. indicesBuffer: Buffer; } From 76c56dd351632d4f77bce36ecfa25a8105aac3ed Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Mon, 17 Sep 2018 11:24:15 +0900 Subject: [PATCH 06/40] Make indptr of CSR a buffer --- format/SparseTensor.fbs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index 3e57126f389..670261999e4 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -55,10 +55,12 @@ table SparseTensorIndexCOO { /// Compressed Sparse Row format, that is matrix-specific. table SparseMatrixIndexCSR { - /// This array represents the range of the rows. - /// The ith row spans from indptr[i] to indptr[i+1] in the data. - /// The length of this array is 1 + (the number of rows). - indptr: [long]; + /// indptrBuffer stores the location and size of indptr array that + /// represents the range of the rows. + /// The i-th row spans from indptr[i] to indptr[i+1] in the data. + /// The length of this array is 1 + (the number of rows), and the type + /// of index value is long. + indptrBuffer: Buffer; /// indicesBuffer stores the location and size of the array that /// contains the column indices of the corresponding non-zero values. 
From 2b50040f5ce3088fc1e2d4768982831f315c1a3a Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Mon, 17 Sep 2018 11:38:35 +0900 Subject: [PATCH 07/40] Add an example of the CSR format in comment --- format/SparseTensor.fbs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index 670261999e4..59aa6aaad6e 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -60,11 +60,32 @@ table SparseMatrixIndexCSR { /// The i-th row spans from indptr[i] to indptr[i+1] in the data. /// The length of this array is 1 + (the number of rows), and the type /// of index value is long. + /// + /// For example, let X be the following 6x4 matrix: + /// + /// X := [[0, 1, 2, 0], + /// [0, 0, 3, 0], + /// [0, 4, 0, 5], + /// [0, 0, 0, 0], + /// [6, 0, 7, 8], + /// [0, 9, 0, 0]]. + /// + /// The array of non-zero values in X is: + /// + /// values(X) = [1, 2, 3, 4, 5, 6, 7, 8, 9]. + /// + /// And the indptr of X is: + /// + /// indptr(X) = [0, 2, 3, 5, 5, 8, 9]. indptrBuffer: Buffer; /// indicesBuffer stores the location and size of the array that /// contains the column indices of the corresponding non-zero values. /// The type of index value is long. + /// + /// For example, the indices of the above X is: + /// + /// indices(X) = [1, 2, 2, 1, 3, 0, 2, 3, 1]. 
indicesBuffer: Buffer; } From c508db086c36f7b33075c302a4a13ab4028ca63c Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Mon, 24 Sep 2018 13:48:18 +0900 Subject: [PATCH 08/40] Write sparse tensor format in IPC.md --- docs/source/format/IPC.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/docs/source/format/IPC.rst b/docs/source/format/IPC.rst index 8cb74b87afc..43812f2895d 100644 --- a/docs/source/format/IPC.rst +++ b/docs/source/format/IPC.rst @@ -234,4 +234,27 @@ region) to be multiples of 64 bytes: :: +SparseTensor Message Format +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``SparseTensor`` message type provides another way to write a +multidimensional array of fixed-size values using Arrow's shared memory tools +in addition to ``Tensor``. ``SparseTensor`` is designed specifically for tensors +whose elements are mostly zeros. Arrow implementations in general are not +required to implement this data format, as is also the case for ``Tensor``. + +When writing a standalone encapsulated sparse tensor message, we use the format as +indicated above, but additionally align the starting offset of the metadata as +well as the starting offsets of the sparse index and the sparse tensor body +(if writing to a shared memory region) to be multiples of 64 bytes: + + + + + + + +The contents of the sparse tensor index depend on which kind of sparse +format is used. + .. 
_Flatbuffer: https://github.com/google/flatbuffers From b24f3c34292a249f3d66bc16dd6ec668bc31a13a Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Mon, 24 Sep 2018 14:24:06 +0900 Subject: [PATCH 09/40] Insert additional padding in sparse tensor format --- docs/source/format/IPC.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/format/IPC.rst b/docs/source/format/IPC.rst index 43812f2895d..62a1237436a 100644 --- a/docs/source/format/IPC.rst +++ b/docs/source/format/IPC.rst @@ -252,6 +252,7 @@ well as the starting offsets of the sparse index and the sparse tensor body + The contents of the sparse tensor index depend on which kind of sparse From 392a25b7ceae5aa4cfd4477743f7b22d4564a8ed Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Mon, 1 Oct 2018 16:26:43 +0900 Subject: [PATCH 10/40] Implement SparseTensor and SparseCOOIndex --- cpp/src/arrow/CMakeLists.txt | 2 + cpp/src/arrow/sparse_tensor-test.cc | 179 ++++++++++++++++ cpp/src/arrow/sparse_tensor.cc | 321 ++++++++++++++++++++++++++++ cpp/src/arrow/sparse_tensor.h | 121 +++++++++++ cpp/src/arrow/tensor.h | 6 + 5 files changed, 629 insertions(+) create mode 100644 cpp/src/arrow/sparse_tensor-test.cc create mode 100644 cpp/src/arrow/sparse_tensor.cc create mode 100644 cpp/src/arrow/sparse_tensor.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index f2a81124728..91bdce294c2 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -83,6 +83,7 @@ set(ARROW_SRCS table.cc table_builder.cc tensor.cc + sparse_tensor.cc type.cc visitor.cc @@ -286,6 +287,7 @@ ADD_ARROW_TEST(type-test) ADD_ARROW_TEST(table-test) ADD_ARROW_TEST(table_builder-test) ADD_ARROW_TEST(tensor-test) +ADD_ARROW_TEST(sparse_tensor-test) ADD_ARROW_BENCHMARK(builder-benchmark) ADD_ARROW_BENCHMARK(column-benchmark) diff --git a/cpp/src/arrow/sparse_tensor-test.cc b/cpp/src/arrow/sparse_tensor-test.cc new file mode 100644 index 00000000000..63ef2d11e77 --- /dev/null +++ 
b/cpp/src/arrow/sparse_tensor-test.cc @@ -0,0 +1,179 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Unit tests for DataType (and subclasses), Field, and Schema + +#include +#include +#include +#include + +#include + +#include + +#include "arrow/sparse_tensor.h" +#include "arrow/test-util.h" +#include "arrow/type.h" + +namespace arrow { + +TEST(TestSparseCOOTensor, CreationEmptyTensor) { + std::vector shape = {2, 3, 4}; + SparseTensor st1(int64(), shape); + + std::vector dim_names = {"foo", "bar", "baz"}; + SparseTensor st2(int64(), shape, dim_names); + + ASSERT_EQ(0, st1.length()); + ASSERT_EQ(0, st2.length()); + + ASSERT_EQ(24, st1.size()); + ASSERT_EQ(24, st2.size()); + + ASSERT_EQ("foo", st2.dim_name(0)); + ASSERT_EQ("bar", st2.dim_name(1)); + ASSERT_EQ("baz", st2.dim_name(2)); + + ASSERT_EQ("", st1.dim_name(0)); + ASSERT_EQ("", st1.dim_name(1)); + ASSERT_EQ("", st1.dim_name(2)); +} + +TEST(TestSparseCOOTensor, CreationFromNumericTensor) { + std::vector shape = {2, 3, 4}; + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::shared_ptr buffer(Buffer::Wrap(values)); + std::vector dim_names = {"foo", "bar", "baz"}; + NumericTensor tensor1(buffer, 
shape); + NumericTensor tensor2(buffer, shape, {}, dim_names); + SparseTensor st1(tensor1); + SparseTensor st2(tensor2); + + ASSERT_EQ(12, st1.length()); + ASSERT_TRUE(st1.is_mutable()); + + ASSERT_EQ("foo", st2.dim_name(0)); + ASSERT_EQ("bar", st2.dim_name(1)); + ASSERT_EQ("baz", st2.dim_name(2)); + + ASSERT_EQ("", st1.dim_name(0)); + ASSERT_EQ("", st1.dim_name(1)); + ASSERT_EQ("", st1.dim_name(2)); + + const int64_t* ptr = reinterpret_cast(st1.raw_data()); + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(i + 1, ptr[i]); + } + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(i + 11, ptr[i + 6]); + } + + std::shared_ptr si = st1.sparse_index(); + std::shared_ptr sidx = si->indices(); + ASSERT_EQ(std::vector({12, 3}), sidx->shape()); + ASSERT_TRUE(sidx->is_column_major()); + + // (0, 0, 0) -> 1 + ASSERT_EQ(0, sidx->Value({0, 0})); + ASSERT_EQ(0, sidx->Value({0, 1})); + ASSERT_EQ(0, sidx->Value({0, 2})); + + // (0, 0, 2) -> 2 + ASSERT_EQ(0, sidx->Value({1, 0})); + ASSERT_EQ(0, sidx->Value({1, 1})); + ASSERT_EQ(2, sidx->Value({1, 2})); + + // (0, 1, 1) -> 3 + ASSERT_EQ(0, sidx->Value({2, 0})); + ASSERT_EQ(1, sidx->Value({2, 1})); + ASSERT_EQ(1, sidx->Value({2, 2})); + + // (1, 2, 1) -> 15 + ASSERT_EQ(1, sidx->Value({10, 0})); + ASSERT_EQ(2, sidx->Value({10, 1})); + ASSERT_EQ(1, sidx->Value({10, 2})); + + // (1, 2, 3) -> 16 + ASSERT_EQ(1, sidx->Value({11, 0})); + ASSERT_EQ(2, sidx->Value({11, 1})); + ASSERT_EQ(3, sidx->Value({11, 2})); +} + +TEST(TestSparseCOOTensor, CreationFromTensor) { + std::vector shape = {2, 3, 4}; + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::shared_ptr buffer(Buffer::Wrap(values)); + std::vector dim_names = {"foo", "bar", "baz"}; + Tensor tensor1(int64(), buffer, shape); + Tensor tensor2(int64(), buffer, shape, {}, dim_names); + SparseTensor st1(tensor1); + SparseTensor st2(tensor2); + + ASSERT_EQ(12, st1.length()); + ASSERT_TRUE(st1.is_mutable()); + + ASSERT_EQ("foo", st2.dim_name(0)); + 
ASSERT_EQ("bar", st2.dim_name(1)); + ASSERT_EQ("baz", st2.dim_name(2)); + + ASSERT_EQ("", st1.dim_name(0)); + ASSERT_EQ("", st1.dim_name(1)); + ASSERT_EQ("", st1.dim_name(2)); + + const int64_t* ptr = reinterpret_cast(st1.raw_data()); + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(i + 1, ptr[i]); + } + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(i + 11, ptr[i + 6]); + } + + std::shared_ptr si = st1.sparse_index(); + std::shared_ptr sidx = si->indices(); + ASSERT_EQ(std::vector({12, 3}), sidx->shape()); + ASSERT_TRUE(sidx->is_column_major()); + + // (0, 0, 0) -> 1 + ASSERT_EQ(0, sidx->Value({0, 0})); + ASSERT_EQ(0, sidx->Value({0, 1})); + ASSERT_EQ(0, sidx->Value({0, 2})); + + // (0, 0, 2) -> 2 + ASSERT_EQ(0, sidx->Value({1, 0})); + ASSERT_EQ(0, sidx->Value({1, 1})); + ASSERT_EQ(2, sidx->Value({1, 2})); + + // (0, 1, 1) -> 3 + ASSERT_EQ(0, sidx->Value({2, 0})); + ASSERT_EQ(1, sidx->Value({2, 1})); + ASSERT_EQ(1, sidx->Value({2, 2})); + + // (1, 2, 1) -> 15 + ASSERT_EQ(1, sidx->Value({10, 0})); + ASSERT_EQ(2, sidx->Value({10, 1})); + ASSERT_EQ(1, sidx->Value({10, 2})); + + // (1, 2, 3) -> 16 + ASSERT_EQ(1, sidx->Value({11, 0})); + ASSERT_EQ(2, sidx->Value({11, 1})); + ASSERT_EQ(3, sidx->Value({11, 2})); +} + +} // namespace arrow diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc new file mode 100644 index 00000000000..ba7a2e82fd5 --- /dev/null +++ b/cpp/src/arrow/sparse_tensor.cc @@ -0,0 +1,321 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/sparse_tensor.h" + +#include +#include +#include + +#include "arrow/util/logging.h" + +namespace arrow { + +namespace { + +template +struct SparseIndexTraits {}; + +template <> +struct SparseIndexTraits { + static inline const char* name() { return "SparseCOOIndex"; } +}; + +template +class SparseTensorConverter { + public: + explicit SparseTensorConverter(const NumericTensor&) {} + + Status Convert() { + std::string sparse_index_name(SparseIndexTraits::name()); + return Status::NotImplemented(sparse_index_name + + std::string(" is not supported yet.")); + } +}; + +template +class SparseTensorConverter { + public: + using NumericTensorType = NumericTensor; + using value_type = typename NumericTensorType::value_type; + + explicit SparseTensorConverter(const NumericTensor& tensor) : tensor_(tensor) {} + + Status Convert() { + const int64_t ndim = tensor_.ndim(); + const int64_t nonzero_count = static_cast(CountNonZero()); + + std::shared_ptr indices_buffer; + RETURN_NOT_OK(AllocateBuffer(sizeof(int64_t) * ndim * nonzero_count, &indices_buffer)); + int64_t* indices = reinterpret_cast(indices_buffer->mutable_data()); + + std::shared_ptr values_buffer; + RETURN_NOT_OK(AllocateBuffer(sizeof(value_type) * nonzero_count, &values_buffer)); + value_type* values = reinterpret_cast(values_buffer->mutable_data()); + + if (ndim <= 1) { + const value_type* data = reinterpret_cast(tensor_.raw_data()); + const int64_t count = ndim == 0 ? 
1 : tensor_.shape()[0]; + for (int64_t i = 0; i < count; ++i, ++data) { + if (*data != 0) { + *indices++ = i; + *values++ = *data; + } + } + } else { + const std::vector& shape = tensor_.shape(); + std::vector coord(ndim, 0); + + for (int64_t n = tensor_.size(); n > 0; n--) { + const value_type x = tensor_.Value(coord); + if (tensor_.Value(coord) != 0) { + *values++ = x; + + int64_t *indp = indices; + for (int64_t i = 0; i < ndim; ++i) { + *indp = coord[i]; + indp += nonzero_count; + } + indices++; + } + + // increment index + ++coord[ndim - 1]; + if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) { + int64_t d = ndim - 1; + while (d > 0 && coord[d] == shape[d]) { + coord[d] = 0; + ++coord[d - 1]; + --d; + } + } + } + } + + // make results + const std::vector indices_shape = {nonzero_count, ndim}; + const int64_t indices_elsize = sizeof(int64_t); + const std::vector indices_strides = {indices_elsize, indices_elsize * nonzero_count}; + sparse_index = std::make_shared( + std::make_shared(indices_buffer, + indices_shape, + indices_strides)); + data = values_buffer; + + return Status::OK(); + } + + std::shared_ptr sparse_index; + std::shared_ptr data; + + protected: + bool TensorIsTriviallyIterable() const { + return tensor_.ndim() <= 1 || tensor_.is_contiguous(); + } + + size_t CountNonZero() const { + if (tensor_.size() == 0) { + return 0; + } + + if (TensorIsTriviallyIterable()) { + const value_type* data = reinterpret_cast(tensor_.raw_data()); + return std::count_if(data, data + tensor_.size(), [](value_type x) { return x != 0; }); + } + + const std::vector& shape = tensor_.shape(); + const int64_t ndim = tensor_.ndim(); + + size_t count = 0; + std::vector coord(ndim, 0); + for (int64_t n = tensor_.size(); n > 0; n--) { + if (tensor_.Value(coord) != 0) { + ++count; + } + + // increment index + ++coord[ndim - 1]; + if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) { + int64_t d = ndim - 1; + while (d > 0 && coord[d] == shape[d]) { + coord[d] = 0; + ++coord[d - 1]; + 
--d; + } + } + } + return count; + } + + private: + const NumericTensor& tensor_; +}; + +template +void MakeSparseCOOTensorFromTensor(const Tensor& tensor, + std::shared_ptr* sparse_index, + std::shared_ptr* data) { + NumericTensor numeric_tensor(tensor.data(), tensor.shape(), tensor.strides()); + SparseTensorConverter converter(numeric_tensor); + DCHECK_OK(converter.Convert()); + *sparse_index = converter.sparse_index; + *data = converter.data; +} + +// ---------------------------------------------------------------------- +// Instantiate templates + +template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; +template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; +template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; +template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; +template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; +template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; +template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; +template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; +template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; +template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; +template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; + +} // namespace + +// Constructor with a column-major NumericTensor +SparseCOOIndex::SparseCOOIndex(const std::shared_ptr& coords) + : SparseIndex(coords->shape()[0]), coords_(coords) { + DCHECK(coords_->is_column_major()); +} + +// Constructor with all attributes +template +SparseTensor::SparseTensor( + const std::shared_ptr& sparse_index, + const std::shared_ptr& type, const std::shared_ptr& data, + const std::vector& shape, const std::vector& dim_names) + : type_(type), + data_(data), + shape_(shape), + sparse_index_(sparse_index), + dim_names_(dim_names) { + DCHECK(is_tensor_supported(type->id())); +} + +// Constructor with a dense tensor +template +SparseTensor::SparseTensor(const std::shared_ptr& type, + const std::vector& shape, + const std::vector& 
dim_names) + : SparseTensor(nullptr, type, nullptr, shape, dim_names) {} + +// Constructor with a dense tensor +template +template +SparseTensor::SparseTensor(const NumericTensor& tensor) + : SparseTensor(nullptr, tensor.type(), nullptr, tensor.shape(), tensor.dim_names_) { + SparseTensorConverter converter(tensor); + DCHECK_OK(converter.Convert()); + sparse_index_ = converter.sparse_index; + data_ = converter.data; +} + +// Constructor with a dense tensor +template +SparseTensor::SparseTensor(const Tensor& tensor) + : SparseTensor(nullptr, tensor.type(), nullptr, tensor.shape(), tensor.dim_names_) { + switch (tensor.type()->id()) { + case Type::UINT8: + MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::INT8: + MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::UINT16: + MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::INT16: + MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::UINT32: + MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::INT32: + MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::UINT64: + MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::INT64: + MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::HALF_FLOAT: + MakeSparseCOOTensorFromTensor( + tensor, &sparse_index_, &data_); + return; + case Type::FLOAT: + MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::DOUBLE: + MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + default: + break; + } +} + +template +const std::string& SparseTensor::dim_name(int i) const { + static const std::string kEmpty = ""; + if (dim_names_.size() == 0) { + return kEmpty; + } else { + DCHECK_LT(i, static_cast(dim_names_.size())); + return dim_names_[i]; + } +} + 
+template +int64_t SparseTensor::size() const { + return std::accumulate(shape_.begin(), shape_.end(), 1LL, std::multiplies()); +} + +// ---------------------------------------------------------------------- +// Instantiate templates + +template class ARROW_TEMPLATE_EXPORT SparseTensor; + +template SparseTensor::SparseTensor(const NumericTensor&); +template SparseTensor::SparseTensor(const NumericTensor&); +template SparseTensor::SparseTensor(const NumericTensor&); +template SparseTensor::SparseTensor(const NumericTensor&); +template SparseTensor::SparseTensor(const NumericTensor&); +template SparseTensor::SparseTensor(const NumericTensor&); +template SparseTensor::SparseTensor(const NumericTensor&); +template SparseTensor::SparseTensor(const NumericTensor&); +template SparseTensor::SparseTensor(const NumericTensor&); +template SparseTensor::SparseTensor(const NumericTensor&); +template SparseTensor::SparseTensor(const NumericTensor&); + +} // namespace arrow diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h new file mode 100644 index 00000000000..b9b8bc96af0 --- /dev/null +++ b/cpp/src/arrow/sparse_tensor.h @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_SPARSE_TENSOR_H +#define ARROW_SPARSE_TENSOR_H + +#include +#include +#include + +#include "arrow/tensor.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// SparseIndex class + +class ARROW_EXPORT SparseIndex { + public: + explicit SparseIndex(int64_t length) : length_(length) {} + int64_t length() const { return length_; } + + protected: + int64_t length_; +}; + +// ---------------------------------------------------------------------- +// SparseCOOIndex class + +class ARROW_EXPORT SparseCOOIndex : public SparseIndex { + public: + using CoordsTensor = NumericTensor; + + virtual ~SparseCOOIndex() = default; + + // Constructor with a column-major NumericTensor + explicit SparseCOOIndex(const std::shared_ptr& coords); + + const std::shared_ptr& indices() const { return coords_; } + + protected: + std::shared_ptr coords_; +}; + +// ---------------------------------------------------------------------- +// SparseTensor class + +template +class ARROW_EXPORT SparseTensor { + public: + virtual ~SparseTensor() = default; + + // Constructor with all attributes + SparseTensor(const std::shared_ptr& sparse_index, + const std::shared_ptr& type, const std::shared_ptr& data, + const std::vector& shape, + const std::vector& dim_names); + + // Constructor with a dense tensor + SparseTensor(const std::shared_ptr& type, const std::vector& shape, + const std::vector& dim_names = {}); + + // Constructor with a dense numeric tensor + template + explicit SparseTensor(const NumericTensor& tensor); + + // Constructor with a dense tensor + explicit SparseTensor(const Tensor& tensor); + + std::shared_ptr type() const { return type_; } + std::shared_ptr data() const { return data_; } + + const uint8_t* raw_data() const { return data_->data(); } + uint8_t* raw_mutable_data() const { return data_->mutable_data(); } + + const std::vector& shape() const { return shape_; } + const std::shared_ptr& sparse_index() const { return 
sparse_index_; } + + int ndim() const { return static_cast(shape_.size()); } + + const std::string& dim_name(int i) const; + + /// Total number of non-zero cells in the sparse tensor + int64_t length() const { return sparse_index_ ? sparse_index_->length() : 0; } + + /// Total number of value cells in the sparse tensor + int64_t size() const; + + /// Return true if the underlying data buffer is mutable + bool is_mutable() const { return data_->is_mutable(); } + + protected: + std::shared_ptr type_; + std::shared_ptr data_; + std::vector shape_; + std::shared_ptr sparse_index_; + + /// These names are optional + std::vector dim_names_; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(SparseTensor); +}; + +} // namespace arrow + +#endif // ARROW_SPARSE_TENSOR_H diff --git a/cpp/src/arrow/tensor.h b/cpp/src/arrow/tensor.h index a9b5df81fa1..e386b096037 100644 --- a/cpp/src/arrow/tensor.h +++ b/cpp/src/arrow/tensor.h @@ -50,6 +50,9 @@ static inline bool is_tensor_supported(Type::type type_id) { return false; } +template +class SparseTensor; + class ARROW_EXPORT Tensor { public: virtual ~Tensor() = default; @@ -110,6 +113,9 @@ class ARROW_EXPORT Tensor { /// These names are optional std::vector dim_names_; + template + friend class SparseTensor; + private: ARROW_DISALLOW_COPY_AND_ASSIGN(Tensor); }; From 433c9b4416ff0011eccfa21f8517eaab96f4d1fc Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 14 Nov 2018 17:51:40 +0900 Subject: [PATCH 11/40] Change COO index matrix to column-major in a format description --- format/SparseTensor.fbs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs index 59aa6aaad6e..95666979bb5 100644 --- a/format/SparseTensor.fbs +++ b/format/SparseTensor.fbs @@ -40,14 +40,12 @@ table SparseTensorIndexCOO { /// X[0, 1, 2, 1] := 5 /// X[1, 2, 0, 4] := 6 /// - /// In COO format, the index matrix of X is the following 10x4 matrix: + /// In COO format, the index matrix of X is 
the following 4x10 matrix: /// - /// [[0, 1, 2, 0], - /// [0, 1, 2, 1], - /// [0, 1, 3, 0], - /// [0, 2, 1, 0], - /// [1, 1, 2, 3], - /// [1, 2, 0, 4]] + /// [[0, 0, 0, 0, 1, 1], + /// [1, 1, 1, 2, 1, 2], + /// [2, 2, 3, 1, 2, 0], + /// [0, 1, 0, 0, 3, 4]] /// /// Note that the indices are sorted in lexcographical order. indicesBuffer: Buffer; From 4251b4d08eac6f1c91598d227f931d6690b97e7d Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Tue, 20 Nov 2018 21:16:49 +0900 Subject: [PATCH 12/40] Add SparseCSRIndex --- cpp/src/arrow/sparse_tensor-test.cc | 53 ++++++ cpp/src/arrow/sparse_tensor.cc | 273 ++++++++++++++++++++-------- cpp/src/arrow/sparse_tensor.h | 21 +++ 3 files changed, 273 insertions(+), 74 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor-test.cc b/cpp/src/arrow/sparse_tensor-test.cc index 63ef2d11e77..7705d380676 100644 --- a/cpp/src/arrow/sparse_tensor-test.cc +++ b/cpp/src/arrow/sparse_tensor-test.cc @@ -176,4 +176,57 @@ TEST(TestSparseCOOTensor, CreationFromTensor) { ASSERT_EQ(3, sidx->Value({11, 2})); } +TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) { + std::vector shape = {6, 4}; + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::shared_ptr buffer(Buffer::Wrap(values)); + std::vector dim_names = {"foo", "bar", "baz"}; + NumericTensor tensor1(buffer, shape); + NumericTensor tensor2(buffer, shape, {}, dim_names); + + SparseTensor st1(tensor1); + SparseTensor st2(tensor2); + + ASSERT_EQ(12, st1.length()); + ASSERT_TRUE(st1.is_mutable()); + + ASSERT_EQ("foo", st2.dim_name(0)); + ASSERT_EQ("bar", st2.dim_name(1)); + ASSERT_EQ("baz", st2.dim_name(2)); + + ASSERT_EQ("", st1.dim_name(0)); + ASSERT_EQ("", st1.dim_name(1)); + ASSERT_EQ("", st1.dim_name(2)); + + const int64_t* ptr = reinterpret_cast(st1.raw_data()); + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(i + 1, ptr[i]); + } + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(i + 11, ptr[i + 6]); + } + + std::shared_ptr si = 
st1.sparse_index(); + + ASSERT_EQ(1, si->indptr()->ndim()); + ASSERT_EQ(1, si->indices()->ndim()); + + const int64_t* indptr_begin = + reinterpret_cast(si->indptr()->raw_data()); + std::vector indptr_values(indptr_begin, + indptr_begin + si->indptr()->shape()[0]); + + ASSERT_EQ(7, indptr_values.size()); + ASSERT_EQ(std::vector({0, 2, 4, 6, 8, 10, 12}), indptr_values); + + const int64_t* indices_begin = + reinterpret_cast(si->indices()->raw_data()); + std::vector indices_values(indices_begin, + indices_begin + si->indices()->shape()[0]); + + ASSERT_EQ(12, indices_values.size()); + ASSERT_EQ(std::vector({0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}), indices_values); +} + } // namespace arrow diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index ba7a2e82fd5..533177aa111 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -35,6 +35,14 @@ struct SparseIndexTraits { static inline const char* name() { return "SparseCOOIndex"; } }; +template <> +struct SparseIndexTraits { + static inline const char* name() { return "SparseCSRIndex"; } +}; + +// ---------------------------------------------------------------------- +// SparseTensorConverter + template class SparseTensorConverter { public: @@ -47,20 +55,75 @@ class SparseTensorConverter { } }; +// ---------------------------------------------------------------------- +// SparseTensorConverter for SparseCOOIndex + template -class SparseTensorConverter { - public: +struct SparseTensorConverterBase { using NumericTensorType = NumericTensor; using value_type = typename NumericTensorType::value_type; - explicit SparseTensorConverter(const NumericTensor& tensor) : tensor_(tensor) {} + explicit SparseTensorConverterBase(const NumericTensorType& tensor) : tensor_(tensor) {} + + bool TensorIsTriviallyIterable() const { + return tensor_.ndim() <= 1 || tensor_.is_contiguous(); + } + + size_t CountNonZero() const { + if (tensor_.size() == 0) { + return 0; + } + + if 
(TensorIsTriviallyIterable()) { + const value_type* data = reinterpret_cast(tensor_.raw_data()); + return std::count_if(data, data + tensor_.size(), + [](value_type x) { return x != 0; }); + } + + const std::vector& shape = tensor_.shape(); + const int64_t ndim = tensor_.ndim(); + + size_t count = 0; + std::vector coord(ndim, 0); + for (int64_t n = tensor_.size(); n > 0; n--) { + if (tensor_.Value(coord) != 0) { + ++count; + } + + // increment index + ++coord[ndim - 1]; + if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) { + int64_t d = ndim - 1; + while (d > 0 && coord[d] == shape[d]) { + coord[d] = 0; + ++coord[d - 1]; + --d; + } + } + } + return count; + } + + const NumericTensorType& tensor_; +}; + +template +class SparseTensorConverter + : private SparseTensorConverterBase { + public: + using BaseClass = SparseTensorConverterBase; + using NumericTensorType = typename BaseClass::NumericTensorType; + using value_type = typename BaseClass::value_type; + + explicit SparseTensorConverter(const NumericTensorType& tensor) : BaseClass(tensor) {} Status Convert() { const int64_t ndim = tensor_.ndim(); const int64_t nonzero_count = static_cast(CountNonZero()); std::shared_ptr indices_buffer; - RETURN_NOT_OK(AllocateBuffer(sizeof(int64_t) * ndim * nonzero_count, &indices_buffer)); + RETURN_NOT_OK( + AllocateBuffer(sizeof(int64_t) * ndim * nonzero_count, &indices_buffer)); int64_t* indices = reinterpret_cast(indices_buffer->mutable_data()); std::shared_ptr values_buffer; @@ -85,7 +148,7 @@ class SparseTensorConverter { if (tensor_.Value(coord) != 0) { *values++ = x; - int64_t *indp = indices; + int64_t* indp = indices; for (int64_t i = 0; i < ndim; ++i) { *indp = coord[i]; indp += nonzero_count; @@ -109,11 +172,11 @@ class SparseTensorConverter { // make results const std::vector indices_shape = {nonzero_count, ndim}; const int64_t indices_elsize = sizeof(int64_t); - const std::vector indices_strides = {indices_elsize, indices_elsize * nonzero_count}; - sparse_index = 
std::make_shared( - std::make_shared(indices_buffer, - indices_shape, - indices_strides)); + const std::vector indices_strides = {indices_elsize, + indices_elsize * nonzero_count}; + sparse_index = + std::make_shared(std::make_shared( + indices_buffer, indices_shape, indices_strides)); data = values_buffer; return Status::OK(); @@ -122,47 +185,9 @@ class SparseTensorConverter { std::shared_ptr sparse_index; std::shared_ptr data; - protected: - bool TensorIsTriviallyIterable() const { - return tensor_.ndim() <= 1 || tensor_.is_contiguous(); - } - - size_t CountNonZero() const { - if (tensor_.size() == 0) { - return 0; - } - - if (TensorIsTriviallyIterable()) { - const value_type* data = reinterpret_cast(tensor_.raw_data()); - return std::count_if(data, data + tensor_.size(), [](value_type x) { return x != 0; }); - } - - const std::vector& shape = tensor_.shape(); - const int64_t ndim = tensor_.ndim(); - - size_t count = 0; - std::vector coord(ndim, 0); - for (int64_t n = tensor_.size(); n > 0; n--) { - if (tensor_.Value(coord) != 0) { - ++count; - } - - // increment index - ++coord[ndim - 1]; - if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) { - int64_t d = ndim - 1; - while (d > 0 && coord[d] == shape[d]) { - coord[d] = 0; - ++coord[d - 1]; - --d; - } - } - } - return count; - } - private: - const NumericTensor& tensor_; + using SparseTensorConverterBase::tensor_; + using SparseTensorConverterBase::CountNonZero; }; template @@ -176,29 +201,126 @@ void MakeSparseCOOTensorFromTensor(const Tensor& tensor, *data = converter.data; } +// ---------------------------------------------------------------------- +// SparseTensorConverter for SparseCSRIndex + +template +class SparseTensorConverter + : private SparseTensorConverterBase { + public: + using BaseClass = SparseTensorConverterBase; + using NumericTensorType = typename BaseClass::NumericTensorType; + using value_type = typename BaseClass::value_type; + + explicit SparseTensorConverter(const NumericTensorType& 
tensor) : BaseClass(tensor) {} + + Status Convert() { + const int64_t ndim = tensor_.ndim(); + if (ndim > 2) { + return Status::Invalid("Invalid tensor dimension"); + } + + const int64_t nr = tensor_.shape()[0]; + const int64_t nc = tensor_.shape()[1]; + const int64_t nonzero_count = static_cast(CountNonZero()); + + std::shared_ptr indptr_buffer; + std::shared_ptr indices_buffer; + + std::shared_ptr values_buffer; + RETURN_NOT_OK(AllocateBuffer(sizeof(value_type) * nonzero_count, &values_buffer)); + value_type* values = reinterpret_cast(values_buffer->mutable_data()); + + if (ndim <= 1) { + return Status::NotImplemented("TODO for ndim <= 1"); + } else { + RETURN_NOT_OK(AllocateBuffer(sizeof(int64_t) * (nr + 1), &indptr_buffer)); + int64_t* indptr = reinterpret_cast(indptr_buffer->mutable_data()); + + RETURN_NOT_OK(AllocateBuffer(sizeof(int64_t) * nonzero_count, &indices_buffer)); + int64_t* indices = reinterpret_cast(indices_buffer->mutable_data()); + + int64_t k = 0; + *indptr++ = 0; + for (int64_t i = 0; i < nr; ++i) { + for (int64_t j = 0; j < nc; ++j) { + const value_type x = tensor_.Value({i, j}); + if (x != 0) { + *values++ = x; + *indices++ = j; + k++; + } + } + *indptr++ = k; + } + } + + std::vector indptr_shape({nr + 1}); + std::shared_ptr indptr_tensor = + std::make_shared(indptr_buffer, indptr_shape); + + std::vector indices_shape({nonzero_count}); + std::shared_ptr indices_tensor = + std::make_shared(indices_buffer, indices_shape); + + sparse_index = std::make_shared(indptr_tensor, indices_tensor); + data = values_buffer; + + return Status::OK(); + } + + std::shared_ptr sparse_index; + std::shared_ptr data; + + private: + using BaseClass::tensor_; + using SparseTensorConverterBase::CountNonZero; +}; + // ---------------------------------------------------------------------- // Instantiate templates -template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; -template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; -template class 
ARROW_TEMPLATE_EXPORT SparseTensorConverter; -template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; -template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; -template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; -template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; -template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; -template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; -template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; -template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; +#define INSTANTIATE_SPARSE_TENSOR_CONVERTER(IndexType) \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter + +INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCOOIndex); +INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSRIndex); } // namespace +// ---------------------------------------------------------------------- +// SparseCOOIndex + // Constructor with a column-major NumericTensor SparseCOOIndex::SparseCOOIndex(const std::shared_ptr& coords) : SparseIndex(coords->shape()[0]), coords_(coords) { DCHECK(coords_->is_column_major()); } +// ---------------------------------------------------------------------- +// SparseCSRIndex + +// Constructor with two index vectors +SparseCSRIndex::SparseCSRIndex(const std::shared_ptr& indptr, + const std::shared_ptr& indices) + : SparseIndex(indices->shape()[0]), 
indptr_(indptr), indices_(indices) { + DCHECK_EQ(1, indptr_->ndim()); + DCHECK_EQ(1, indices_->ndim()); +} + +// ---------------------------------------------------------------------- +// SparseTensor + // Constructor with all attributes template SparseTensor::SparseTensor( @@ -304,18 +426,21 @@ int64_t SparseTensor::size() const { // ---------------------------------------------------------------------- // Instantiate templates -template class ARROW_TEMPLATE_EXPORT SparseTensor; - -template SparseTensor::SparseTensor(const NumericTensor&); -template SparseTensor::SparseTensor(const NumericTensor&); -template SparseTensor::SparseTensor(const NumericTensor&); -template SparseTensor::SparseTensor(const NumericTensor&); -template SparseTensor::SparseTensor(const NumericTensor&); -template SparseTensor::SparseTensor(const NumericTensor&); -template SparseTensor::SparseTensor(const NumericTensor&); -template SparseTensor::SparseTensor(const NumericTensor&); -template SparseTensor::SparseTensor(const NumericTensor&); -template SparseTensor::SparseTensor(const NumericTensor&); -template SparseTensor::SparseTensor(const NumericTensor&); +#define INSTANTIATE_SPARSE_TENSOR(IndexType) \ + template class ARROW_TEMPLATE_EXPORT SparseTensor; \ + template SparseTensor::SparseTensor(const NumericTensor&); \ + template SparseTensor::SparseTensor(const NumericTensor&); \ + template SparseTensor::SparseTensor(const NumericTensor&); \ + template SparseTensor::SparseTensor(const NumericTensor&); \ + template SparseTensor::SparseTensor(const NumericTensor&); \ + template SparseTensor::SparseTensor(const NumericTensor&); \ + template SparseTensor::SparseTensor(const NumericTensor&); \ + template SparseTensor::SparseTensor(const NumericTensor&); \ + template SparseTensor::SparseTensor(const NumericTensor&); \ + template SparseTensor::SparseTensor(const NumericTensor&); \ + template SparseTensor::SparseTensor(const NumericTensor&) + +INSTANTIATE_SPARSE_TENSOR(SparseCOOIndex); 
+INSTANTIATE_SPARSE_TENSOR(SparseCSRIndex); } // namespace arrow diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index b9b8bc96af0..be891007f26 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -56,6 +56,27 @@ class ARROW_EXPORT SparseCOOIndex : public SparseIndex { std::shared_ptr coords_; }; +// ---------------------------------------------------------------------- +// SparseCSRIndex class + +class ARROW_EXPORT SparseCSRIndex : public SparseIndex { + public: + using IndexTensor = NumericTensor; + + virtual ~SparseCSRIndex() = default; + + // Constructor with two index vectors + explicit SparseCSRIndex(const std::shared_ptr& indptr, + const std::shared_ptr& indices); + + const std::shared_ptr& indptr() const { return indptr_; } + const std::shared_ptr& indices() const { return indices_; } + + protected: + std::shared_ptr indptr_; + std::shared_ptr indices_; +}; + // ---------------------------------------------------------------------- // SparseTensor class From ed3984dd47e026a1ae171892b99601c6cadba8cd Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 5 Dec 2018 09:34:41 +0900 Subject: [PATCH 13/40] Add SparseIndex::format_type --- cpp/src/arrow/sparse_tensor.cc | 21 +++------------------ cpp/src/arrow/sparse_tensor.h | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 533177aa111..e8562c487a2 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -27,19 +27,6 @@ namespace arrow { namespace { -template -struct SparseIndexTraits {}; - -template <> -struct SparseIndexTraits { - static inline const char* name() { return "SparseCOOIndex"; } -}; - -template <> -struct SparseIndexTraits { - static inline const char* name() { return "SparseCSRIndex"; } -}; - // ---------------------------------------------------------------------- // SparseTensorConverter @@ -49,9 +36,7 @@ class 
SparseTensorConverter { explicit SparseTensorConverter(const NumericTensor&) {} Status Convert() { - std::string sparse_index_name(SparseIndexTraits::name()); - return Status::NotImplemented(sparse_index_name + - std::string(" is not supported yet.")); + return Status::Invalid("Unsupported sparse index"); } }; @@ -303,7 +288,7 @@ INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSRIndex); // Constructor with a column-major NumericTensor SparseCOOIndex::SparseCOOIndex(const std::shared_ptr& coords) - : SparseIndex(coords->shape()[0]), coords_(coords) { + : SparseIndex(SparseIndex::COO, coords->shape()[0]), coords_(coords) { DCHECK(coords_->is_column_major()); } @@ -313,7 +298,7 @@ SparseCOOIndex::SparseCOOIndex(const std::shared_ptr& coords) // Constructor with two index vectors SparseCSRIndex::SparseCSRIndex(const std::shared_ptr& indptr, const std::shared_ptr& indices) - : SparseIndex(indices->shape()[0]), indptr_(indptr), indices_(indices) { + : SparseIndex(SparseIndex::CSR, indices->shape()[0]), indptr_(indptr), indices_(indices) { DCHECK_EQ(1, indptr_->ndim()); DCHECK_EQ(1, indices_->ndim()); } diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index be891007f26..e5a3915a60d 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -31,10 +31,19 @@ namespace arrow { class ARROW_EXPORT SparseIndex { public: - explicit SparseIndex(int64_t length) : length_(length) {} + enum format_type { + COO, + CSR + }; + + explicit SparseIndex(format_type format_type_id, int64_t length) + : format_type_id_(format_type_id), length_(length) {} + + format_type format_type_id() const { return format_type_id_; } int64_t length() const { return length_; } protected: + format_type format_type_id_; int64_t length_; }; @@ -45,6 +54,8 @@ class ARROW_EXPORT SparseCOOIndex : public SparseIndex { public: using CoordsTensor = NumericTensor; + static constexpr SparseIndex::format_type format_type_id = SparseIndex::COO; + virtual ~SparseCOOIndex() = 
default; // Constructor with a column-major NumericTensor @@ -63,6 +74,8 @@ class ARROW_EXPORT SparseCSRIndex : public SparseIndex { public: using IndexTensor = NumericTensor; + static constexpr SparseIndex::format_type format_type_id = SparseIndex::COO; + virtual ~SparseCSRIndex() = default; // Constructor with two index vectors From 021b46be0d36bfca849a6838a23ac02cb4b6c828 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 5 Dec 2018 10:03:49 +0900 Subject: [PATCH 14/40] Add SparseTensorBase --- cpp/src/arrow/sparse_tensor-test.cc | 16 ++++- cpp/src/arrow/sparse_tensor.cc | 77 ++++++++++++------------ cpp/src/arrow/sparse_tensor.h | 91 +++++++++++++++++++---------- 3 files changed, 111 insertions(+), 73 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor-test.cc b/cpp/src/arrow/sparse_tensor-test.cc index 7705d380676..d31538e7610 100644 --- a/cpp/src/arrow/sparse_tensor-test.cc +++ b/cpp/src/arrow/sparse_tensor-test.cc @@ -32,6 +32,12 @@ namespace arrow { +static inline void CheckSparseIndexFormatType(SparseIndex::format_type expected, + const SparseTensorBase& sparse_tensor) { + ASSERT_EQ(expected, sparse_tensor.sparse_index_format_type_id()); + ASSERT_EQ(expected, sparse_tensor.sparse_index()->format_type_id()); +} + TEST(TestSparseCOOTensor, CreationEmptyTensor) { std::vector shape = {2, 3, 4}; SparseTensor st1(int64(), shape); @@ -65,6 +71,8 @@ TEST(TestSparseCOOTensor, CreationFromNumericTensor) { SparseTensor st1(tensor1); SparseTensor st2(tensor2); + CheckSparseIndexFormatType(SparseIndex::COO, st1); + ASSERT_EQ(12, st1.length()); ASSERT_TRUE(st1.is_mutable()); @@ -84,7 +92,7 @@ TEST(TestSparseCOOTensor, CreationFromNumericTensor) { ASSERT_EQ(i + 11, ptr[i + 6]); } - std::shared_ptr si = st1.sparse_index(); + std::shared_ptr si = std::dynamic_pointer_cast(st1.sparse_index()); std::shared_ptr sidx = si->indices(); ASSERT_EQ(std::vector({12, 3}), sidx->shape()); ASSERT_TRUE(sidx->is_column_major()); @@ -145,7 +153,7 @@ TEST(TestSparseCOOTensor, 
CreationFromTensor) { ASSERT_EQ(i + 11, ptr[i + 6]); } - std::shared_ptr si = st1.sparse_index(); + std::shared_ptr si = std::dynamic_pointer_cast(st1.sparse_index()); std::shared_ptr sidx = si->indices(); ASSERT_EQ(std::vector({12, 3}), sidx->shape()); ASSERT_TRUE(sidx->is_column_major()); @@ -188,6 +196,8 @@ TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) { SparseTensor st1(tensor1); SparseTensor st2(tensor2); + CheckSparseIndexFormatType(SparseIndex::CSR, st1); + ASSERT_EQ(12, st1.length()); ASSERT_TRUE(st1.is_mutable()); @@ -207,7 +217,7 @@ TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) { ASSERT_EQ(i + 11, ptr[i + 6]); } - std::shared_ptr si = st1.sparse_index(); + std::shared_ptr si = std::dynamic_pointer_cast(st1.sparse_index()); ASSERT_EQ(1, si->indptr()->ndim()); ASSERT_EQ(1, si->indices()->ndim()); diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index e8562c487a2..0f437380bd4 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -176,9 +176,9 @@ class SparseTensorConverter }; template -void MakeSparseCOOTensorFromTensor(const Tensor& tensor, - std::shared_ptr* sparse_index, - std::shared_ptr* data) { +void MakeSparseTensorFromTensor(const Tensor& tensor, + std::shared_ptr* sparse_index, + std::shared_ptr* data) { NumericTensor numeric_tensor(tensor.data(), tensor.shape(), tensor.strides()); SparseTensorConverter converter(numeric_tensor); DCHECK_OK(converter.Convert()); @@ -288,7 +288,7 @@ INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSRIndex); // Constructor with a column-major NumericTensor SparseCOOIndex::SparseCOOIndex(const std::shared_ptr& coords) - : SparseIndex(SparseIndex::COO, coords->shape()[0]), coords_(coords) { + : SparseIndexBase(coords->shape()[0]), coords_(coords) { DCHECK(coords_->is_column_major()); } @@ -298,20 +298,20 @@ SparseCOOIndex::SparseCOOIndex(const std::shared_ptr& coords) // Constructor with two index vectors SparseCSRIndex::SparseCSRIndex(const 
std::shared_ptr& indptr, const std::shared_ptr& indices) - : SparseIndex(SparseIndex::CSR, indices->shape()[0]), indptr_(indptr), indices_(indices) { + : SparseIndexBase(indices->shape()[0]), indptr_(indptr), indices_(indices) { DCHECK_EQ(1, indptr_->ndim()); DCHECK_EQ(1, indices_->ndim()); } // ---------------------------------------------------------------------- -// SparseTensor +// SparseTensorBase // Constructor with all attributes -template -SparseTensor::SparseTensor( - const std::shared_ptr& sparse_index, - const std::shared_ptr& type, const std::shared_ptr& data, - const std::vector& shape, const std::vector& dim_names) +SparseTensorBase::SparseTensorBase(const std::shared_ptr& type, + const std::shared_ptr& data, + const std::vector& shape, + const std::shared_ptr& sparse_index, + const std::vector& dim_names) : type_(type), data_(data), shape_(shape), @@ -320,6 +320,23 @@ SparseTensor::SparseTensor( DCHECK(is_tensor_supported(type->id())); } +const std::string& SparseTensorBase::dim_name(int i) const { + static const std::string kEmpty = ""; + if (dim_names_.size() == 0) { + return kEmpty; + } else { + DCHECK_LT(i, static_cast(dim_names_.size())); + return dim_names_[i]; + } +} + +int64_t SparseTensorBase::size() const { + return std::accumulate(shape_.begin(), shape_.end(), 1LL, std::multiplies()); +} + +// ---------------------------------------------------------------------- +// SparseTensor + // Constructor with a dense tensor template SparseTensor::SparseTensor(const std::shared_ptr& type, @@ -344,47 +361,47 @@ SparseTensor::SparseTensor(const Tensor& tensor) : SparseTensor(nullptr, tensor.type(), nullptr, tensor.shape(), tensor.dim_names_) { switch (tensor.type()->id()) { case Type::UINT8: - MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + MakeSparseTensorFromTensor(tensor, &sparse_index_, &data_); return; case Type::INT8: - MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + MakeSparseTensorFromTensor(tensor, &sparse_index_, &data_); 
return; case Type::UINT16: - MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + MakeSparseTensorFromTensor(tensor, &sparse_index_, &data_); return; case Type::INT16: - MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + MakeSparseTensorFromTensor(tensor, &sparse_index_, &data_); return; case Type::UINT32: - MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + MakeSparseTensorFromTensor(tensor, &sparse_index_, &data_); return; case Type::INT32: - MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + MakeSparseTensorFromTensor(tensor, &sparse_index_, &data_); return; case Type::UINT64: - MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + MakeSparseTensorFromTensor(tensor, &sparse_index_, &data_); return; case Type::INT64: - MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + MakeSparseTensorFromTensor(tensor, &sparse_index_, &data_); return; case Type::HALF_FLOAT: - MakeSparseCOOTensorFromTensor( + MakeSparseTensorFromTensor( tensor, &sparse_index_, &data_); return; case Type::FLOAT: - MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + MakeSparseTensorFromTensor(tensor, &sparse_index_, &data_); return; case Type::DOUBLE: - MakeSparseCOOTensorFromTensor(tensor, &sparse_index_, + MakeSparseTensorFromTensor(tensor, &sparse_index_, &data_); return; default: @@ -392,22 +409,6 @@ SparseTensor::SparseTensor(const Tensor& tensor) } } -template -const std::string& SparseTensor::dim_name(int i) const { - static const std::string kEmpty = ""; - if (dim_names_.size() == 0) { - return kEmpty; - } else { - DCHECK_LT(i, static_cast(dim_names_.size())); - return dim_names_[i]; - } -} - -template -int64_t SparseTensor::size() const { - return std::accumulate(shape_.begin(), shape_.end(), 1LL, std::multiplies()); -} - // ---------------------------------------------------------------------- // Instantiate templates diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index e5a3915a60d..51918073ed7 100644 --- 
a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -39,6 +39,8 @@ class ARROW_EXPORT SparseIndex { explicit SparseIndex(format_type format_type_id, int64_t length) : format_type_id_(format_type_id), length_(length) {} + virtual ~SparseIndex() = default; + format_type format_type_id() const { return format_type_id_; } int64_t length() const { return length_; } @@ -47,17 +49,22 @@ class ARROW_EXPORT SparseIndex { int64_t length_; }; +template +class SparseIndexBase : public SparseIndex { + public: + explicit SparseIndexBase(int64_t length) + : SparseIndex(SparseIndexType::format_type_id, length) {} +}; + // ---------------------------------------------------------------------- // SparseCOOIndex class -class ARROW_EXPORT SparseCOOIndex : public SparseIndex { +class ARROW_EXPORT SparseCOOIndex : public SparseIndexBase { public: using CoordsTensor = NumericTensor; static constexpr SparseIndex::format_type format_type_id = SparseIndex::COO; - virtual ~SparseCOOIndex() = default; - // Constructor with a column-major NumericTensor explicit SparseCOOIndex(const std::shared_ptr& coords); @@ -70,13 +77,11 @@ class ARROW_EXPORT SparseCOOIndex : public SparseIndex { // ---------------------------------------------------------------------- // SparseCSRIndex class -class ARROW_EXPORT SparseCSRIndex : public SparseIndex { +class ARROW_EXPORT SparseCSRIndex : public SparseIndexBase { public: using IndexTensor = NumericTensor; - static constexpr SparseIndex::format_type format_type_id = SparseIndex::COO; - - virtual ~SparseCSRIndex() = default; + static constexpr SparseIndex::format_type format_type_id = SparseIndex::CSR; // Constructor with two index vectors explicit SparseCSRIndex(const std::shared_ptr& indptr, @@ -91,29 +96,13 @@ class ARROW_EXPORT SparseCSRIndex : public SparseIndex { }; // ---------------------------------------------------------------------- -// SparseTensor class +// SparseTensorBase class -template -class ARROW_EXPORT SparseTensor { 
+class ARROW_EXPORT SparseTensorBase { public: - virtual ~SparseTensor() = default; + virtual ~SparseTensorBase() = default; - // Constructor with all attributes - SparseTensor(const std::shared_ptr& sparse_index, - const std::shared_ptr& type, const std::shared_ptr& data, - const std::vector& shape, - const std::vector& dim_names); - - // Constructor with a dense tensor - SparseTensor(const std::shared_ptr& type, const std::vector& shape, - const std::vector& dim_names = {}); - - // Constructor with a dense numeric tensor - template - explicit SparseTensor(const NumericTensor& tensor); - - // Constructor with a dense tensor - explicit SparseTensor(const Tensor& tensor); + virtual SparseIndex::format_type sparse_index_format_type_id() const = 0; std::shared_ptr type() const { return type_; } std::shared_ptr data() const { return data_; } @@ -122,29 +111,67 @@ class ARROW_EXPORT SparseTensor { uint8_t* raw_mutable_data() const { return data_->mutable_data(); } const std::vector& shape() const { return shape_; } - const std::shared_ptr& sparse_index() const { return sparse_index_; } + + const std::shared_ptr& sparse_index() const { return sparse_index_; } int ndim() const { return static_cast(shape_.size()); } const std::string& dim_name(int i) const; - /// Total number of non-zero cells in the sparse tensor - int64_t length() const { return sparse_index_ ? 
sparse_index_->length() : 0; } - /// Total number of value cells in the sparse tensor int64_t size() const; /// Return true if the underlying data buffer is mutable bool is_mutable() const { return data_->is_mutable(); } + /// Total number of non-zero cells in the sparse tensor + virtual int64_t length() const = 0; + protected: + // Constructor with all attributes + SparseTensorBase(const std::shared_ptr& type, const std::shared_ptr& data, + const std::vector& shape, const std::shared_ptr& sparse_index, + const std::vector& dim_names); + std::shared_ptr type_; std::shared_ptr data_; std::vector shape_; - std::shared_ptr sparse_index_; + std::shared_ptr sparse_index_; /// These names are optional std::vector dim_names_; +}; + +// ---------------------------------------------------------------------- +// SparseTensor class + +template +class ARROW_EXPORT SparseTensor : public SparseTensorBase { + public: + virtual ~SparseTensor() = default; + + // Constructor with all attributes + SparseTensor(const std::shared_ptr& sparse_index, + const std::shared_ptr& type, const std::shared_ptr& data, + const std::vector& shape, + const std::vector& dim_names) + : SparseTensorBase(type, data, shape, sparse_index, dim_names) {} + + // Constructor for empty sparse tensor + SparseTensor(const std::shared_ptr& type, const std::vector& shape, + const std::vector& dim_names = {}); + + // Constructor with a dense numeric tensor + template + explicit SparseTensor(const NumericTensor& tensor); + + // Constructor with a dense tensor + explicit SparseTensor(const Tensor& tensor); + + SparseIndex::format_type sparse_index_format_type_id() const { return SparseIndexType::format_type_id; } + + /// Total number of non-zero cells in the sparse tensor + int64_t length() const { return sparse_index_ ? 
sparse_index_->length() : 0; } private: ARROW_DISALLOW_COPY_AND_ASSIGN(SparseTensor); From 93c03adad1c25d1b0570efc0aa6e9eecc0aff44c Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Thu, 6 Dec 2018 12:41:11 +0900 Subject: [PATCH 15/40] Add SparseIndex::ToString() --- cpp/src/arrow/sparse_tensor.cc | 4 ++++ cpp/src/arrow/sparse_tensor.h | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 0f437380bd4..d93696233e7 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -292,6 +292,8 @@ SparseCOOIndex::SparseCOOIndex(const std::shared_ptr& coords) DCHECK(coords_->is_column_major()); } +std::string SparseCOOIndex::ToString() const { return std::string("SparseCOOIndex"); } + // ---------------------------------------------------------------------- // SparseCSRIndex @@ -303,6 +305,8 @@ SparseCSRIndex::SparseCSRIndex(const std::shared_ptr& indptr, DCHECK_EQ(1, indices_->ndim()); } +std::string SparseCSRIndex::ToString() const { return std::string("SparseCOOIndex"); } + // ---------------------------------------------------------------------- // SparseTensorBase diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 51918073ed7..a746f4d4621 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -44,6 +44,8 @@ class ARROW_EXPORT SparseIndex { format_type format_type_id() const { return format_type_id_; } int64_t length() const { return length_; } + virtual std::string ToString() const = 0; + protected: format_type format_type_id_; int64_t length_; @@ -70,6 +72,8 @@ class ARROW_EXPORT SparseCOOIndex : public SparseIndexBase { const std::shared_ptr& indices() const { return coords_; } + std::string ToString() const override; + protected: std::shared_ptr coords_; }; @@ -90,6 +94,8 @@ class ARROW_EXPORT SparseCSRIndex : public SparseIndexBase { const std::shared_ptr& indptr() const { return indptr_; } const std::shared_ptr& 
indices() const { return indices_; } + std::string ToString() const override; + protected: std::shared_ptr indptr_; std::shared_ptr indices_; From 51a83bfee658ccf4f38b9f885ceb22223be4307b Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Fri, 7 Dec 2018 15:12:42 +0900 Subject: [PATCH 16/40] Add SparseTensorFormat --- cpp/src/arrow/sparse_tensor-test.cc | 10 +++++----- cpp/src/arrow/sparse_tensor.h | 25 +++++++++++-------------- cpp/src/arrow/sparse_tensor_format.h | 28 ++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 19 deletions(-) create mode 100644 cpp/src/arrow/sparse_tensor_format.h diff --git a/cpp/src/arrow/sparse_tensor-test.cc b/cpp/src/arrow/sparse_tensor-test.cc index d31538e7610..f4e7edabeeb 100644 --- a/cpp/src/arrow/sparse_tensor-test.cc +++ b/cpp/src/arrow/sparse_tensor-test.cc @@ -32,10 +32,10 @@ namespace arrow { -static inline void CheckSparseIndexFormatType(SparseIndex::format_type expected, +static inline void CheckSparseIndexFormatType(SparseTensorFormat::type expected, const SparseTensorBase& sparse_tensor) { - ASSERT_EQ(expected, sparse_tensor.sparse_index_format_type_id()); - ASSERT_EQ(expected, sparse_tensor.sparse_index()->format_type_id()); + ASSERT_EQ(expected, sparse_tensor.sparse_tensor_format_id()); + ASSERT_EQ(expected, sparse_tensor.sparse_index()->format_id()); } TEST(TestSparseCOOTensor, CreationEmptyTensor) { @@ -71,7 +71,7 @@ TEST(TestSparseCOOTensor, CreationFromNumericTensor) { SparseTensor st1(tensor1); SparseTensor st2(tensor2); - CheckSparseIndexFormatType(SparseIndex::COO, st1); + CheckSparseIndexFormatType(SparseTensorFormat::COO, st1); ASSERT_EQ(12, st1.length()); ASSERT_TRUE(st1.is_mutable()); @@ -196,7 +196,7 @@ TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) { SparseTensor st1(tensor1); SparseTensor st2(tensor2); - CheckSparseIndexFormatType(SparseIndex::CSR, st1); + CheckSparseIndexFormatType(SparseTensorFormat::CSR, st1); ASSERT_EQ(12, st1.length()); ASSERT_TRUE(st1.is_mutable()); diff --git 
a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index a746f4d4621..0a63ad1afd3 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -23,6 +23,7 @@ #include #include "arrow/tensor.h" +#include "arrow/sparse_tensor_format.h" namespace arrow { @@ -31,23 +32,18 @@ namespace arrow { class ARROW_EXPORT SparseIndex { public: - enum format_type { - COO, - CSR - }; - - explicit SparseIndex(format_type format_type_id, int64_t length) - : format_type_id_(format_type_id), length_(length) {} + explicit SparseIndex(SparseTensorFormat::type format_id, int64_t length) + : format_id_(format_id), length_(length) {} virtual ~SparseIndex() = default; - format_type format_type_id() const { return format_type_id_; } + SparseTensorFormat::type format_id() const { return format_id_; } int64_t length() const { return length_; } virtual std::string ToString() const = 0; protected: - format_type format_type_id_; + SparseTensorFormat::type format_id_; int64_t length_; }; @@ -55,7 +51,7 @@ template class SparseIndexBase : public SparseIndex { public: explicit SparseIndexBase(int64_t length) - : SparseIndex(SparseIndexType::format_type_id, length) {} + : SparseIndex(SparseIndexType::format_id, length) {} }; // ---------------------------------------------------------------------- @@ -65,7 +61,7 @@ class ARROW_EXPORT SparseCOOIndex : public SparseIndexBase { public: using CoordsTensor = NumericTensor; - static constexpr SparseIndex::format_type format_type_id = SparseIndex::COO; + static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::COO; // Constructor with a column-major NumericTensor explicit SparseCOOIndex(const std::shared_ptr& coords); @@ -85,7 +81,7 @@ class ARROW_EXPORT SparseCSRIndex : public SparseIndexBase { public: using IndexTensor = NumericTensor; - static constexpr SparseIndex::format_type format_type_id = SparseIndex::CSR; + static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSR; // 
Constructor with two index vectors explicit SparseCSRIndex(const std::shared_ptr& indptr, @@ -108,7 +104,7 @@ class ARROW_EXPORT SparseTensorBase { public: virtual ~SparseTensorBase() = default; - virtual SparseIndex::format_type sparse_index_format_type_id() const = 0; + virtual SparseTensorFormat::type sparse_tensor_format_id() const = 0; std::shared_ptr type() const { return type_; } std::shared_ptr data() const { return data_; } @@ -146,6 +142,7 @@ class ARROW_EXPORT SparseTensorBase { /// These names are optional std::vector dim_names_; + }; // ---------------------------------------------------------------------- @@ -174,7 +171,7 @@ class ARROW_EXPORT SparseTensor : public SparseTensorBase { // Constructor with a dense tensor explicit SparseTensor(const Tensor& tensor); - SparseIndex::format_type sparse_index_format_type_id() const { return SparseIndexType::format_type_id; } + SparseTensorFormat::type sparse_tensor_format_id() const { return SparseIndexType::format_id; } /// Total number of non-zero cells in the sparse tensor int64_t length() const { return sparse_index_ ? sparse_index_->length() : 0; } diff --git a/cpp/src/arrow/sparse_tensor_format.h b/cpp/src/arrow/sparse_tensor_format.h new file mode 100644 index 00000000000..42392d68ee7 --- /dev/null +++ b/cpp/src/arrow/sparse_tensor_format.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_SPARSE_TENSOR_FORMAT_H +#define ARROW_SPARSE_TENSOR_FORMAT_H + +struct SparseTensorFormat { + enum type { + COO, + CSR + }; +}; + +#endif // ARROW_SPARSE_TENSOR_FORMAT_H From 1d9042709d89d420b84332f1e278076b8aea98bb Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Fri, 7 Dec 2018 15:20:08 +0900 Subject: [PATCH 17/40] Fix format --- cpp/src/arrow/sparse_tensor-test.cc | 9 ++++++--- cpp/src/arrow/sparse_tensor.cc | 28 +++++++++++++--------------- cpp/src/arrow/sparse_tensor.h | 12 +++++++----- cpp/src/arrow/sparse_tensor_format.h | 5 +---- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor-test.cc b/cpp/src/arrow/sparse_tensor-test.cc index f4e7edabeeb..64778ca33b6 100644 --- a/cpp/src/arrow/sparse_tensor-test.cc +++ b/cpp/src/arrow/sparse_tensor-test.cc @@ -92,7 +92,8 @@ TEST(TestSparseCOOTensor, CreationFromNumericTensor) { ASSERT_EQ(i + 11, ptr[i + 6]); } - std::shared_ptr si = std::dynamic_pointer_cast(st1.sparse_index()); + std::shared_ptr si = + std::dynamic_pointer_cast(st1.sparse_index()); std::shared_ptr sidx = si->indices(); ASSERT_EQ(std::vector({12, 3}), sidx->shape()); ASSERT_TRUE(sidx->is_column_major()); @@ -153,7 +154,8 @@ TEST(TestSparseCOOTensor, CreationFromTensor) { ASSERT_EQ(i + 11, ptr[i + 6]); } - std::shared_ptr si = std::dynamic_pointer_cast(st1.sparse_index()); + std::shared_ptr si = + std::dynamic_pointer_cast(st1.sparse_index()); std::shared_ptr sidx = si->indices(); ASSERT_EQ(std::vector({12, 3}), sidx->shape()); ASSERT_TRUE(sidx->is_column_major()); @@ -217,7 
+219,8 @@ TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) { ASSERT_EQ(i + 11, ptr[i + 6]); } - std::shared_ptr si = std::dynamic_pointer_cast(st1.sparse_index()); + std::shared_ptr si = + std::dynamic_pointer_cast(st1.sparse_index()); ASSERT_EQ(1, si->indptr()->ndim()); ASSERT_EQ(1, si->indices()->ndim()); diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index d93696233e7..a8d96c3bd62 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -35,9 +35,7 @@ class SparseTensorConverter { public: explicit SparseTensorConverter(const NumericTensor&) {} - Status Convert() { - return Status::Invalid("Unsupported sparse index"); - } + Status Convert() { return Status::Invalid("Unsupported sparse index"); } }; // ---------------------------------------------------------------------- @@ -366,47 +364,47 @@ SparseTensor::SparseTensor(const Tensor& tensor) switch (tensor.type()->id()) { case Type::UINT8: MakeSparseTensorFromTensor(tensor, &sparse_index_, - &data_); + &data_); return; case Type::INT8: MakeSparseTensorFromTensor(tensor, &sparse_index_, - &data_); + &data_); return; case Type::UINT16: MakeSparseTensorFromTensor(tensor, &sparse_index_, - &data_); + &data_); return; case Type::INT16: MakeSparseTensorFromTensor(tensor, &sparse_index_, - &data_); + &data_); return; case Type::UINT32: MakeSparseTensorFromTensor(tensor, &sparse_index_, - &data_); + &data_); return; case Type::INT32: MakeSparseTensorFromTensor(tensor, &sparse_index_, - &data_); + &data_); return; case Type::UINT64: MakeSparseTensorFromTensor(tensor, &sparse_index_, - &data_); + &data_); return; case Type::INT64: MakeSparseTensorFromTensor(tensor, &sparse_index_, - &data_); + &data_); return; case Type::HALF_FLOAT: - MakeSparseTensorFromTensor( - tensor, &sparse_index_, &data_); + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); return; case Type::FLOAT: MakeSparseTensorFromTensor(tensor, &sparse_index_, - &data_); + &data_); 
return; case Type::DOUBLE: MakeSparseTensorFromTensor(tensor, &sparse_index_, - &data_); + &data_); return; default: break; diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 0a63ad1afd3..cc55e734ec9 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -22,8 +22,8 @@ #include #include -#include "arrow/tensor.h" #include "arrow/sparse_tensor_format.h" +#include "arrow/tensor.h" namespace arrow { @@ -131,8 +131,9 @@ class ARROW_EXPORT SparseTensorBase { protected: // Constructor with all attributes - SparseTensorBase(const std::shared_ptr& type, const std::shared_ptr& data, - const std::vector& shape, const std::shared_ptr& sparse_index, + SparseTensorBase(const std::shared_ptr& type, + const std::shared_ptr& data, const std::vector& shape, + const std::shared_ptr& sparse_index, const std::vector& dim_names); std::shared_ptr type_; @@ -142,7 +143,6 @@ class ARROW_EXPORT SparseTensorBase { /// These names are optional std::vector dim_names_; - }; // ---------------------------------------------------------------------- @@ -171,7 +171,9 @@ class ARROW_EXPORT SparseTensor : public SparseTensorBase { // Constructor with a dense tensor explicit SparseTensor(const Tensor& tensor); - SparseTensorFormat::type sparse_tensor_format_id() const { return SparseIndexType::format_id; } + SparseTensorFormat::type sparse_tensor_format_id() const { + return SparseIndexType::format_id; + } /// Total number of non-zero cells in the sparse tensor int64_t length() const { return sparse_index_ ? 
sparse_index_->length() : 0; } diff --git a/cpp/src/arrow/sparse_tensor_format.h b/cpp/src/arrow/sparse_tensor_format.h index 42392d68ee7..24c1a190f50 100644 --- a/cpp/src/arrow/sparse_tensor_format.h +++ b/cpp/src/arrow/sparse_tensor_format.h @@ -19,10 +19,7 @@ #define ARROW_SPARSE_TENSOR_FORMAT_H struct SparseTensorFormat { - enum type { - COO, - CSR - }; + enum type { COO, CSR }; }; #endif // ARROW_SPARSE_TENSOR_FORMAT_H From 6bc9e296f5f213b579c55a35875b46eb5f85d454 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Fri, 7 Dec 2018 15:13:54 +0900 Subject: [PATCH 18/40] Support IPC read and write of SparseTensor --- cpp/src/arrow/ipc/message.cc | 2 + cpp/src/arrow/ipc/message.h | 2 +- cpp/src/arrow/ipc/metadata-internal.cc | 128 +++++++++++++++++++++++++ cpp/src/arrow/ipc/metadata-internal.h | 11 +++ cpp/src/arrow/ipc/read-write-test.cc | 110 +++++++++++++++++++++ cpp/src/arrow/ipc/reader.cc | 101 +++++++++++++++++++ cpp/src/arrow/ipc/reader.h | 25 +++++ cpp/src/arrow/ipc/writer.cc | 101 +++++++++++++++++++ cpp/src/arrow/ipc/writer.h | 15 +++ 9 files changed, 494 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 8adf4a8b660..23709a46192 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -63,6 +63,8 @@ class Message::MessageImpl { return Message::RECORD_BATCH; case flatbuf::MessageHeader_Tensor: return Message::TENSOR; + case flatbuf::MessageHeader_SparseTensor: + return Message::SPARSE_TENSOR; default: return Message::NONE; } diff --git a/cpp/src/arrow/ipc/message.h b/cpp/src/arrow/ipc/message.h index 092a19ff9a0..760012d1a68 100644 --- a/cpp/src/arrow/ipc/message.h +++ b/cpp/src/arrow/ipc/message.h @@ -70,7 +70,7 @@ constexpr int kMaxNestingDepth = 64; /// \brief An IPC message including metadata and body class ARROW_EXPORT Message { public: - enum Type { NONE, SCHEMA, DICTIONARY_BATCH, RECORD_BATCH, TENSOR }; + enum Type { NONE, SCHEMA, DICTIONARY_BATCH, RECORD_BATCH, TENSOR, 
SPARSE_TENSOR }; /// \brief Construct message, but do not validate /// diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 1d4c80c2946..a644f434c24 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -29,10 +29,12 @@ #include "arrow/ipc/File_generated.h" // IWYU pragma: keep #include "arrow/ipc/Message_generated.h" #include "arrow/ipc/Tensor_generated.h" // IWYU pragma: keep +#include "arrow/ipc/SparseTensor_generated.h" #include "arrow/ipc/message.h" #include "arrow/ipc/util.h" #include "arrow/status.h" #include "arrow/tensor.h" +#include "arrow/sparse_tensor.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" @@ -50,6 +52,7 @@ using DictionaryOffset = flatbuffers::Offset; using FieldOffset = flatbuffers::Offset; using KeyValueOffset = flatbuffers::Offset; using RecordBatchOffset = flatbuffers::Offset; +using SparseTensorOffset = flatbuffers::Offset; using Offset = flatbuffers::Offset; using FBString = flatbuffers::Offset; @@ -781,6 +784,85 @@ Status WriteTensorMessage(const Tensor& tensor, int64_t buffer_start_offset, body_length, out); } +Status MakeSparseTensorIndexCOO(FBB& fbb, const SparseCOOIndex& sparse_index, const std::vector& buffers, flatbuf::SparseTensorIndex* fb_sparse_index_type, Offset* fb_sparse_index, size_t* num_buffers) { + *fb_sparse_index_type = flatbuf::SparseTensorIndex_SparseTensorIndexCOO; + const BufferMetadata& indices_metadata = buffers[0]; + flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length); + *fb_sparse_index = flatbuf::CreateSparseTensorIndexCOO(fbb, &indices).Union(); + *num_buffers = 1; + return Status::OK(); +} + +Status MakeSparseMatrixIndexCSR(FBB& fbb, const SparseCSRIndex& sparse_index, const std::vector& buffers, flatbuf::SparseTensorIndex* fb_sparse_index_type, Offset* fb_sparse_index, size_t* num_buffers) { + *fb_sparse_index_type = 
flatbuf::SparseTensorIndex_SparseMatrixIndexCSR; + const BufferMetadata& indptr_metadata = buffers[0]; + const BufferMetadata& indices_metadata = buffers[1]; + flatbuf::Buffer indptr(indptr_metadata.offset, indptr_metadata.length); + flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length); + *fb_sparse_index = flatbuf::CreateSparseMatrixIndexCSR(fbb, &indptr, &indices).Union(); + *num_buffers = 2; + return Status::OK(); +} + +Status MakeSparseTensorIndex(FBB& fbb, const SparseIndex& sparse_index, const std::vector& buffers, flatbuf::SparseTensorIndex* fb_sparse_index_type, Offset* fb_sparse_index, size_t* num_buffers) { + switch (sparse_index.format_id()) { + case SparseTensorFormat::COO: + RETURN_NOT_OK(MakeSparseTensorIndexCOO(fbb, checked_cast(sparse_index), buffers, fb_sparse_index_type, fb_sparse_index, num_buffers)); + break; + + case SparseTensorFormat::CSR: + RETURN_NOT_OK(MakeSparseMatrixIndexCSR(fbb, checked_cast(sparse_index), buffers, fb_sparse_index_type, fb_sparse_index, num_buffers)); + break; + + default: + std::stringstream ss; + ss << "Unsupporoted sparse tensor format:: " << sparse_index.ToString() << std::endl; + return Status::NotImplemented(ss.str()); + } + + return Status::OK(); +} + +Status MakeSparseTensor(FBB& fbb, const SparseTensorBase& sparse_tensor, int64_t body_length, + const std::vector& buffers, + SparseTensorOffset* offset) { + flatbuf::Type fb_type_type; + Offset fb_type; + RETURN_NOT_OK(TensorTypeToFlatbuffer(fbb, *sparse_tensor.type(), &fb_type_type, &fb_type)); + + using TensorDimOffset = flatbuffers::Offset; + std::vector dims; + for (int i = 0; i < sparse_tensor.ndim(); ++i) { + FBString name = fbb.CreateString(sparse_tensor.dim_name(i)); + dims.push_back(flatbuf::CreateTensorDim(fbb, sparse_tensor.shape()[i], name)); + } + + auto fb_shape = fbb.CreateVector(dims); + + flatbuf::SparseTensorIndex fb_sparse_index_type; + Offset fb_sparse_index; + size_t num_index_buffers = 0; + 
RETURN_NOT_OK(MakeSparseTensorIndex(fbb, *sparse_tensor.sparse_index(), buffers, &fb_sparse_index_type, &fb_sparse_index, &num_index_buffers)); + + const BufferMetadata& data_metadata = buffers[num_index_buffers]; + flatbuf::Buffer data(data_metadata.offset, data_metadata.length); + + int64_t length = sparse_tensor.length(); + + *offset = flatbuf::CreateSparseTensor(fbb, fb_type_type, fb_type, fb_shape, length, fb_sparse_index_type, fb_sparse_index, &data); + + return Status::OK(); +} + +Status WriteSparseTensorMessage(const SparseTensorBase& sparse_tensor, int64_t body_length, + const std::vector& buffers, + std::shared_ptr* out) { + FBB fbb; + SparseTensorOffset fb_sparse_tensor; + RETURN_NOT_OK(MakeSparseTensor(fbb, sparse_tensor, body_length, buffers, &fb_sparse_tensor)); + return WriteFBMessage(fbb, flatbuf::MessageHeader_SparseTensor, fb_sparse_tensor.Union(), body_length, out); +} + Status WriteDictionaryMessage(int64_t id, int64_t length, int64_t body_length, const std::vector& nodes, const std::vector& buffers, @@ -933,6 +1015,52 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type return TypeFromFlatbuffer(tensor->type_type(), tensor->type(), {}, type); } +Status GetSparseTensorMetadata(const Buffer& metadata, + std::shared_ptr* type, std::vector* shape, + std::vector* dim_names, int64_t* length, + SparseTensorFormat::type* sparse_tensor_format_id) +{ + auto message = flatbuf::GetMessage(metadata.data()); + if (message->header_type() != flatbuf::MessageHeader_SparseTensor) { + return Status::IOError("Header of flatbuffer-encoded Message is not a SparseTensor."); + } + if (message->header() == nullptr) { + return Status::IOError("Header-pointer of flatbuffer-encoded Message is null."); + } + + auto sparse_tensor = reinterpret_cast(message->header()); + int ndim = static_cast(sparse_tensor->shape()->size()); + + for (int i = 0; i < ndim; ++i) { + auto dim = sparse_tensor->shape()->Get(i); + + shape->push_back(dim->size()); + auto fb_name =
dim->name(); + if (fb_name == 0) { + dim_names->push_back(""); + } else { + dim_names->push_back(fb_name->str()); + } + } + + *length = sparse_tensor->length(); + + switch (sparse_tensor->sparseIndex_type()) { + case flatbuf::SparseTensorIndex_SparseTensorIndexCOO: + *sparse_tensor_format_id = SparseTensorFormat::COO; + break; + + case flatbuf::SparseTensorIndex_SparseMatrixIndexCSR: + *sparse_tensor_format_id = SparseTensorFormat::CSR; + break; + + default: + return Status::Invalid("Unrecognized sparse index type"); + } + + return TypeFromFlatbuffer(sparse_tensor->type_type(), sparse_tensor->type(), {}, type); +} + // ---------------------------------------------------------------------- // Implement message writing diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index 152ca1367ec..621a9c66bf5 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -33,6 +33,7 @@ #include "arrow/ipc/dictionary.h" // IYWU pragma: keep #include "arrow/ipc/message.h" #include "arrow/memory_pool.h" +#include "arrow/sparse_tensor_format.h" #include "arrow/status.h" namespace arrow { @@ -40,6 +41,7 @@ namespace arrow { class DataType; class Schema; class Tensor; +class SparseTensorBase; namespace flatbuf = org::apache::arrow::flatbuf; @@ -103,6 +105,11 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type std::vector* shape, std::vector* strides, std::vector* dim_names); +Status GetSparseTensorMetadata(const Buffer& metadata, + std::shared_ptr* type, std::vector* shape, + std::vector* dim_names, int64_t* length, + SparseTensorFormat::type* sparse_tensor_format_id); + /// Write a serialized message metadata with a length-prefix and padding to an /// 8-byte offset. 
Does not make assumptions about whether the stream is /// aligned already @@ -137,6 +144,10 @@ Status WriteRecordBatchMessage(const int64_t length, const int64_t body_length, Status WriteTensorMessage(const Tensor& tensor, const int64_t buffer_start_offset, std::shared_ptr* out); +Status WriteSparseTensorMessage(const SparseTensorBase& sparse_tensor, int64_t body_length, + const std::vector& buffers, + std::shared_ptr* out); + Status WriteFileFooter(const Schema& schema, const std::vector& dictionaries, const std::vector& record_batches, DictionaryMemo* dictionary_memo, io::OutputStream* out); diff --git a/cpp/src/arrow/ipc/read-write-test.cc b/cpp/src/arrow/ipc/read-write-test.cc index 3a723badf37..14055013247 100644 --- a/cpp/src/arrow/ipc/read-write-test.cc +++ b/cpp/src/arrow/ipc/read-write-test.cc @@ -40,6 +40,7 @@ #include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/tensor.h" +#include "arrow/sparse_tensor.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/util/bit-util.h" @@ -844,6 +845,115 @@ TEST_F(TestTensorRoundTrip, NonContiguous) { CheckTensorRoundTrip(tensor); } +class TestSparseTensorRoundTrip : public ::testing::Test, public IpcTestFixture { + public: + void SetUp() { pool_ = default_memory_pool(); } + void TearDown() { io::MemoryMapFixture::TearDown(); } + + template + void CheckSparseTensorRoundTrip(const SparseTensor& tensor) { + GTEST_FAIL(); + } +}; + +template <> +void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( + const SparseTensor& tensor) { + + const auto& type = checked_cast(*tensor.type()); + const int elem_size = type.bit_width() / 8; + + int32_t metadata_length; + int64_t body_length; + + ASSERT_OK(mmap_->Seek(0)); + + ASSERT_OK(WriteSparseTensor(tensor, mmap_.get(), &metadata_length, &body_length, default_memory_pool())); + + const auto& sparse_index = checked_cast(*tensor.sparse_index()); + const int64_t indices_length = elem_size * sparse_index.indices()->size(); + const int64_t 
data_length = elem_size * tensor.length(); + const int64_t expected_body_length = indices_length + data_length; + ASSERT_EQ(expected_body_length, body_length); + + ASSERT_OK(mmap_->Seek(0)); + + std::shared_ptr result; + ASSERT_OK(ReadSparseTensor(mmap_.get(), &result)); + + const auto& resulted_sparse_index = checked_cast(*result->sparse_index()); + ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length); + ASSERT_EQ(result->data()->size(), data_length); + // TODO ASSERT_TRUE(sparse_tensor.Equals(*result)); +} + +template <> +void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( + const SparseTensor& tensor) { + + const auto& type = checked_cast(*tensor.type()); + const int elem_size = type.bit_width() / 8; + + int32_t metadata_length; + int64_t body_length; + + ASSERT_OK(mmap_->Seek(0)); + + ASSERT_OK(WriteSparseTensor(tensor, mmap_.get(), &metadata_length, &body_length, default_memory_pool())); + + const auto& sparse_index = checked_cast(*tensor.sparse_index()); + const int64_t indptr_length = elem_size * sparse_index.indptr()->size(); + const int64_t indices_length = elem_size * sparse_index.indices()->size(); + const int64_t data_length = elem_size * tensor.length(); + const int64_t expected_body_length = indptr_length + indices_length + data_length; + ASSERT_EQ(expected_body_length, body_length); + + ASSERT_OK(mmap_->Seek(0)); + + std::shared_ptr result; + ASSERT_OK(ReadSparseTensor(mmap_.get(), &result)); + + const auto& resulted_sparse_index = checked_cast(*result->sparse_index()); + ASSERT_EQ(resulted_sparse_index.indptr()->data()->size(), indptr_length); + ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length); + ASSERT_EQ(result->data()->size(), data_length); + // TODO ASSERT_TRUE(sparse_tensor.Equals(*result)); +} + +TEST_F(TestSparseTensorRoundTrip, WithSparseCOOIndex) { + std::string path = "test-write-sparse-coo-tensor"; + constexpr int64_t kBufferSize = 1 << 20; + 
ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &mmap_)); + + std::vector shape = {2, 3, 4}; + std::vector dim_names = {"foo", "bar", "baz"}; + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + + auto data = Buffer::Wrap(values); + NumericTensor t(data, shape, {}, dim_names); + SparseTensor st(t); + + CheckSparseTensorRoundTrip(st); +} + +TEST_F(TestSparseTensorRoundTrip, WithSparseCSRIndex) { + std::string path = "test-write-sparse-csr-matrix"; + constexpr int64_t kBufferSize = 1 << 20; + ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &mmap_)); + + std::vector shape = {4, 6}; + std::vector dim_names = {"foo", "bar", "baz"}; + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + + auto data = Buffer::Wrap(values); + NumericTensor t(data, shape, {}, dim_names); + SparseTensor st(t); + + CheckSparseTensorRoundTrip(st); +} + TEST(TestRecordBatchStreamReader, MalformedInput) { const std::string empty_str = ""; const std::string garbage_str = "12345678"; diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 59a322a6433..fabe6f81f01 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -40,6 +40,7 @@ #include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/tensor.h" +#include "arrow/sparse_tensor.h" #include "arrow/type.h" #include "arrow/util/logging.h" #include "arrow/visitor_inline.h" @@ -726,5 +727,105 @@ Status ReadTensor(const Message& message, std::shared_ptr* out) { return Status::OK(); } +namespace { + +Status ReadSparseCOOIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t ndim, int64_t length, io::RandomAccessFile* file, + std::shared_ptr* out) { + auto* sparse_index = sparse_tensor->sparseIndex_as_SparseTensorIndexCOO(); + auto* indices_buffer = sparse_index->indicesBuffer(); + std::shared_ptr indices_data; + 
RETURN_NOT_OK(file->ReadAt(indices_buffer->offset(), indices_buffer->length(), &indices_data)); + std::vector shape({length, ndim}); + const int64_t elsize = sizeof(int64_t); + std::vector strides({elsize, elsize * length}); + *out = std::make_shared( + std::make_shared(indices_data, shape, strides)); + return Status::OK(); +} + +Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t ndim, int64_t length, io::RandomAccessFile* file, + std::shared_ptr* out) { + auto* sparse_index = sparse_tensor->sparseIndex_as_SparseMatrixIndexCSR(); + + auto* indptr_buffer = sparse_index->indptrBuffer(); + std::shared_ptr indptr_data; + RETURN_NOT_OK(file->ReadAt(indptr_buffer->offset(), indptr_buffer->length(), &indptr_data)); + + auto* indices_buffer = sparse_index->indicesBuffer(); + std::shared_ptr indices_data; + RETURN_NOT_OK(file->ReadAt(indices_buffer->offset(), indices_buffer->length(), &indices_data)); + + std::vector indptr_shape({sparse_tensor->shape()->Get(0)->size() + 1}); + std::vector indices_shape({length}); + *out = std::make_shared( + std::make_shared(indptr_data, indptr_shape), + std::make_shared(indices_data, indices_shape)); + return Status::OK(); +} + +Status MakeSparseTensorWithSparseCOOIndex(const std::shared_ptr& type, const std::vector& shape, const std::vector& dim_names, const std::shared_ptr& sparse_index, int64_t length, const std::shared_ptr& data, std::shared_ptr* out) { + auto* sparse_tensor = new SparseTensor(sparse_index, type, data, shape, dim_names); + *out = std::shared_ptr(sparse_tensor); + return Status::OK(); +} + +Status MakeSparseTensorWithSparseCSRIndex(const std::shared_ptr& type, const std::vector& shape, const std::vector& dim_names, const std::shared_ptr& sparse_index, int64_t length, const std::shared_ptr& data, std::shared_ptr* out) { + auto* sparse_tensor = new SparseTensor(sparse_index, type, data, shape, dim_names); + *out = std::shared_ptr(sparse_tensor); + return Status::OK(); +} + +} // namespace + +Status ReadSparseTensor(const Buffer&
metadata, + io::RandomAccessFile* file, + std::shared_ptr* out) { + std::shared_ptr type; + std::vector shape; + std::vector dim_names; + int64_t length; + SparseTensorFormat::type sparse_tensor_format_id; + + RETURN_NOT_OK(internal::GetSparseTensorMetadata(metadata, &type, &shape, &dim_names, + &length, &sparse_tensor_format_id)); + + auto message = flatbuf::GetMessage(metadata.data()); + auto sparse_tensor = reinterpret_cast(message->header()); + const flatbuf::Buffer* buffer = sparse_tensor->data(); + DCHECK(BitUtil::IsMultipleOf8(buffer->offset())) + << "Buffer of sparse index data " + << "did not start on 8-byte aligned offset: " << buffer->offset(); + + std::shared_ptr data; + RETURN_NOT_OK(file->ReadAt(buffer->offset(), buffer->length(), &data)); + + std::shared_ptr sparse_index; + switch (sparse_tensor_format_id) { + case SparseTensorFormat::COO: + RETURN_NOT_OK(ReadSparseCOOIndex(sparse_tensor, shape.size(), length, file, &sparse_index)); + return MakeSparseTensorWithSparseCOOIndex(type, shape, dim_names, std::dynamic_pointer_cast(sparse_index), length, data, out); + + case SparseTensorFormat::CSR: + RETURN_NOT_OK(ReadSparseCSRIndex(sparse_tensor, shape.size(), length, file, &sparse_index)); + return MakeSparseTensorWithSparseCSRIndex(type, shape, dim_names, std::dynamic_pointer_cast(sparse_index), length, data, out); + + default: + return Status::Invalid("Unsupported sparse index format"); + } +} + +Status ReadSparseTensor(const Message& message, std::shared_ptr* out) { + io::BufferReader buffer_reader(message.body()); + return ReadSparseTensor(*message.metadata(), &buffer_reader, out); +} + +Status ReadSparseTensor(io::InputStream* file, std::shared_ptr* out) { + std::unique_ptr message; + RETURN_NOT_OK(ReadContiguousPayload(file, &message)); + DCHECK_EQ(message->type(), Message::SPARSE_TENSOR); + io::BufferReader buffer_reader(message->body()); + return ReadSparseTensor(*message->metadata(), &buffer_reader, out); +} + } // namespace ipc } // namespace 
arrow diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index 942664d6f22..ca15d9c5357 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -33,6 +33,7 @@ class Buffer; class Schema; class Status; class Tensor; +class SparseTensorBase; namespace io { @@ -235,6 +236,30 @@ Status ReadTensor(io::InputStream* file, std::shared_ptr* out); ARROW_EXPORT Status ReadTensor(const Message& message, std::shared_ptr* out); +/// \brief Read arrow::SparseTensor as encapsulated IPC message in file +/// +/// \param[in] file an InputStream pointed at the start of the message +/// \param[out] out the read sparse tensor +/// \return Status +ARROW_EXPORT +Status ReadSparseTensor(io::InputStream* file, std::shared_ptr* out); + +/// \brief EXPERIMENTAL: Read arrow::Tensor from IPC message +/// +/// \param[in] message a Message containing the tensor metadata and body +/// \param[out] out the read tensor +/// \return Status +ARROW_EXPORT +Status ReadTensor(const Message& message, std::shared_ptr* out); + +/// \brief EXPERIMENTAL: Read arrow::SparseTensor from IPC message +/// +/// \param[in] message a Message containing the tensor metadata and body +/// \param[out] out the read sparse tensor +/// \return Status +ARROW_EXPORT +Status ReadSparseTensor(const Message& message, std::shared_ptr* out); + } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 6ce72e070e7..b02962a2272 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "arrow/array.h" @@ -36,6 +37,7 @@ #include "arrow/status.h" #include "arrow/table.h" #include "arrow/tensor.h" +#include "arrow/sparse_tensor.h" #include "arrow/type.h" #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" @@ -671,6 +673,105 @@ Status GetTensorMessage(const Tensor& tensor, MemoryPool* pool, return Status::OK(); } +namespace internal { + 
+class SparseTensorSerializer { + public: + SparseTensorSerializer(int64_t buffer_start_offset, IpcPayload* out) + : out_(out), + buffer_start_offset_(buffer_start_offset) {} + + ~SparseTensorSerializer() = default; + + Status VisitSparseIndex(const SparseIndex& sparse_index) { + switch (sparse_index.format_id()) { + case SparseTensorFormat::COO: + RETURN_NOT_OK(VisitSparseCOOIndex(checked_cast(sparse_index))); + break; + + case SparseTensorFormat::CSR: + RETURN_NOT_OK(VisitSparseCSRIndex(checked_cast(sparse_index))); + break; + + default: + std::stringstream ss; + ss << "Unable to convert type: " << sparse_index.ToString() << std::endl; + return Status::NotImplemented(ss.str()); + } + + return Status::OK(); + } + + Status SerializeMetadata(const SparseTensorBase& sparse_tensor) { + return WriteSparseTensorMessage(sparse_tensor, out_->body_length, buffer_meta_, + &out_->metadata); + } + + Status Assemble(const SparseTensorBase& sparse_tensor) { + if (buffer_meta_.size() > 0) { + buffer_meta_.clear(); + out_->body_buffers.clear(); + } + + RETURN_NOT_OK(VisitSparseIndex(*sparse_tensor.sparse_index())); + out_->body_buffers.emplace_back(sparse_tensor.data()); + + int64_t offset = buffer_start_offset_; + buffer_meta_.reserve(out_->body_buffers.size()); + + for (size_t i = 0; i < out_->body_buffers.size(); ++i) { + const Buffer* buffer = out_->body_buffers[i].get(); + int64_t size = buffer->size(); + int64_t padding = BitUtil::RoundUpToMultipleOf8(size) - size; + buffer_meta_.push_back({offset, size + padding}); + offset += size + padding; + } + + out_->body_length = offset - buffer_start_offset_; + DCHECK(BitUtil::IsMultipleOf8(out_->body_length)); + + return SerializeMetadata(sparse_tensor); + } + + private: + Status VisitSparseCOOIndex(const SparseCOOIndex& sparse_index) { + out_->body_buffers.emplace_back(sparse_index.indices()->data()); + return Status::OK(); + } + + Status VisitSparseCSRIndex(const SparseCSRIndex& sparse_index) { + 
out_->body_buffers.emplace_back(sparse_index.indptr()->data()); + out_->body_buffers.emplace_back(sparse_index.indices()->data()); + return Status::OK(); + } + + IpcPayload* out_; + + std::vector buffer_meta_; + + int64_t buffer_start_offset_; +}; + + +Status GetSparseTensorPayload(const SparseTensorBase& sparse_tensor, MemoryPool* pool, + IpcPayload* out) { + SparseTensorSerializer writer(0, out); + return writer.Assemble(sparse_tensor); +} + +} // namespace internal + +Status WriteSparseTensor(const SparseTensorBase& sparse_tensor, + io::OutputStream* dst, int32_t* metadata_length, + int64_t* body_length, MemoryPool* pool) { + internal::IpcPayload payload; + internal::SparseTensorSerializer writer(0, &payload); + RETURN_NOT_OK(writer.Assemble(sparse_tensor)); + + *body_length = payload.body_length; + return internal::WriteIpcPayload(payload, dst, metadata_length); +} + Status WriteDictionary(int64_t dictionary_id, const std::shared_ptr& dictionary, int64_t buffer_start_offset, io::OutputStream* dst, int32_t* metadata_length, int64_t* body_length, MemoryPool* pool) { diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index a1c711146ef..85b8663ea49 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -36,6 +36,7 @@ class Schema; class Status; class Table; class Tensor; +class SparseTensorBase; namespace io { @@ -269,6 +270,20 @@ ARROW_EXPORT Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length, int64_t* body_length); +// \brief Write arrow::SparseTensor as a contiguous mesasge. The metadata, +// sparse index, and body are written assuming 64-byte alignment. It is the +// user's responsibility to ensure that the OutputStream has been aligned +// to a 64-byte multiple before writing the message. 
+// +// \param[in] sparse_tensor the SparseTensor to write +// \param[in] dst the OutputStream to write to +// \param[out] metadata_length the actual metadata length, including padding +// \param[out] body_length the actual message body length +ARROW_EXPORT +Status WriteSparseTensor(const SparseTensorBase& sparse_tensor, + io::OutputStream* dst, int32_t* metadata_length, + int64_t* body_length, MemoryPool* pool); + namespace internal { // These internal APIs may change without warning or deprecation From b3a62ebfa0c683d4c2215ec4bf4f55948ccec00c Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Fri, 7 Dec 2018 15:20:17 +0900 Subject: [PATCH 19/40] Fix format --- cpp/src/arrow/ipc/metadata-internal.cc | 60 +++++++++++++++++--------- cpp/src/arrow/ipc/metadata-internal.h | 7 +-- cpp/src/arrow/ipc/read-write-test.cc | 16 ++++--- cpp/src/arrow/ipc/reader.cc | 54 +++++++++++++++-------- cpp/src/arrow/ipc/writer.cc | 18 ++++---- cpp/src/arrow/ipc/writer.h | 6 +-- 6 files changed, 102 insertions(+), 59 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index a644f434c24..1b315d20746 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -28,13 +28,13 @@ #include "arrow/io/interfaces.h" #include "arrow/ipc/File_generated.h" // IWYU pragma: keep #include "arrow/ipc/Message_generated.h" -#include "arrow/ipc/Tensor_generated.h" // IWYU pragma: keep #include "arrow/ipc/SparseTensor_generated.h" +#include "arrow/ipc/Tensor_generated.h" // IWYU pragma: keep #include "arrow/ipc/message.h" #include "arrow/ipc/util.h" +#include "arrow/sparse_tensor.h" #include "arrow/status.h" #include "arrow/tensor.h" -#include "arrow/sparse_tensor.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" @@ -784,7 +784,10 @@ Status WriteTensorMessage(const Tensor& tensor, int64_t buffer_start_offset, body_length, out); } -Status MakeSparseTensorIndexCOO(FBB& fbb, const
SparseCOOIndex& sparse_index, const std::vector& buffers, flatbuf::SparseTensorIndex* fb_sparse_index_type, Offset* fb_sparse_index, size_t* num_buffers) { +Status MakeSparseTensorIndexCOO(FBB& fbb, const SparseCOOIndex& sparse_index, + const std::vector& buffers, + flatbuf::SparseTensorIndex* fb_sparse_index_type, + Offset* fb_sparse_index, size_t* num_buffers) { *fb_sparse_index_type = flatbuf::SparseTensorIndex_SparseTensorIndexCOO; const BufferMetadata& indices_metadata = buffers[0]; flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length); @@ -793,7 +796,10 @@ Status MakeSparseTensorIndexCOO(FBB& fbb, const SparseCOOIndex& sparse_index, co return Status::OK(); } -Status MakeSparseMatrixIndexCSR(FBB& fbb, const SparseCSRIndex& sparse_index, const std::vector& buffers, flatbuf::SparseTensorIndex* fb_sparse_index_type, Offset* fb_sparse_index, size_t* num_buffers) { +Status MakeSparseMatrixIndexCSR(FBB& fbb, const SparseCSRIndex& sparse_index, + const std::vector& buffers, + flatbuf::SparseTensorIndex* fb_sparse_index_type, + Offset* fb_sparse_index, size_t* num_buffers) { *fb_sparse_index_type = flatbuf::SparseTensorIndex_SparseMatrixIndexCSR; const BufferMetadata& indptr_metadata = buffers[0]; const BufferMetadata& indices_metadata = buffers[1]; @@ -804,31 +810,40 @@ Status MakeSparseMatrixIndexCSR(FBB& fbb, const SparseCSRIndex& sparse_index, co return Status::OK(); } -Status MakeSparseTensorIndex(FBB& fbb, const SparseIndex& sparse_index, const std::vector& buffers, flatbuf::SparseTensorIndex* fb_sparse_index_type, Offset* fb_sparse_index, size_t* num_buffers) { +Status MakeSparseTensorIndex(FBB& fbb, const SparseIndex& sparse_index, + const std::vector& buffers, + flatbuf::SparseTensorIndex* fb_sparse_index_type, + Offset* fb_sparse_index, size_t* num_buffers) { switch (sparse_index.format_id()) { case SparseTensorFormat::COO: - RETURN_NOT_OK(MakeSparseTensorIndexCOO(fbb, checked_cast(sparse_index), buffers, fb_sparse_index_type, 
fb_sparse_index, num_buffers)); + RETURN_NOT_OK(MakeSparseTensorIndexCOO( + fbb, checked_cast(sparse_index), buffers, + fb_sparse_index_type, fb_sparse_index, num_buffers)); break; case SparseTensorFormat::CSR: - RETURN_NOT_OK(MakeSparseMatrixIndexCSR(fbb, checked_cast(sparse_index), buffers, fb_sparse_index_type, fb_sparse_index, num_buffers)); + RETURN_NOT_OK(MakeSparseMatrixIndexCSR( + fbb, checked_cast(sparse_index), buffers, + fb_sparse_index_type, fb_sparse_index, num_buffers)); break; default: std::stringstream ss; - ss << "Unsupporoted sparse tensor format:: " << sparse_index.ToString() << std::endl; + ss << "Unsupporoted sparse tensor format:: " << sparse_index.ToString() + << std::endl; return Status::NotImplemented(ss.str()); } return Status::OK(); } -Status MakeSparseTensor(FBB& fbb, const SparseTensorBase& sparse_tensor, int64_t body_length, - const std::vector& buffers, +Status MakeSparseTensor(FBB& fbb, const SparseTensorBase& sparse_tensor, + int64_t body_length, const std::vector& buffers, SparseTensorOffset* offset) { flatbuf::Type fb_type_type; Offset fb_type; - RETURN_NOT_OK(TensorTypeToFlatbuffer(fbb, *sparse_tensor.type(), &fb_type_type, &fb_type)); + RETURN_NOT_OK( + TensorTypeToFlatbuffer(fbb, *sparse_tensor.type(), &fb_type_type, &fb_type)); using TensorDimOffset = flatbuffers::Offset; std::vector dims; @@ -842,25 +857,31 @@ Status MakeSparseTensor(FBB& fbb, const SparseTensorBase& sparse_tensor, int64_t flatbuf::SparseTensorIndex fb_sparse_index_type; Offset fb_sparse_index; size_t num_index_buffers = 0; - RETURN_NOT_OK(MakeSparseTensorIndex(fbb, *sparse_tensor.sparse_index(), buffers, &fb_sparse_index_type, &fb_sparse_index, &num_index_buffers)); + RETURN_NOT_OK(MakeSparseTensorIndex(fbb, *sparse_tensor.sparse_index(), buffers, + &fb_sparse_index_type, &fb_sparse_index, + &num_index_buffers)); const BufferMetadata& data_metadata = buffers[num_index_buffers]; flatbuf::Buffer data(data_metadata.offset, data_metadata.length); int64_t length 
= sparse_tensor.length(); - *offset = flatbuf::CreateSparseTensor(fbb, fb_type_type, fb_type, fb_shape, length, fb_sparse_index_type, fb_sparse_index, &data); + *offset = flatbuf::CreateSparseTensor(fbb, fb_type_type, fb_type, fb_shape, length, + fb_sparse_index_type, fb_sparse_index, &data); return Status::OK(); } -Status WriteSparseTensorMessage(const SparseTensorBase& sparse_tensor, int64_t body_length, +Status WriteSparseTensorMessage(const SparseTensorBase& sparse_tensor, + int64_t body_length, const std::vector& buffers, std::shared_ptr* out) { FBB fbb; SparseTensorOffset fb_sparse_tensor; - RETURN_NOT_OK(MakeSparseTensor(fbb, sparse_tensor, body_length, buffers, &fb_sparse_tensor)); - return WriteFBMessage(fbb, flatbuf::MessageHeader_SparseTensor, fb_sparse_tensor.Union(), body_length, out); + RETURN_NOT_OK( + MakeSparseTensor(fbb, sparse_tensor, body_length, buffers, &fb_sparse_tensor)); + return WriteFBMessage(fbb, flatbuf::MessageHeader_SparseTensor, + fb_sparse_tensor.Union(), body_length, out); } Status WriteDictionaryMessage(int64_t id, int64_t length, int64_t body_length, @@ -1015,11 +1036,10 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type return TypeFromFlatbuffer(tensor->type_type(), tensor->type(), {}, type); } -Status GetSparseTensorMetadata(const Buffer& metadata, - std::shared_ptr* type, std::vector* shape, +Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr* type, + std::vector* shape, std::vector* dim_names, int64_t* length, - SparseTensorFormat::type* sparse_tensor_format_id) -{ + SparseTensorFormat::type* sparse_tensor_format_id) { auto message = flatbuf::GetMessage(metadata.data()); if (message->header_type() != flatbuf::MessageHeader_SparseTensor) { DCHECK_EQ(message->header_type(), flatbuf::MessageHeader_SparseTensor); diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index 621a9c66bf5..420fca8a0bd 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ 
b/cpp/src/arrow/ipc/metadata-internal.h @@ -105,8 +105,8 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type std::vector* shape, std::vector* strides, std::vector* dim_names); -Status GetSparseTensorMetadata(const Buffer& metadata, - std::shared_ptr* type, std::vector* shape, +Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr* type, + std::vector* shape, std::vector* dim_names, int64_t* length, SparseTensorFormat::type* sparse_tensor_format_id); @@ -144,7 +144,8 @@ Status WriteRecordBatchMessage(const int64_t length, const int64_t body_length, Status WriteTensorMessage(const Tensor& tensor, const int64_t buffer_start_offset, std::shared_ptr* out); -Status WriteSparseTensorMessage(const SparseTensorBase& sparse_tensor, int64_t body_length, +Status WriteSparseTensorMessage(const SparseTensorBase& sparse_tensor, + int64_t body_length, const std::vector& buffers, std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/read-write-test.cc b/cpp/src/arrow/ipc/read-write-test.cc index 14055013247..c69d139a37a 100644 --- a/cpp/src/arrow/ipc/read-write-test.cc +++ b/cpp/src/arrow/ipc/read-write-test.cc @@ -38,9 +38,9 @@ #include "arrow/ipc/writer.h" #include "arrow/memory_pool.h" #include "arrow/record_batch.h" +#include "arrow/sparse_tensor.h" #include "arrow/status.h" #include "arrow/tensor.h" -#include "arrow/sparse_tensor.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/util/bit-util.h" @@ -859,7 +859,6 @@ class TestSparseTensorRoundTrip : public ::testing::Test, public IpcTestFixture template <> void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( const SparseTensor& tensor) { - const auto& type = checked_cast(*tensor.type()); const int elem_size = type.bit_width() / 8; @@ -868,7 +867,8 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( ASSERT_OK(mmap_->Seek(0)); - ASSERT_OK(WriteSparseTensor(tensor, mmap_.get(), &metadata_length, &body_length, default_memory_pool())); + 
ASSERT_OK(WriteSparseTensor(tensor, mmap_.get(), &metadata_length, &body_length, + default_memory_pool())); const auto& sparse_index = checked_cast(*tensor.sparse_index()); const int64_t indices_length = elem_size * sparse_index.indices()->size(); @@ -881,7 +881,8 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( std::shared_ptr result; ASSERT_OK(ReadSparseTensor(mmap_.get(), &result)); - const auto& resulted_sparse_index = checked_cast(*result->sparse_index()); + const auto& resulted_sparse_index = + checked_cast(*result->sparse_index()); ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length); ASSERT_EQ(result->data()->size(), data_length); // TODO ASSERT_TRUE(sparse_tensor.Equals(*result)); @@ -890,7 +891,6 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( template <> void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( const SparseTensor& tensor) { - const auto& type = checked_cast(*tensor.type()); const int elem_size = type.bit_width() / 8; @@ -899,7 +899,8 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( ASSERT_OK(mmap_->Seek(0)); - ASSERT_OK(WriteSparseTensor(tensor, mmap_.get(), &metadata_length, &body_length, default_memory_pool())); + ASSERT_OK(WriteSparseTensor(tensor, mmap_.get(), &metadata_length, &body_length, + default_memory_pool())); const auto& sparse_index = checked_cast(*tensor.sparse_index()); const int64_t indptr_length = elem_size * sparse_index.indptr()->size(); @@ -913,7 +914,8 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( std::shared_ptr result; ASSERT_OK(ReadSparseTensor(mmap_.get(), &result)); - const auto& resulted_sparse_index = checked_cast(*result->sparse_index()); + const auto& resulted_sparse_index = + checked_cast(*result->sparse_index()); ASSERT_EQ(resulted_sparse_index.indptr()->data()->size(), indptr_length); ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length); ASSERT_EQ(result->data()->size(), data_length); diff --git 
a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index fabe6f81f01..62faaef0475 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -38,9 +38,9 @@ #include "arrow/ipc/message.h" #include "arrow/ipc/metadata-internal.h" #include "arrow/record_batch.h" +#include "arrow/sparse_tensor.h" #include "arrow/status.h" #include "arrow/tensor.h" -#include "arrow/sparse_tensor.h" #include "arrow/type.h" #include "arrow/util/logging.h" #include "arrow/visitor_inline.h" @@ -729,12 +729,14 @@ Status ReadTensor(const Message& message, std::shared_ptr* out) { namespace { -Status ReadSparseCOOIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t ndim, int64_t length, io::RandomAccessFile* file, +Status ReadSparseCOOIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t ndim, + int64_t length, io::RandomAccessFile* file, std::shared_ptr* out) { auto* sparse_index = sparse_tensor->sparseIndex_as_SparseTensorIndexCOO(); auto* indices_buffer = sparse_index->indicesBuffer(); std::shared_ptr indices_data; - RETURN_NOT_OK(file->ReadAt(indices_buffer->offset(), indices_buffer->length(), &indices_data)); + RETURN_NOT_OK( + file->ReadAt(indices_buffer->offset(), indices_buffer->length(), &indices_data)); std::vector shape({length, ndim}); const int64_t elsize = sizeof(int64_t); std::vector strides({elsize, elsize * length}); @@ -743,19 +745,22 @@ Status ReadSparseCOOIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t nd return Status::OK(); } -Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t ndim, int64_t length, io::RandomAccessFile* file, +Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t ndim, + int64_t length, io::RandomAccessFile* file, std::shared_ptr* out) { auto* sparse_index = sparse_tensor->sparseIndex_as_SparseMatrixIndexCSR(); auto* indptr_buffer = sparse_index->indptrBuffer(); std::shared_ptr indptr_data; - RETURN_NOT_OK(file->ReadAt(indptr_buffer->offset(), 
indptr_buffer->length(), &indptr_data)); + RETURN_NOT_OK( + file->ReadAt(indptr_buffer->offset(), indptr_buffer->length(), &indptr_data)); auto* indices_buffer = sparse_index->indicesBuffer(); std::shared_ptr indices_data; - RETURN_NOT_OK(file->ReadAt(indices_buffer->offset(), indices_buffer->length(), &indices_data)); + RETURN_NOT_OK( + file->ReadAt(indices_buffer->offset(), indices_buffer->length(), &indices_data)); - std::vector indptr_shape({ndim+1}); + std::vector indptr_shape({ndim + 1}); std::vector indices_shape({length}); *out = std::make_shared( std::make_shared(indptr_data, indptr_shape), @@ -763,22 +768,31 @@ Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t nd return Status::OK(); } -Status MakeSparseTensorWithSparseCOOIndex(const std::shared_ptr& type, const std::vector& shape, const std::vector& dim_names, const std::shared_ptr& sparse_index, int64_t length, const std::shared_ptr& data, std::shared_ptr* out) { - auto* sparse_tensor = new SparseTensor(sparse_index, type, data, shape, dim_names); +Status MakeSparseTensorWithSparseCOOIndex( + const std::shared_ptr& type, const std::vector& shape, + const std::vector& dim_names, + const std::shared_ptr& sparse_index, int64_t length, + const std::shared_ptr& data, std::shared_ptr* out) { + auto* sparse_tensor = + new SparseTensor(sparse_index, type, data, shape, dim_names); *out = std::shared_ptr(sparse_tensor); return Status::OK(); } -Status MakeSparseTensorWithSparseCSRIndex(const std::shared_ptr& type, const std::vector& shape, const std::vector& dim_names, const std::shared_ptr& sparse_index, int64_t length, const std::shared_ptr& data, std::shared_ptr* out) { - auto* sparse_tensor = new SparseTensor(sparse_index, type, data, shape, dim_names); +Status MakeSparseTensorWithSparseCSRIndex( + const std::shared_ptr& type, const std::vector& shape, + const std::vector& dim_names, + const std::shared_ptr& sparse_index, int64_t length, + const std::shared_ptr& data, 
std::shared_ptr* out) { + auto* sparse_tensor = + new SparseTensor(sparse_index, type, data, shape, dim_names); *out = std::shared_ptr(sparse_tensor); return Status::OK(); } } // namespace -Status ReadSparseTensor(const Buffer& metadata, - io::RandomAccessFile* file, +Status ReadSparseTensor(const Buffer& metadata, io::RandomAccessFile* file, std::shared_ptr* out) { std::shared_ptr type; std::vector shape; @@ -802,12 +816,18 @@ Status ReadSparseTensor(const Buffer& metadata, std::shared_ptr sparse_index; switch (sparse_tensor_format_id) { case SparseTensorFormat::COO: - RETURN_NOT_OK(ReadSparseCOOIndex(sparse_tensor, shape.size(), length, file, &sparse_index)); - return MakeSparseTensorWithSparseCOOIndex(type, shape, dim_names, std::dynamic_pointer_cast(sparse_index), length, data, out); + RETURN_NOT_OK( + ReadSparseCOOIndex(sparse_tensor, shape.size(), length, file, &sparse_index)); + return MakeSparseTensorWithSparseCOOIndex( + type, shape, dim_names, std::dynamic_pointer_cast(sparse_index), + length, data, out); case SparseTensorFormat::CSR: - RETURN_NOT_OK(ReadSparseCSRIndex(sparse_tensor, shape.size(), length, file, &sparse_index)); - return MakeSparseTensorWithSparseCSRIndex(type, shape, dim_names, std::dynamic_pointer_cast(sparse_index), length, data, out); + RETURN_NOT_OK( + ReadSparseCSRIndex(sparse_tensor, shape.size(), length, file, &sparse_index)); + return MakeSparseTensorWithSparseCSRIndex( + type, shape, dim_names, std::dynamic_pointer_cast(sparse_index), + length, data, out); default: return Status::Invalid("Unsupported sparse index format"); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index b02962a2272..cd1d2773c0b 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -34,10 +34,10 @@ #include "arrow/ipc/util.h" #include "arrow/memory_pool.h" #include "arrow/record_batch.h" +#include "arrow/sparse_tensor.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/tensor.h" -#include 
"arrow/sparse_tensor.h" #include "arrow/type.h" #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" @@ -678,19 +678,20 @@ namespace internal { class SparseTensorSerializer { public: SparseTensorSerializer(int64_t buffer_start_offset, IpcPayload* out) - : out_(out), - buffer_start_offset_(buffer_start_offset) {} + : out_(out), buffer_start_offset_(buffer_start_offset) {} ~SparseTensorSerializer() = default; Status VisitSparseIndex(const SparseIndex& sparse_index) { switch (sparse_index.format_id()) { case SparseTensorFormat::COO: - RETURN_NOT_OK(VisitSparseCOOIndex(checked_cast(sparse_index))); + RETURN_NOT_OK( + VisitSparseCOOIndex(checked_cast(sparse_index))); break; case SparseTensorFormat::CSR: - RETURN_NOT_OK(VisitSparseCSRIndex(checked_cast(sparse_index))); + RETURN_NOT_OK( + VisitSparseCSRIndex(checked_cast(sparse_index))); break; default: @@ -752,7 +753,6 @@ class SparseTensorSerializer { int64_t buffer_start_offset_; }; - Status GetSparseTensorPayload(const SparseTensorBase& sparse_tensor, MemoryPool* pool, IpcPayload* out) { SparseTensorSerializer writer(0, out); @@ -761,9 +761,9 @@ Status GetSparseTensorPayload(const SparseTensorBase& sparse_tensor, MemoryPool* } // namespace internal -Status WriteSparseTensor(const SparseTensorBase& sparse_tensor, - io::OutputStream* dst, int32_t* metadata_length, - int64_t* body_length, MemoryPool* pool) { +Status WriteSparseTensor(const SparseTensorBase& sparse_tensor, io::OutputStream* dst, + int32_t* metadata_length, int64_t* body_length, + MemoryPool* pool) { internal::IpcPayload payload; internal::SparseTensorSerializer writer(0, &payload); RETURN_NOT_OK(writer.Assemble(sparse_tensor)); diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index 85b8663ea49..996f0ff7302 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -280,9 +280,9 @@ Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadat // \param[out] metadata_length the actual 
metadata length, including padding // \param[out] body_length the actual message body length ARROW_EXPORT -Status WriteSparseTensor(const SparseTensorBase& sparse_tensor, - io::OutputStream* dst, int32_t* metadata_length, - int64_t* body_length, MemoryPool* pool); +Status WriteSparseTensor(const SparseTensorBase& sparse_tensor, io::OutputStream* dst, + int32_t* metadata_length, int64_t* body_length, + MemoryPool* pool); namespace internal { From d6a8c380591d0e1573015d7ef6897d539b2549d0 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Fri, 7 Dec 2018 16:31:45 +0900 Subject: [PATCH 20/40] Unify Tensor.fbs and SparseTensor.fbs --- cpp/src/arrow/ipc/CMakeLists.txt | 1 - cpp/src/arrow/ipc/metadata-internal.cc | 1 - format/Message.fbs | 1 - format/SparseTensor.fbs | 114 ------------------------- format/Tensor.fbs | 96 +++++++++++++++++++++ 5 files changed, 96 insertions(+), 117 deletions(-) delete mode 100644 format/SparseTensor.fbs diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 07e333b6edd..422e72e2eda 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -64,7 +64,6 @@ set(FBS_SRC ${CMAKE_SOURCE_DIR}/../format/File.fbs ${CMAKE_SOURCE_DIR}/../format/Schema.fbs ${CMAKE_SOURCE_DIR}/../format/Tensor.fbs - ${CMAKE_SOURCE_DIR}/../format/SparseTensor.fbs ${CMAKE_CURRENT_SOURCE_DIR}/feather.fbs) foreach(FIL ${FBS_SRC}) diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 1b315d20746..7d301b1d4e0 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -28,7 +28,6 @@ #include "arrow/io/interfaces.h" #include "arrow/ipc/File_generated.h" // IWYU pragma: keep #include "arrow/ipc/Message_generated.h" -#include "arrow/ipc/SparseTensor_generated.h" #include "arrow/ipc/Tensor_generated.h" // IWYU pragma: keep #include "arrow/ipc/message.h" #include "arrow/ipc/util.h" diff --git a/format/Message.fbs b/format/Message.fbs 
index d7dcd7647fd..e14fdca8f15 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -17,7 +17,6 @@ include "Schema.fbs"; include "Tensor.fbs"; -include "SparseTensor.fbs"; namespace org.apache.arrow.flatbuf; diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs deleted file mode 100644 index 95666979bb5..00000000000 --- a/format/SparseTensor.fbs +++ /dev/null @@ -1,114 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -/// EXPERIMENTAL: Metadata for n-dimensional sparse tensors, that contains -/// only non-zero values. Arrow implementations in general are not required -/// to implement this type - -include "Tensor.fbs"; - -namespace org.apache.arrow.flatbuf; - -/// Coodinate format. -table SparseTensorIndexCOO { - /// COO's index list are represented as a NxM matrix, - /// where N is the number of non-zero values, - /// and M is the number of dimensions of a sparse tensor. - /// indicesBuffer stores the location and size of this index matrix. - /// The type of index value is long, so the stride for the index matrix is unnecessary. 
- /// - /// For example, let X be a 2x3x4x5 tensor, and it has the following 6 non-zero values: - /// - /// X[0, 1, 2, 0] := 1 - /// X[1, 1, 2, 3] := 2 - /// X[0, 2, 1, 0] := 3 - /// X[0, 1, 3, 0] := 4 - /// X[0, 1, 2, 1] := 5 - /// X[1, 2, 0, 4] := 6 - /// - /// In COO format, the index matrix of X is the following 4x10 matrix: - /// - /// [[0, 0, 0, 0, 1, 1], - /// [1, 1, 1, 2, 1, 2], - /// [2, 2, 3, 1, 2, 0], - /// [0, 1, 0, 0, 3, 4]] - /// - /// Note that the indices are sorted in lexcographical order. - indicesBuffer: Buffer; -} - -/// Compressed Sparse Row format, that is matrix-specific. -table SparseMatrixIndexCSR { - /// indptrBuffer stores the location and size of indptr array that - /// represents the range of the rows. - /// The i-th row spans from indptr[i] to indptr[i+1] in the data. - /// The length of this array is 1 + (the number of rows), and the type - /// of index value is long. - /// - /// For example, let X be the following 6x4 matrix: - /// - /// X := [[0, 1, 2, 0], - /// [0, 0, 3, 0], - /// [0, 4, 0, 5], - /// [0, 0, 0, 0], - /// [6, 0, 7, 8], - /// [0, 9, 0, 0]]. - /// - /// The array of non-zero values in X is: - /// - /// values(X) = [1, 2, 3, 4, 5, 6, 7, 8, 9]. - /// - /// And the indptr of X is: - /// - /// indptr(X) = [0, 2, 3, 5, 5, 8, 10]. - indptrBuffer: Buffer; - - /// indicesBuffer stores the location and size of the array that - /// contains the column indices of the corresponding non-zero values. - /// The type of index value is long. - /// - /// For example, the indices of the above X is: - /// - /// indices(X) = [1, 2, 2, 1, 3, 0, 2, 3, 1]. - indicesBuffer: Buffer; -} - -union SparseTensorIndex { - SparseTensorIndexCOO, - SparseMatrixIndexCSR -} - -table SparseTensor { - /// The type of data contained in a value cell. - /// Currently only fixed-width value types are supported, - /// no strings or nested types. - type: Type; - - /// The dimensions of the tensor, optionally named. 
- shape: [TensorDim]; - - /// The number of non-zero values in a sparse tensor. - length: long; - - /// Sparse tensor index - sparseIndex: SparseTensorIndex; - - /// The location and size of the tensor's data - data: Buffer; -} - -root_type SparseTensor; diff --git a/format/Tensor.fbs b/format/Tensor.fbs index 18b614c3bde..74000f90259 100644 --- a/format/Tensor.fbs +++ b/format/Tensor.fbs @@ -23,6 +23,9 @@ include "Schema.fbs"; namespace org.apache.arrow.flatbuf; +/// ---------------------------------------------------------------------- +/// Data structures for dense tensors + /// Shape data for a single axis in a tensor table TensorDim { /// Length of dimension @@ -48,3 +51,96 @@ table Tensor { } root_type Tensor; + +/// ---------------------------------------------------------------------- +/// Data structures for sparse tensors + +/// Coordinate format of sparse tensor index. +table SparseTensorIndexCOO { + /// COO's index list are represented as a NxM matrix, + /// where N is the number of non-zero values, + /// and M is the number of dimensions of a sparse tensor. + /// indicesBuffer stores the location and size of this index matrix. + /// The type of index value is long, so the stride for the index matrix is unnecessary. + /// + /// For example, let X be a 2x3x4x5 tensor, and it has the following 6 non-zero values: + /// + /// X[0, 1, 2, 0] := 1 + /// X[1, 1, 2, 3] := 2 + /// X[0, 2, 1, 0] := 3 + /// X[0, 1, 3, 0] := 4 + /// X[0, 1, 2, 1] := 5 + /// X[1, 2, 0, 4] := 6 + /// + /// In COO format, the index matrix of X is the following 4x6 matrix: + /// + /// [[0, 0, 0, 0, 1, 1], + /// [1, 1, 1, 2, 1, 2], + /// [2, 2, 3, 1, 2, 0], + /// [0, 1, 0, 0, 3, 4]] + /// + /// Note that the indices are sorted in lexicographical order. + indicesBuffer: Buffer; +} + +/// Compressed Sparse Row format, that is matrix-specific. +table SparseMatrixIndexCSR { + /// indptrBuffer stores the location and size of indptr array that + /// represents the range of the rows.
+ /// The i-th row spans from indptr[i] to indptr[i+1] in the data. + /// The length of this array is 1 + (the number of rows), and the type + /// of index value is long. + /// + /// For example, let X be the following 6x4 matrix: + /// + /// X := [[0, 1, 2, 0], + /// [0, 0, 3, 0], + /// [0, 4, 0, 5], + /// [0, 0, 0, 0], + /// [6, 0, 7, 8], + /// [0, 9, 0, 0]]. + /// + /// The array of non-zero values in X is: + /// + /// values(X) = [1, 2, 3, 4, 5, 6, 7, 8, 9]. + /// + /// And the indptr of X is: + /// + /// indptr(X) = [0, 2, 3, 5, 5, 8, 9]. + indptrBuffer: Buffer; + + /// indicesBuffer stores the location and size of the array that + /// contains the column indices of the corresponding non-zero values. + /// The type of index value is long. + /// + /// For example, the indices of the above X are: + /// + /// indices(X) = [1, 2, 2, 1, 3, 0, 2, 3, 1]. + indicesBuffer: Buffer; +} + +union SparseTensorIndex { + SparseTensorIndexCOO, + SparseMatrixIndexCSR +} + +table SparseTensor { + /// The type of data contained in a value cell. + /// Currently only fixed-width value types are supported, + /// no strings or nested types. + type: Type; + + /// The dimensions of the tensor, optionally named. + shape: [TensorDim]; + + /// The number of non-zero values in a sparse tensor.
+ length: long; + + /// Sparse tensor index + sparseIndex: SparseTensorIndex; + + /// The location and size of the tensor's data + data: Buffer; +} + +root_type SparseTensor; From 3b1db7d32644e5057728b39e2bff3bac0a8c13e8 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Fri, 7 Dec 2018 22:41:02 +0900 Subject: [PATCH 21/40] Add SparseTensorBase::Equals --- cpp/src/arrow/compare.cc | 90 ++++++++++++++++++++++++++++ cpp/src/arrow/compare.h | 4 ++ cpp/src/arrow/ipc/read-write-test.cc | 4 +- cpp/src/arrow/sparse_tensor.cc | 5 ++ cpp/src/arrow/sparse_tensor.h | 10 ++++ 5 files changed, 111 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index efc8ad82faf..86bf87b41b2 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -30,6 +30,7 @@ #include "arrow/array.h" #include "arrow/buffer.h" +#include "arrow/sparse_tensor.h" #include "arrow/status.h" #include "arrow/tensor.h" #include "arrow/type.h" @@ -782,6 +783,95 @@ bool TensorEquals(const Tensor& left, const Tensor& right) { return are_equal; } +namespace { + +template +struct SparseTensorEqualsImpl { + static bool compare(const SparseTensor& left, + const SparseTensor& right) { + // TODO(mrkn): should we support the equality among different formats? 
+ return false; + } +}; + +template +struct SparseTensorEqualsImpl { + static bool compare(const SparseTensor& left, + const SparseTensor& right) { + DCHECK(left.type()->id() == right.type()->id()); + DCHECK(left.shape() == right.shape()); + DCHECK(left.length() == right.length()); + + const auto& left_index = checked_cast(*left.sparse_index()); + const auto& right_index = checked_cast(*right.sparse_index()); + + if (!left_index.Equals(right_index)) { + return false; + } + + const auto& size_meta = dynamic_cast(*left.type()); + const int byte_width = size_meta.bit_width() / CHAR_BIT; + DCHECK_GT(byte_width, 0); + + const uint8_t* left_data = left.data()->data(); + const uint8_t* right_data = right.data()->data(); + + return memcmp(left_data, right_data, static_cast(byte_width * left.length())); + } +}; + +template +inline bool SparseTensorEqualsImplDispatch(const SparseTensor& left, + const SparseTensorBase& right) { + switch (right.sparse_tensor_format_id()) { + case SparseTensorFormat::COO: { + const auto& right_coo = checked_cast&>(right); + return SparseTensorEqualsImpl::compare(left, + right_coo); + } + + case SparseTensorFormat::CSR: { + const auto& right_csr = checked_cast&>(right); + return SparseTensorEqualsImpl::compare(left, + right_csr); + } + + default: + return false; + } +} + +} // namespace + +bool SparseTensorEquals(const SparseTensorBase& left, const SparseTensorBase& right) { + if (&left == &right) { + return true; + } else if (left.type()->id() != right.type()->id()) { + return false; + } else if (left.size() == 0) { + return true; + } else if (left.shape() != right.shape()) { + return false; + } else if (left.length() != right.length()) { + return false; + } + + switch (left.sparse_tensor_format_id()) { + case SparseTensorFormat::COO: { + const auto& left_coo = checked_cast&>(left); + return SparseTensorEqualsImplDispatch(left_coo, right); + } + + case SparseTensorFormat::CSR: { + const auto& left_csr = checked_cast&>(left); + return 
SparseTensorEqualsImplDispatch(left_csr, right); + } + + default: + return false; + } +} + bool TypeEquals(const DataType& left, const DataType& right) { bool are_equal; // The arrays are the same object diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 21e2fdc24f1..bc4877dcbd4 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -29,12 +29,16 @@ namespace arrow { class Array; class DataType; class Tensor; +class SparseTensorBase; /// Returns true if the arrays are exactly equal bool ARROW_EXPORT ArrayEquals(const Array& left, const Array& right); bool ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right); +bool ARROW_EXPORT SparseTensorEquals(const SparseTensorBase& left, + const SparseTensorBase& right); + /// Returns true if the arrays are approximately equal. For non-floating point /// types, this is equivalent to ArrayEquals(left, right) bool ARROW_EXPORT ArrayApproxEquals(const Array& left, const Array& right); diff --git a/cpp/src/arrow/ipc/read-write-test.cc b/cpp/src/arrow/ipc/read-write-test.cc index c69d139a37a..79a84a8497f 100644 --- a/cpp/src/arrow/ipc/read-write-test.cc +++ b/cpp/src/arrow/ipc/read-write-test.cc @@ -885,7 +885,7 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( checked_cast(*result->sparse_index()); ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length); ASSERT_EQ(result->data()->size(), data_length); - // TODO ASSERT_TRUE(sparse_tensor.Equals(*result)); + ASSERT_TRUE(result->Equals(*result)); } template <> @@ -919,7 +919,7 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( ASSERT_EQ(resulted_sparse_index.indptr()->data()->size(), indptr_length); ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length); ASSERT_EQ(result->data()->size(), data_length); - // TODO ASSERT_TRUE(sparse_tensor.Equals(*result)); + ASSERT_TRUE(result->Equals(*result)); } TEST_F(TestSparseTensorRoundTrip, WithSparseCOOIndex) { diff --git 
a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index a8d96c3bd62..844be035fd7 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -21,6 +21,7 @@ #include #include +#include "arrow/compare.h" #include "arrow/util/logging.h" namespace arrow { @@ -336,6 +337,10 @@ int64_t SparseTensorBase::size() const { return std::accumulate(shape_.begin(), shape_.end(), 1LL, std::multiplies()); } +bool SparseTensorBase::Equals(const SparseTensorBase& other) const { + return SparseTensorEquals(*this, other); +} + // ---------------------------------------------------------------------- // SparseTensor diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index cc55e734ec9..e25703a47e0 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -70,6 +70,10 @@ class ARROW_EXPORT SparseCOOIndex : public SparseIndexBase { std::string ToString() const override; + bool Equals(const SparseCOOIndex& other) const { + return indices()->Equals(*other.indices()); + } + protected: std::shared_ptr coords_; }; @@ -92,6 +96,10 @@ class ARROW_EXPORT SparseCSRIndex : public SparseIndexBase { std::string ToString() const override; + bool Equals(const SparseCSRIndex& other) const { + return indptr()->Equals(*other.indptr()) && indices()->Equals(*other.indices()); + } + protected: std::shared_ptr indptr_; std::shared_ptr indices_; @@ -129,6 +137,8 @@ class ARROW_EXPORT SparseTensorBase { /// Total number of non-zero cells in the sparse tensor virtual int64_t length() const = 0; + bool Equals(const SparseTensorBase& other) const; + protected: // Constructor with all attributes SparseTensorBase(const std::shared_ptr& type, From 9e457acd392ded97402afa97fcea85c807cb9886 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Sun, 9 Dec 2018 10:08:44 +0900 Subject: [PATCH 22/40] Remove needless virtual specifiers --- cpp/src/arrow/sparse_tensor.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git 
a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index e25703a47e0..95da9cc1a35 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -112,7 +112,7 @@ class ARROW_EXPORT SparseTensorBase { public: virtual ~SparseTensorBase() = default; - virtual SparseTensorFormat::type sparse_tensor_format_id() const = 0; + SparseTensorFormat::type sparse_tensor_format_id() const { return sparse_index_->format_id(); } std::shared_ptr type() const { return type_; } std::shared_ptr data() const { return data_; } @@ -135,7 +135,7 @@ class ARROW_EXPORT SparseTensorBase { bool is_mutable() const { return data_->is_mutable(); } /// Total number of non-zero cells in the sparse tensor - virtual int64_t length() const = 0; + int64_t length() const { return sparse_index_ ? sparse_index_->length() : 0; } bool Equals(const SparseTensorBase& other) const; @@ -181,13 +181,6 @@ class ARROW_EXPORT SparseTensor : public SparseTensorBase { // Constructor with a dense tensor explicit SparseTensor(const Tensor& tensor); - SparseTensorFormat::type sparse_tensor_format_id() const { - return SparseIndexType::format_id; - } - - /// Total number of non-zero cells in the sparse tensor - int64_t length() const { return sparse_index_ ? 
sparse_index_->length() : 0; } - private: ARROW_DISALLOW_COPY_AND_ASSIGN(SparseTensor); }; From 401ae8023a74058e26ae795b6c490e8e02156f64 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Sun, 9 Dec 2018 10:22:00 +0900 Subject: [PATCH 23/40] Fix SparseCSRIndex::ToString and add tests --- cpp/src/arrow/sparse_tensor-test.cc | 28 ++++++++++++++-------------- cpp/src/arrow/sparse_tensor.cc | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor-test.cc b/cpp/src/arrow/sparse_tensor-test.cc index 64778ca33b6..86323e68b5a 100644 --- a/cpp/src/arrow/sparse_tensor-test.cc +++ b/cpp/src/arrow/sparse_tensor-test.cc @@ -92,9 +92,10 @@ TEST(TestSparseCOOTensor, CreationFromNumericTensor) { ASSERT_EQ(i + 11, ptr[i + 6]); } - std::shared_ptr si = - std::dynamic_pointer_cast(st1.sparse_index()); - std::shared_ptr sidx = si->indices(); + const auto& si = internal::checked_cast(*st1.sparse_index()); + ASSERT_EQ(std::string("SparseCOOIndex"), si.ToString()); + + std::shared_ptr sidx = si.indices(); ASSERT_EQ(std::vector({12, 3}), sidx->shape()); ASSERT_TRUE(sidx->is_column_major()); @@ -154,9 +155,8 @@ TEST(TestSparseCOOTensor, CreationFromTensor) { ASSERT_EQ(i + 11, ptr[i + 6]); } - std::shared_ptr si = - std::dynamic_pointer_cast(st1.sparse_index()); - std::shared_ptr sidx = si->indices(); + const auto& si = internal::checked_cast(*st1.sparse_index()); + std::shared_ptr sidx = si.indices(); ASSERT_EQ(std::vector({12, 3}), sidx->shape()); ASSERT_TRUE(sidx->is_column_major()); @@ -219,24 +219,24 @@ TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) { ASSERT_EQ(i + 11, ptr[i + 6]); } - std::shared_ptr si = - std::dynamic_pointer_cast(st1.sparse_index()); + const auto& si = internal::checked_cast(*st1.sparse_index()); - ASSERT_EQ(1, si->indptr()->ndim()); - ASSERT_EQ(1, si->indices()->ndim()); + ASSERT_EQ(std::string("SparseCSRIndex"), si.ToString()); + ASSERT_EQ(1, si.indptr()->ndim()); + ASSERT_EQ(1, si.indices()->ndim()); const 
int64_t* indptr_begin = - reinterpret_cast(si->indptr()->raw_data()); + reinterpret_cast(si.indptr()->raw_data()); std::vector indptr_values(indptr_begin, - indptr_begin + si->indptr()->shape()[0]); + indptr_begin + si.indptr()->shape()[0]); ASSERT_EQ(7, indptr_values.size()); ASSERT_EQ(std::vector({0, 2, 4, 6, 8, 10, 12}), indptr_values); const int64_t* indices_begin = - reinterpret_cast(si->indices()->raw_data()); + reinterpret_cast(si.indices()->raw_data()); std::vector indices_values(indices_begin, - indices_begin + si->indices()->shape()[0]); + indices_begin + si.indices()->shape()[0]); ASSERT_EQ(12, indices_values.size()); ASSERT_EQ(std::vector({0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}), indices_values); diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 844be035fd7..964a6c6f29e 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -304,7 +304,7 @@ SparseCSRIndex::SparseCSRIndex(const std::shared_ptr& indptr, DCHECK_EQ(1, indices_->ndim()); } -std::string SparseCSRIndex::ToString() const { return std::string("SparseCOOIndex"); } +std::string SparseCSRIndex::ToString() const { return std::string("SparseCSRIndex"); } // ---------------------------------------------------------------------- // SparseTensorBase From 99b1d1d4d9b38d87557b6135763c8362542fd69b Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Sun, 9 Dec 2018 08:47:48 +0900 Subject: [PATCH 24/40] Add missing ARROW_EXPORT specifiers --- cpp/src/arrow/sparse_tensor.cc | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 964a6c6f29e..dadb29e156c 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -421,17 +421,17 @@ SparseTensor::SparseTensor(const Tensor& tensor) #define INSTANTIATE_SPARSE_TENSOR(IndexType) \ template class ARROW_TEMPLATE_EXPORT SparseTensor; \ - template SparseTensor::SparseTensor(const 
NumericTensor&); \ - template SparseTensor::SparseTensor(const NumericTensor&); \ - template SparseTensor::SparseTensor(const NumericTensor&); \ - template SparseTensor::SparseTensor(const NumericTensor&); \ - template SparseTensor::SparseTensor(const NumericTensor&); \ - template SparseTensor::SparseTensor(const NumericTensor&); \ - template SparseTensor::SparseTensor(const NumericTensor&); \ - template SparseTensor::SparseTensor(const NumericTensor&); \ - template SparseTensor::SparseTensor(const NumericTensor&); \ - template SparseTensor::SparseTensor(const NumericTensor&); \ - template SparseTensor::SparseTensor(const NumericTensor&) + template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&) INSTANTIATE_SPARSE_TENSOR(SparseCOOIndex); INSTANTIATE_SPARSE_TENSOR(SparseCSRIndex); From 43d8eea44860da31530b964638626c89128cd208 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Sun, 9 Dec 2018 10:29:27 +0900 Subject: [PATCH 25/40] Fix coding style --- cpp/src/arrow/sparse_tensor-test.cc | 3 +-- cpp/src/arrow/sparse_tensor.cc | 37 +++++++++++++++++++---------- cpp/src/arrow/sparse_tensor.h | 4 +++- 3 files changed, 28 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor-test.cc 
b/cpp/src/arrow/sparse_tensor-test.cc index 86323e68b5a..9c648b85caa 100644 --- a/cpp/src/arrow/sparse_tensor-test.cc +++ b/cpp/src/arrow/sparse_tensor-test.cc @@ -225,8 +225,7 @@ TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) { ASSERT_EQ(1, si.indptr()->ndim()); ASSERT_EQ(1, si.indices()->ndim()); - const int64_t* indptr_begin = - reinterpret_cast(si.indptr()->raw_data()); + const int64_t* indptr_begin = reinterpret_cast(si.indptr()->raw_data()); std::vector indptr_values(indptr_begin, indptr_begin + si.indptr()->shape()[0]); diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index dadb29e156c..896750c9ca5 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -419,19 +419,30 @@ SparseTensor::SparseTensor(const Tensor& tensor) // ---------------------------------------------------------------------- // Instantiate templates -#define INSTANTIATE_SPARSE_TENSOR(IndexType) \ - template class ARROW_TEMPLATE_EXPORT SparseTensor; \ - template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor(const NumericTensor&) +#define INSTANTIATE_SPARSE_TENSOR(IndexType) \ + template class ARROW_TEMPLATE_EXPORT SparseTensor; \ + template ARROW_EXPORT SparseTensor::SparseTensor( \ + 
const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensor::SparseTensor( \ + const NumericTensor&) INSTANTIATE_SPARSE_TENSOR(SparseCOOIndex); INSTANTIATE_SPARSE_TENSOR(SparseCSRIndex); diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 95da9cc1a35..db3d5cc14d7 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -112,7 +112,9 @@ class ARROW_EXPORT SparseTensorBase { public: virtual ~SparseTensorBase() = default; - SparseTensorFormat::type sparse_tensor_format_id() const { return sparse_index_->format_id(); } + SparseTensorFormat::type sparse_tensor_format_id() const { + return sparse_index_->format_id(); + } std::shared_ptr type() const { return type_; } std::shared_ptr data() const { return data_; } From 357860d8c7c491d6b35ff0e19c4284525870da43 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Thu, 13 Dec 2018 09:48:25 +0900 Subject: [PATCH 26/40] Fix typo in comments --- format/Tensor.fbs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/format/Tensor.fbs b/format/Tensor.fbs index 74000f90259..936e4c56919 100644 --- a/format/Tensor.fbs +++ b/format/Tensor.fbs @@ -72,7 +72,7 @@ table SparseTensorIndexCOO { /// X[0, 1, 2, 1] := 5 /// X[1, 2, 0, 4] := 6 /// - /// In COO 
format, the index matrix of X is the following 4x10 matrix: + /// In COO format, the index matrix of X is the following 4x6 matrix: /// /// [[0, 0, 0, 0, 1, 1], /// [1, 1, 1, 2, 1, 2], From 7e814de365b92ee0900c569fdfe26f1983b6b014 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Thu, 13 Dec 2018 09:51:03 +0900 Subject: [PATCH 27/40] Put EXPERIMENTAL markn in comments --- cpp/src/arrow/ipc/reader.h | 2 +- cpp/src/arrow/ipc/writer.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index ca15d9c5357..6668418e7ca 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -236,7 +236,7 @@ Status ReadTensor(io::InputStream* file, std::shared_ptr* out); ARROW_EXPORT Status ReadTensor(const Message& message, std::shared_ptr* out); -/// \brief Read arrow::SparseTensor as encapsulated IPC message in file +/// \brief EXPERIMETNAL: Read arrow::SparseTensor as encapsulated IPC message in file /// /// \param[in] file an InputStream pointed at the start of the message /// \param[out] out the read sparse tensor diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index 996f0ff7302..1a39dfc49d5 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -270,7 +270,7 @@ ARROW_EXPORT Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length, int64_t* body_length); -// \brief Write arrow::SparseTensor as a contiguous mesasge. The metadata, +// \brief EXPERIMETNAL: Write arrow::SparseTensor as a contiguous mesasge. The metadata, // sparse index, and body are written assuming 64-byte alignment. It is the // user's responsibility to ensure that the OutputStream has been aligned // to a 64-byte multiple before writing the message. 
From f78230344de9567e245422548b2105da81b56ade Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Thu, 13 Dec 2018 10:01:26 +0900 Subject: [PATCH 28/40] Return Status::IOError instead of DCHECK if message header type is not matched --- cpp/src/arrow/ipc/metadata-internal.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 7d301b1d4e0..6964aa36730 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -1041,7 +1041,7 @@ Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr SparseTensorFormat::type* sparse_tensor_format_id) { auto message = flatbuf::GetMessage(metadata.data()); if (message->header_type() != flatbuf::MessageHeader_SparseTensor) { - DCHECK_EQ(message->header_type(), flatbuf::MessageHeader_SparseTensor); + return Status::IOError("Header of flatbuffer-encoded Message is not SparseTensor."); } if (message->header() == nullptr) { return Status::IOError("Header-pointer of flatbuffer-encoded Message is null."); From ff3ea71c5a0b816251d8c786ac680f5c015368fb Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Thu, 13 Dec 2018 10:23:41 +0900 Subject: [PATCH 29/40] Rename length to non_zero_length in SparseTensor --- cpp/src/arrow/compare.cc | 6 +++--- cpp/src/arrow/ipc/metadata-internal.cc | 8 ++++---- cpp/src/arrow/ipc/read-write-test.cc | 4 ++-- cpp/src/arrow/ipc/reader.cc | 26 +++++++++++++------------- cpp/src/arrow/sparse_tensor-test.cc | 10 +++++----- cpp/src/arrow/sparse_tensor.h | 14 +++++++------- format/Tensor.fbs | 2 +- 7 files changed, 35 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 86bf87b41b2..1d7a2cc6bb2 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -800,7 +800,7 @@ struct SparseTensorEqualsImpl { const SparseTensor& right) { DCHECK(left.type()->id() == right.type()->id()); DCHECK(left.shape() == 
right.shape()); - DCHECK(left.length() == right.length()); + DCHECK(left.non_zero_length() == right.non_zero_length()); const auto& left_index = checked_cast(*left.sparse_index()); const auto& right_index = checked_cast(*right.sparse_index()); @@ -816,7 +816,7 @@ struct SparseTensorEqualsImpl { const uint8_t* left_data = left.data()->data(); const uint8_t* right_data = right.data()->data(); - return memcmp(left_data, right_data, static_cast(byte_width * left.length())); + return memcmp(left_data, right_data, static_cast(byte_width * left.non_zero_length())); } }; @@ -852,7 +852,7 @@ bool SparseTensorEquals(const SparseTensorBase& left, const SparseTensorBase& ri return true; } else if (left.shape() != right.shape()) { return false; - } else if (left.length() != right.length()) { + } else if (left.non_zero_length() != right.non_zero_length()) { return false; } diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 6964aa36730..0c968a359b3 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -863,9 +863,9 @@ Status MakeSparseTensor(FBB& fbb, const SparseTensorBase& sparse_tensor, const BufferMetadata& data_metadata = buffers[num_index_buffers]; flatbuf::Buffer data(data_metadata.offset, data_metadata.length); - int64_t length = sparse_tensor.length(); + const int64_t non_zero_length = sparse_tensor.non_zero_length(); - *offset = flatbuf::CreateSparseTensor(fbb, fb_type_type, fb_type, fb_shape, length, + *offset = flatbuf::CreateSparseTensor(fbb, fb_type_type, fb_type, fb_shape, non_zero_length, fb_sparse_index_type, fb_sparse_index, &data); return Status::OK(); @@ -1037,7 +1037,7 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr* type, std::vector* shape, - std::vector* dim_names, int64_t* length, + std::vector* dim_names, int64_t* non_zero_length, SparseTensorFormat::type* 
sparse_tensor_format_id) { auto message = flatbuf::GetMessage(metadata.data()); if (message->header_type() != flatbuf::MessageHeader_SparseTensor) { @@ -1062,7 +1062,7 @@ Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr } } - *length = sparse_tensor->length(); + *non_zero_length = sparse_tensor->non_zero_length(); switch (sparse_tensor->sparseIndex_type()) { case flatbuf::SparseTensorIndex_SparseTensorIndexCOO: diff --git a/cpp/src/arrow/ipc/read-write-test.cc b/cpp/src/arrow/ipc/read-write-test.cc index 79a84a8497f..820708dcaa3 100644 --- a/cpp/src/arrow/ipc/read-write-test.cc +++ b/cpp/src/arrow/ipc/read-write-test.cc @@ -872,7 +872,7 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( const auto& sparse_index = checked_cast(*tensor.sparse_index()); const int64_t indices_length = elem_size * sparse_index.indices()->size(); - const int64_t data_length = elem_size * tensor.length(); + const int64_t data_length = elem_size * tensor.non_zero_length(); const int64_t expected_body_length = indices_length + data_length; ASSERT_EQ(expected_body_length, body_length); @@ -905,7 +905,7 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( const auto& sparse_index = checked_cast(*tensor.sparse_index()); const int64_t indptr_length = elem_size * sparse_index.indptr()->size(); const int64_t indices_length = elem_size * sparse_index.indices()->size(); - const int64_t data_length = elem_size * tensor.length(); + const int64_t data_length = elem_size * tensor.non_zero_length(); const int64_t expected_body_length = indptr_length + indices_length + data_length; ASSERT_EQ(expected_body_length, body_length); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 62faaef0475..920977882b6 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -730,23 +730,23 @@ Status ReadTensor(const Message& message, std::shared_ptr* out) { namespace { Status ReadSparseCOOIndex(const flatbuf::SparseTensor* 
sparse_tensor, int64_t ndim, - int64_t length, io::RandomAccessFile* file, + int64_t non_zero_length, io::RandomAccessFile* file, std::shared_ptr* out) { auto* sparse_index = sparse_tensor->sparseIndex_as_SparseTensorIndexCOO(); auto* indices_buffer = sparse_index->indicesBuffer(); std::shared_ptr indices_data; RETURN_NOT_OK( file->ReadAt(indices_buffer->offset(), indices_buffer->length(), &indices_data)); - std::vector shape({length, ndim}); + std::vector shape({non_zero_length, ndim}); const int64_t elsize = sizeof(int64_t); - std::vector strides({elsize, elsize * length}); + std::vector strides({elsize, elsize * non_zero_length}); *out = std::make_shared( std::make_shared(indices_data, shape, strides)); return Status::OK(); } Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t ndim, - int64_t length, io::RandomAccessFile* file, + int64_t non_zero_length, io::RandomAccessFile* file, std::shared_ptr* out) { auto* sparse_index = sparse_tensor->sparseIndex_as_SparseMatrixIndexCSR(); @@ -761,7 +761,7 @@ Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t nd file->ReadAt(indices_buffer->offset(), indices_buffer->length(), &indices_data)); std::vector indptr_shape({ndim + 1}); - std::vector indices_shape({length}); + std::vector indices_shape({non_zero_length}); *out = std::make_shared( std::make_shared(indptr_data, indptr_shape), std::make_shared(indices_data, indices_shape)); @@ -771,7 +771,7 @@ Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t nd Status MakeSparseTensorWithSparseCOOIndex( const std::shared_ptr& type, const std::vector& shape, const std::vector& dim_names, - const std::shared_ptr& sparse_index, int64_t length, + const std::shared_ptr& sparse_index, int64_t non_zero_length, const std::shared_ptr& data, std::shared_ptr* out) { auto* sparse_tensor = new SparseTensor(sparse_index, type, data, shape, dim_names); @@ -782,7 +782,7 @@ Status MakeSparseTensorWithSparseCOOIndex( 
Status MakeSparseTensorWithSparseCSRIndex( const std::shared_ptr& type, const std::vector& shape, const std::vector& dim_names, - const std::shared_ptr& sparse_index, int64_t length, + const std::shared_ptr& sparse_index, int64_t non_zero_length, const std::shared_ptr& data, std::shared_ptr* out) { auto* sparse_tensor = new SparseTensor(sparse_index, type, data, shape, dim_names); @@ -797,11 +797,11 @@ Status ReadSparseTensor(const Buffer& metadata, io::RandomAccessFile* file, std::shared_ptr type; std::vector shape; std::vector dim_names; - int64_t length; + int64_t non_zero_length; SparseTensorFormat::type sparse_tensor_format_id; RETURN_NOT_OK(internal::GetSparseTensorMetadata(metadata, &type, &shape, &dim_names, - &length, &sparse_tensor_format_id)); + &non_zero_length, &sparse_tensor_format_id)); auto message = flatbuf::GetMessage(metadata.data()); auto sparse_tensor = reinterpret_cast(message->header()); @@ -817,17 +817,17 @@ Status ReadSparseTensor(const Buffer& metadata, io::RandomAccessFile* file, switch (sparse_tensor_format_id) { case SparseTensorFormat::COO: RETURN_NOT_OK( - ReadSparseCOOIndex(sparse_tensor, shape.size(), length, file, &sparse_index)); + ReadSparseCOOIndex(sparse_tensor, shape.size(), non_zero_length, file, &sparse_index)); return MakeSparseTensorWithSparseCOOIndex( type, shape, dim_names, std::dynamic_pointer_cast(sparse_index), - length, data, out); + non_zero_length, data, out); case SparseTensorFormat::CSR: RETURN_NOT_OK( - ReadSparseCSRIndex(sparse_tensor, shape.size(), length, file, &sparse_index)); + ReadSparseCSRIndex(sparse_tensor, shape.size(), non_zero_length, file, &sparse_index)); return MakeSparseTensorWithSparseCSRIndex( type, shape, dim_names, std::dynamic_pointer_cast(sparse_index), - length, data, out); + non_zero_length, data, out); default: return Status::Invalid("Unsupported sparse index format"); diff --git a/cpp/src/arrow/sparse_tensor-test.cc b/cpp/src/arrow/sparse_tensor-test.cc index 9c648b85caa..21f9991fe82 
100644 --- a/cpp/src/arrow/sparse_tensor-test.cc +++ b/cpp/src/arrow/sparse_tensor-test.cc @@ -45,8 +45,8 @@ TEST(TestSparseCOOTensor, CreationEmptyTensor) { std::vector dim_names = {"foo", "bar", "baz"}; SparseTensor st2(int64(), shape, dim_names); - ASSERT_EQ(0, st1.length()); - ASSERT_EQ(0, st2.length()); + ASSERT_EQ(0, st1.non_zero_length()); + ASSERT_EQ(0, st2.non_zero_length()); ASSERT_EQ(24, st1.size()); ASSERT_EQ(24, st2.size()); @@ -73,7 +73,7 @@ TEST(TestSparseCOOTensor, CreationFromNumericTensor) { CheckSparseIndexFormatType(SparseTensorFormat::COO, st1); - ASSERT_EQ(12, st1.length()); + ASSERT_EQ(12, st1.non_zero_length()); ASSERT_TRUE(st1.is_mutable()); ASSERT_EQ("foo", st2.dim_name(0)); @@ -136,7 +136,7 @@ TEST(TestSparseCOOTensor, CreationFromTensor) { SparseTensor st1(tensor1); SparseTensor st2(tensor2); - ASSERT_EQ(12, st1.length()); + ASSERT_EQ(12, st1.non_zero_length()); ASSERT_TRUE(st1.is_mutable()); ASSERT_EQ("foo", st2.dim_name(0)); @@ -200,7 +200,7 @@ TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) { CheckSparseIndexFormatType(SparseTensorFormat::CSR, st1); - ASSERT_EQ(12, st1.length()); + ASSERT_EQ(12, st1.non_zero_length()); ASSERT_TRUE(st1.is_mutable()); ASSERT_EQ("foo", st2.dim_name(0)); diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index db3d5cc14d7..359ee4600ba 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -32,26 +32,26 @@ namespace arrow { class ARROW_EXPORT SparseIndex { public: - explicit SparseIndex(SparseTensorFormat::type format_id, int64_t length) - : format_id_(format_id), length_(length) {} + explicit SparseIndex(SparseTensorFormat::type format_id, int64_t non_zero_length) + : format_id_(format_id), non_zero_length_(non_zero_length) {} virtual ~SparseIndex() = default; SparseTensorFormat::type format_id() const { return format_id_; } - int64_t length() const { return length_; } + int64_t non_zero_length() const { return non_zero_length_; } virtual 
std::string ToString() const = 0; protected: SparseTensorFormat::type format_id_; - int64_t length_; + int64_t non_zero_length_; }; template class SparseIndexBase : public SparseIndex { public: - explicit SparseIndexBase(int64_t length) - : SparseIndex(SparseIndexType::format_id, length) {} + explicit SparseIndexBase(int64_t non_zero_length) + : SparseIndex(SparseIndexType::format_id, non_zero_length) {} }; // ---------------------------------------------------------------------- @@ -137,7 +137,7 @@ class ARROW_EXPORT SparseTensorBase { bool is_mutable() const { return data_->is_mutable(); } /// Total number of non-zero cells in the sparse tensor - int64_t length() const { return sparse_index_ ? sparse_index_->length() : 0; } + int64_t non_zero_length() const { return sparse_index_ ? sparse_index_->non_zero_length() : 0; } bool Equals(const SparseTensorBase& other) const; diff --git a/format/Tensor.fbs b/format/Tensor.fbs index 936e4c56919..c1df6b2d1a9 100644 --- a/format/Tensor.fbs +++ b/format/Tensor.fbs @@ -134,7 +134,7 @@ table SparseTensor { shape: [TensorDim]; /// The number of non-zero values in a sparse tensor. 
- length: long; + non_zero_length: long; /// Sparse tensor index sparseIndex: SparseTensorIndex; From 6f291581edc5dbca306708a652ea76657155dfcd Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Mon, 7 Jan 2019 23:53:44 +0900 Subject: [PATCH 30/40] Mark APIs for sparse tensor as EXPERIMENTAL --- cpp/src/arrow/compare.h | 1 + cpp/src/arrow/ipc/metadata-internal.h | 1 + cpp/src/arrow/ipc/writer.h | 2 +- cpp/src/arrow/sparse_tensor.h | 6 ++++++ cpp/src/arrow/sparse_tensor_format.h | 1 + format/Tensor.fbs | 2 +- 6 files changed, 11 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index bc4877dcbd4..6067b7929ab 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -36,6 +36,7 @@ bool ARROW_EXPORT ArrayEquals(const Array& left, const Array& right); bool ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right); +/// EXPERIMENTAL: Returns true if the given sparse tensors are exactly equal bool ARROW_EXPORT SparseTensorEquals(const SparseTensorBase& left, const SparseTensorBase& right); diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index 420fca8a0bd..bff3dd02231 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -105,6 +105,7 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type std::vector* shape, std::vector* strides, std::vector* dim_names); +// EXPERIMENTAL: Extracting metadata of a sparse tensor from the message Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr* type, std::vector* shape, std::vector* dim_names, int64_t* length, diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index 1a39dfc49d5..56c1672065c 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -270,7 +270,7 @@ ARROW_EXPORT Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length, int64_t* body_length); -// \brief EXPERIMETNAL: Write 
arrow::SparseTensor as a contiguous mesasge. The metadata, +// \brief EXPERIMENTAL: Write arrow::SparseTensor as a contiguous mesasge. The metadata, // sparse index, and body are written assuming 64-byte alignment. It is the // user's responsibility to ensure that the OutputStream has been aligned // to a 64-byte multiple before writing the message. diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 359ee4600ba..bfa00e487c9 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -30,6 +30,8 @@ namespace arrow { // ---------------------------------------------------------------------- // SparseIndex class +/// \brief EXPERIMENTAL: The base class for representing index of non-zero +/// values in sparse tensor class ARROW_EXPORT SparseIndex { public: explicit SparseIndex(SparseTensorFormat::type format_id, int64_t non_zero_length) @@ -57,6 +59,7 @@ class SparseIndexBase : public SparseIndex { // ---------------------------------------------------------------------- // SparseCOOIndex class +/// \brief EXPERIMENTAL: The index data for COO sparse tensor class ARROW_EXPORT SparseCOOIndex : public SparseIndexBase { public: using CoordsTensor = NumericTensor; @@ -81,6 +84,7 @@ class ARROW_EXPORT SparseCOOIndex : public SparseIndexBase { // ---------------------------------------------------------------------- // SparseCSRIndex class +/// \brief EXPERIMENTAL: The index data for CSR sparse matrix class ARROW_EXPORT SparseCSRIndex : public SparseIndexBase { public: using IndexTensor = NumericTensor; @@ -108,6 +112,7 @@ class ARROW_EXPORT SparseCSRIndex : public SparseIndexBase { // ---------------------------------------------------------------------- // SparseTensorBase class +/// \brief EXPERIMENTAL: The base class of sparse tensor container class ARROW_EXPORT SparseTensorBase { public: virtual ~SparseTensorBase() = default; @@ -160,6 +165,7 @@ class ARROW_EXPORT SparseTensorBase { // 
---------------------------------------------------------------------- // SparseTensor class +/// \brief EXPERIMENTAL: Concrete sparse tensor classes with sparse index type template class ARROW_EXPORT SparseTensor : public SparseTensorBase { public: diff --git a/cpp/src/arrow/sparse_tensor_format.h b/cpp/src/arrow/sparse_tensor_format.h index 24c1a190f50..813378ff0e9 100644 --- a/cpp/src/arrow/sparse_tensor_format.h +++ b/cpp/src/arrow/sparse_tensor_format.h @@ -18,6 +18,7 @@ #ifndef ARROW_SPARSE_TENSOR_FORMAT_H #define ARROW_SPARSE_TENSOR_FORMAT_H +/// \brief EXPERIMENTAL: Sparse tensor format enumeration struct SparseTensorFormat { enum type { COO, CSR }; }; diff --git a/format/Tensor.fbs b/format/Tensor.fbs index c1df6b2d1a9..e77b353a0f3 100644 --- a/format/Tensor.fbs +++ b/format/Tensor.fbs @@ -53,7 +53,7 @@ table Tensor { root_type Tensor; /// ---------------------------------------------------------------------- -/// Data structures for sparse tensors +/// EXPERIMENTAL: Data structures for sparse tensors /// Coodinate format of sparse tensor index. 
table SparseTensorIndexCOO { From 6ef6ad065a21e5b3c01b3fb62508b10de716519d Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Tue, 8 Jan 2019 00:56:00 +0900 Subject: [PATCH 31/40] Apply code formatter --- cpp/src/arrow/compare.cc | 3 ++- cpp/src/arrow/ipc/metadata-internal.cc | 8 +++++--- cpp/src/arrow/ipc/reader.cc | 12 ++++++------ cpp/src/arrow/sparse_tensor.h | 4 +++- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 1d7a2cc6bb2..4230c24676f 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -816,7 +816,8 @@ struct SparseTensorEqualsImpl { const uint8_t* left_data = left.data()->data(); const uint8_t* right_data = right.data()->data(); - return memcmp(left_data, right_data, static_cast(byte_width * left.non_zero_length())); + return memcmp(left_data, right_data, + static_cast(byte_width * left.non_zero_length())); } }; diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 0c968a359b3..aafa2e3fe82 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -865,8 +865,9 @@ Status MakeSparseTensor(FBB& fbb, const SparseTensorBase& sparse_tensor, const int64_t non_zero_length = sparse_tensor.non_zero_length(); - *offset = flatbuf::CreateSparseTensor(fbb, fb_type_type, fb_type, fb_shape, non_zero_length, - fb_sparse_index_type, fb_sparse_index, &data); + *offset = + flatbuf::CreateSparseTensor(fbb, fb_type_type, fb_type, fb_shape, non_zero_length, + fb_sparse_index_type, fb_sparse_index, &data); return Status::OK(); } @@ -1037,7 +1038,8 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr* type, std::vector* shape, - std::vector* dim_names, int64_t* non_zero_length, + std::vector* dim_names, + int64_t* non_zero_length, SparseTensorFormat::type* sparse_tensor_format_id) { auto message = 
flatbuf::GetMessage(metadata.data()); if (message->header_type() != flatbuf::MessageHeader_SparseTensor) { diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 920977882b6..d930ab380c4 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -800,8 +800,8 @@ Status ReadSparseTensor(const Buffer& metadata, io::RandomAccessFile* file, int64_t non_zero_length; SparseTensorFormat::type sparse_tensor_format_id; - RETURN_NOT_OK(internal::GetSparseTensorMetadata(metadata, &type, &shape, &dim_names, - &non_zero_length, &sparse_tensor_format_id)); + RETURN_NOT_OK(internal::GetSparseTensorMetadata( + metadata, &type, &shape, &dim_names, &non_zero_length, &sparse_tensor_format_id)); auto message = flatbuf::GetMessage(metadata.data()); auto sparse_tensor = reinterpret_cast(message->header()); @@ -816,15 +816,15 @@ Status ReadSparseTensor(const Buffer& metadata, io::RandomAccessFile* file, std::shared_ptr sparse_index; switch (sparse_tensor_format_id) { case SparseTensorFormat::COO: - RETURN_NOT_OK( - ReadSparseCOOIndex(sparse_tensor, shape.size(), non_zero_length, file, &sparse_index)); + RETURN_NOT_OK(ReadSparseCOOIndex(sparse_tensor, shape.size(), non_zero_length, file, + &sparse_index)); return MakeSparseTensorWithSparseCOOIndex( type, shape, dim_names, std::dynamic_pointer_cast(sparse_index), non_zero_length, data, out); case SparseTensorFormat::CSR: - RETURN_NOT_OK( - ReadSparseCSRIndex(sparse_tensor, shape.size(), non_zero_length, file, &sparse_index)); + RETURN_NOT_OK(ReadSparseCSRIndex(sparse_tensor, shape.size(), non_zero_length, file, + &sparse_index)); return MakeSparseTensorWithSparseCSRIndex( type, shape, dim_names, std::dynamic_pointer_cast(sparse_index), non_zero_length, data, out); diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index bfa00e487c9..8ce78c81c46 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -142,7 +142,9 @@ class ARROW_EXPORT 
SparseTensorBase { bool is_mutable() const { return data_->is_mutable(); } /// Total number of non-zero cells in the sparse tensor - int64_t non_zero_length() const { return sparse_index_ ? sparse_index_->non_zero_length() : 0; } + int64_t non_zero_length() const { + return sparse_index_ ? sparse_index_->non_zero_length() : 0; + } bool Equals(const SparseTensorBase& other) const; From 3dd434c83c4a7ea113b716227bf95e4167cc540b Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 9 Jan 2019 17:03:33 +0900 Subject: [PATCH 32/40] Capitalize member function name --- cpp/src/arrow/compare.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 4230c24676f..f64428008db 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -787,7 +787,7 @@ namespace { template struct SparseTensorEqualsImpl { - static bool compare(const SparseTensor& left, + static bool Compare(const SparseTensor& left, const SparseTensor& right) { // TODO(mrkn): should we support the equality among different formats? 
return false; @@ -796,7 +796,7 @@ struct SparseTensorEqualsImpl { template struct SparseTensorEqualsImpl { - static bool compare(const SparseTensor& left, + static bool Compare(const SparseTensor& left, const SparseTensor& right) { DCHECK(left.type()->id() == right.type()->id()); DCHECK(left.shape() == right.shape()); @@ -827,13 +827,13 @@ inline bool SparseTensorEqualsImplDispatch(const SparseTensor& switch (right.sparse_tensor_format_id()) { case SparseTensorFormat::COO: { const auto& right_coo = checked_cast&>(right); - return SparseTensorEqualsImpl::compare(left, + return SparseTensorEqualsImpl::Compare(left, right_coo); } case SparseTensorFormat::CSR: { const auto& right_csr = checked_cast&>(right); - return SparseTensorEqualsImpl::compare(left, + return SparseTensorEqualsImpl::Compare(left, right_csr); } From 97e85bd3535f7896164847a07a4d4e0b82d0cb81 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 9 Jan 2019 17:08:36 +0900 Subject: [PATCH 33/40] Use std::make_shared --- cpp/src/arrow/ipc/reader.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index d930ab380c4..4aebccdb8b3 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -773,9 +773,7 @@ Status MakeSparseTensorWithSparseCOOIndex( const std::vector& dim_names, const std::shared_ptr& sparse_index, int64_t non_zero_length, const std::shared_ptr& data, std::shared_ptr* out) { - auto* sparse_tensor = - new SparseTensor(sparse_index, type, data, shape, dim_names); - *out = std::shared_ptr(sparse_tensor); + *out = std::make_shared>(sparse_index, type, data, shape, dim_names); return Status::OK(); } @@ -784,9 +782,7 @@ Status MakeSparseTensorWithSparseCSRIndex( const std::vector& dim_names, const std::shared_ptr& sparse_index, int64_t non_zero_length, const std::shared_ptr& data, std::shared_ptr* out) { - auto* sparse_tensor = - new SparseTensor(sparse_index, type, data, shape, dim_names); - *out = 
std::shared_ptr(sparse_tensor); + *out = std::make_shared>(sparse_index, type, data, shape, dim_names); return Status::OK(); } From 37a0a14c6b6232ddda0aa3648cff63c741c0e31d Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 9 Jan 2019 17:11:36 +0900 Subject: [PATCH 34/40] Remove needless function declaration --- cpp/src/arrow/ipc/reader.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index 6668418e7ca..0d49f84dbb7 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -244,14 +244,6 @@ Status ReadTensor(const Message& message, std::shared_ptr* out); ARROW_EXPORT Status ReadSparseTensor(io::InputStream* file, std::shared_ptr* out); -/// \brief EXPERIMENTAL: Read arrow::Tensor from IPC message -/// -/// \param[in] message a Message containing the tensor metadata and body -/// \param[out] out the read tensor -/// \return Status -ARROW_EXPORT -Status ReadTensor(const Message& message, std::shared_ptr* out); - /// \brief EXPERIMENTAL: Read arrow::SparseTensor from IPC message /// /// \param[in] message a Message containing the tensor metadata and body From 07a6518632203b3ab88be19d7ac46b17133d51f9 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 9 Jan 2019 17:13:21 +0900 Subject: [PATCH 35/40] Use substitution instead of constructor call --- cpp/src/arrow/sparse_tensor-test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/sparse_tensor-test.cc b/cpp/src/arrow/sparse_tensor-test.cc index 21f9991fe82..502c746f27a 100644 --- a/cpp/src/arrow/sparse_tensor-test.cc +++ b/cpp/src/arrow/sparse_tensor-test.cc @@ -64,7 +64,7 @@ TEST(TestSparseCOOTensor, CreationFromNumericTensor) { std::vector shape = {2, 3, 4}; std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; - std::shared_ptr buffer(Buffer::Wrap(values)); + std::shared_ptr buffer = Buffer::Wrap(values); std::vector dim_names = {"foo", "bar", 
"baz"}; NumericTensor tensor1(buffer, shape); NumericTensor tensor2(buffer, shape, {}, dim_names); @@ -129,7 +129,7 @@ TEST(TestSparseCOOTensor, CreationFromTensor) { std::vector shape = {2, 3, 4}; std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; - std::shared_ptr buffer(Buffer::Wrap(values)); + std::shared_ptr buffer = Buffer::Wrap(values); std::vector dim_names = {"foo", "bar", "baz"}; Tensor tensor1(int64(), buffer, shape); Tensor tensor2(int64(), buffer, shape, {}, dim_names); @@ -190,7 +190,7 @@ TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) { std::vector shape = {6, 4}; std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; - std::shared_ptr buffer(Buffer::Wrap(values)); + std::shared_ptr buffer = Buffer::Wrap(values); std::vector dim_names = {"foo", "bar", "baz"}; NumericTensor tensor1(buffer, shape); NumericTensor tensor2(buffer, shape, {}, dim_names); From 90e8b316674f00cb403ec16aa3eaf74a59afa93d Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 9 Jan 2019 17:38:34 +0900 Subject: [PATCH 36/40] Rename sparse tensor classes - SparseTensorBase to SparseTensor - SparseTensor<...> to SparseTensorImpl<...> --- cpp/src/arrow/compare.cc | 28 +++++++------- cpp/src/arrow/compare.h | 6 +-- cpp/src/arrow/ipc/metadata-internal.cc | 4 +- cpp/src/arrow/ipc/metadata-internal.h | 4 +- cpp/src/arrow/ipc/read-write-test.cc | 14 +++---- cpp/src/arrow/ipc/reader.cc | 14 +++---- cpp/src/arrow/ipc/reader.h | 6 +-- cpp/src/arrow/ipc/writer.cc | 8 ++-- cpp/src/arrow/ipc/writer.h | 4 +- cpp/src/arrow/sparse_tensor-test.cc | 18 ++++----- cpp/src/arrow/sparse_tensor.cc | 52 +++++++++++++------------- cpp/src/arrow/sparse_tensor.h | 44 +++++++++++----------- cpp/src/arrow/tensor.h | 4 +- 13 files changed, 103 insertions(+), 103 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index f64428008db..326aac44814 100644 --- a/cpp/src/arrow/compare.cc 
+++ b/cpp/src/arrow/compare.cc @@ -787,8 +787,8 @@ namespace { template struct SparseTensorEqualsImpl { - static bool Compare(const SparseTensor& left, - const SparseTensor& right) { + static bool Compare(const SparseTensorImpl& left, + const SparseTensorImpl& right) { // TODO(mrkn): should we support the equality among different formats? return false; } @@ -796,8 +796,8 @@ struct SparseTensorEqualsImpl { template struct SparseTensorEqualsImpl { - static bool Compare(const SparseTensor& left, - const SparseTensor& right) { + static bool Compare(const SparseTensorImpl& left, + const SparseTensorImpl& right) { DCHECK(left.type()->id() == right.type()->id()); DCHECK(left.shape() == right.shape()); DCHECK(left.non_zero_length() == right.non_zero_length()); @@ -821,19 +821,19 @@ struct SparseTensorEqualsImpl { } }; -template -inline bool SparseTensorEqualsImplDispatch(const SparseTensor& left, - const SparseTensorBase& right) { +template +inline bool SparseTensorEqualsImplDispatch(const SparseTensorImpl& left, + const SparseTensor& right) { switch (right.sparse_tensor_format_id()) { case SparseTensorFormat::COO: { - const auto& right_coo = checked_cast&>(right); - return SparseTensorEqualsImpl::Compare(left, + const auto& right_coo = checked_cast&>(right); + return SparseTensorEqualsImpl::Compare(left, right_coo); } case SparseTensorFormat::CSR: { - const auto& right_csr = checked_cast&>(right); - return SparseTensorEqualsImpl::Compare(left, + const auto& right_csr = checked_cast&>(right); + return SparseTensorEqualsImpl::Compare(left, right_csr); } @@ -844,7 +844,7 @@ inline bool SparseTensorEqualsImplDispatch(const SparseTensor& } // namespace -bool SparseTensorEquals(const SparseTensorBase& left, const SparseTensorBase& right) { +bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right) { if (&left == &right) { return true; } else if (left.type()->id() != right.type()->id()) { @@ -859,12 +859,12 @@ bool SparseTensorEquals(const SparseTensorBase& 
left, const SparseTensorBase& ri switch (left.sparse_tensor_format_id()) { case SparseTensorFormat::COO: { - const auto& left_coo = checked_cast&>(left); + const auto& left_coo = checked_cast&>(left); return SparseTensorEqualsImplDispatch(left_coo, right); } case SparseTensorFormat::CSR: { - const auto& left_csr = checked_cast&>(left); + const auto& left_csr = checked_cast&>(left); return SparseTensorEqualsImplDispatch(left_csr, right); } diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 6067b7929ab..331e81bfd3a 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -29,7 +29,7 @@ namespace arrow { class Array; class DataType; class Tensor; -class SparseTensorBase; +class SparseTensor; /// Returns true if the arrays are exactly equal bool ARROW_EXPORT ArrayEquals(const Array& left, const Array& right); @@ -37,8 +37,8 @@ bool ARROW_EXPORT ArrayEquals(const Array& left, const Array& right); bool ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right); /// EXPERIMENTAL: Returns true if the given sparse tensors are exactly equal -bool ARROW_EXPORT SparseTensorEquals(const SparseTensorBase& left, - const SparseTensorBase& right); +bool ARROW_EXPORT SparseTensorEquals(const SparseTensor& left, + const SparseTensor& right); /// Returns true if the arrays are approximately equal. 
For non-floating point /// types, this is equivalent to ArrayEquals(left, right) diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index aafa2e3fe82..8bc6623551b 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -836,7 +836,7 @@ Status MakeSparseTensorIndex(FBB& fbb, const SparseIndex& sparse_index, return Status::OK(); } -Status MakeSparseTensor(FBB& fbb, const SparseTensorBase& sparse_tensor, +Status MakeSparseTensor(FBB& fbb, const SparseTensor& sparse_tensor, int64_t body_length, const std::vector& buffers, SparseTensorOffset* offset) { flatbuf::Type fb_type_type; @@ -872,7 +872,7 @@ Status MakeSparseTensor(FBB& fbb, const SparseTensorBase& sparse_tensor, return Status::OK(); } -Status WriteSparseTensorMessage(const SparseTensorBase& sparse_tensor, +Status WriteSparseTensorMessage(const SparseTensor& sparse_tensor, int64_t body_length, const std::vector& buffers, std::shared_ptr* out) { diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index bff3dd02231..74a1aef8580 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -41,7 +41,7 @@ namespace arrow { class DataType; class Schema; class Tensor; -class SparseTensorBase; +class SparseTensor; namespace flatbuf = org::apache::arrow::flatbuf; @@ -145,7 +145,7 @@ Status WriteRecordBatchMessage(const int64_t length, const int64_t body_length, Status WriteTensorMessage(const Tensor& tensor, const int64_t buffer_start_offset, std::shared_ptr* out); -Status WriteSparseTensorMessage(const SparseTensorBase& sparse_tensor, +Status WriteSparseTensorMessage(const SparseTensor& sparse_tensor, int64_t body_length, const std::vector& buffers, std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/read-write-test.cc b/cpp/src/arrow/ipc/read-write-test.cc index 820708dcaa3..bc27386f34f 100644 --- a/cpp/src/arrow/ipc/read-write-test.cc +++ 
b/cpp/src/arrow/ipc/read-write-test.cc @@ -851,14 +851,14 @@ class TestSparseTensorRoundTrip : public ::testing::Test, public IpcTestFixture void TearDown() { io::MemoryMapFixture::TearDown(); } template - void CheckSparseTensorRoundTrip(const SparseTensor& tensor) { + void CheckSparseTensorRoundTrip(const SparseTensorImpl& tensor) { GTEST_FAIL(); } }; template <> void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( - const SparseTensor& tensor) { + const SparseTensorImpl& tensor) { const auto& type = checked_cast(*tensor.type()); const int elem_size = type.bit_width() / 8; @@ -878,7 +878,7 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( ASSERT_OK(mmap_->Seek(0)); - std::shared_ptr result; + std::shared_ptr result; ASSERT_OK(ReadSparseTensor(mmap_.get(), &result)); const auto& resulted_sparse_index = @@ -890,7 +890,7 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( template <> void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( - const SparseTensor& tensor) { + const SparseTensorImpl& tensor) { const auto& type = checked_cast(*tensor.type()); const int elem_size = type.bit_width() / 8; @@ -911,7 +911,7 @@ void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( ASSERT_OK(mmap_->Seek(0)); - std::shared_ptr result; + std::shared_ptr result; ASSERT_OK(ReadSparseTensor(mmap_.get(), &result)); const auto& resulted_sparse_index = @@ -934,7 +934,7 @@ TEST_F(TestSparseTensorRoundTrip, WithSparseCOOIndex) { auto data = Buffer::Wrap(values); NumericTensor t(data, shape, {}, dim_names); - SparseTensor st(t); + SparseTensorImpl st(t); CheckSparseTensorRoundTrip(st); } @@ -951,7 +951,7 @@ TEST_F(TestSparseTensorRoundTrip, WithSparseCSRIndex) { auto data = Buffer::Wrap(values); NumericTensor t(data, shape, {}, dim_names); - SparseTensor st(t); + SparseTensorImpl st(t); CheckSparseTensorRoundTrip(st); } diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 4aebccdb8b3..1207427806b 100644 --- 
a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -772,8 +772,8 @@ Status MakeSparseTensorWithSparseCOOIndex( const std::shared_ptr& type, const std::vector& shape, const std::vector& dim_names, const std::shared_ptr& sparse_index, int64_t non_zero_length, - const std::shared_ptr& data, std::shared_ptr* out) { - *out = std::make_shared>(sparse_index, type, data, shape, dim_names); + const std::shared_ptr& data, std::shared_ptr* out) { + *out = std::make_shared>(sparse_index, type, data, shape, dim_names); return Status::OK(); } @@ -781,15 +781,15 @@ Status MakeSparseTensorWithSparseCSRIndex( const std::shared_ptr& type, const std::vector& shape, const std::vector& dim_names, const std::shared_ptr& sparse_index, int64_t non_zero_length, - const std::shared_ptr& data, std::shared_ptr* out) { - *out = std::make_shared>(sparse_index, type, data, shape, dim_names); + const std::shared_ptr& data, std::shared_ptr* out) { + *out = std::make_shared>(sparse_index, type, data, shape, dim_names); return Status::OK(); } } // namespace Status ReadSparseTensor(const Buffer& metadata, io::RandomAccessFile* file, - std::shared_ptr* out) { + std::shared_ptr* out) { std::shared_ptr type; std::vector shape; std::vector dim_names; @@ -830,12 +830,12 @@ Status ReadSparseTensor(const Buffer& metadata, io::RandomAccessFile* file, } } -Status ReadSparseTensor(const Message& message, std::shared_ptr* out) { +Status ReadSparseTensor(const Message& message, std::shared_ptr* out) { io::BufferReader buffer_reader(message.body()); return ReadSparseTensor(*message.metadata(), &buffer_reader, out); } -Status ReadSparseTensor(io::InputStream* file, std::shared_ptr* out) { +Status ReadSparseTensor(io::InputStream* file, std::shared_ptr* out) { std::unique_ptr message; RETURN_NOT_OK(ReadContiguousPayload(file, &message)); DCHECK_EQ(message->type(), Message::SPARSE_TENSOR); diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index 0d49f84dbb7..ebecea13ffb 100644 --- 
a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -33,7 +33,7 @@ class Buffer; class Schema; class Status; class Tensor; -class SparseTensorBase; +class SparseTensor; namespace io { @@ -242,7 +242,7 @@ Status ReadTensor(const Message& message, std::shared_ptr* out); /// \param[out] out the read sparse tensor /// \return Status ARROW_EXPORT -Status ReadSparseTensor(io::InputStream* file, std::shared_ptr* out); +Status ReadSparseTensor(io::InputStream* file, std::shared_ptr* out); /// \brief EXPERIMENTAL: Read arrow::SparseTensor from IPC message /// @@ -250,7 +250,7 @@ Status ReadSparseTensor(io::InputStream* file, std::shared_ptr /// \param[out] out the read sparse tensor /// \return Status ARROW_EXPORT -Status ReadSparseTensor(const Message& message, std::shared_ptr* out); +Status ReadSparseTensor(const Message& message, std::shared_ptr* out); } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index cd1d2773c0b..0bf68142c77 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -703,12 +703,12 @@ class SparseTensorSerializer { return Status::OK(); } - Status SerializeMetadata(const SparseTensorBase& sparse_tensor) { + Status SerializeMetadata(const SparseTensor& sparse_tensor) { return WriteSparseTensorMessage(sparse_tensor, out_->body_length, buffer_meta_, &out_->metadata); } - Status Assemble(const SparseTensorBase& sparse_tensor) { + Status Assemble(const SparseTensor& sparse_tensor) { if (buffer_meta_.size() > 0) { buffer_meta_.clear(); out_->body_buffers.clear(); @@ -753,7 +753,7 @@ class SparseTensorSerializer { int64_t buffer_start_offset_; }; -Status GetSparseTensorPayload(const SparseTensorBase& sparse_tensor, MemoryPool* pool, +Status GetSparseTensorPayload(const SparseTensor& sparse_tensor, MemoryPool* pool, IpcPayload* out) { SparseTensorSerializer writer(0, out); return writer.Assemble(sparse_tensor); @@ -761,7 +761,7 @@ Status 
GetSparseTensorPayload(const SparseTensorBase& sparse_tensor, MemoryPool* } // namespace internal -Status WriteSparseTensor(const SparseTensorBase& sparse_tensor, io::OutputStream* dst, +Status WriteSparseTensor(const SparseTensor& sparse_tensor, io::OutputStream* dst, int32_t* metadata_length, int64_t* body_length, MemoryPool* pool) { internal::IpcPayload payload; diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index 56c1672065c..5feb9e90cb0 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -36,7 +36,7 @@ class Schema; class Status; class Table; class Tensor; -class SparseTensorBase; +class SparseTensor; namespace io { @@ -280,7 +280,7 @@ Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadat // \param[out] metadata_length the actual metadata length, including padding // \param[out] body_length the actual message body length ARROW_EXPORT -Status WriteSparseTensor(const SparseTensorBase& sparse_tensor, io::OutputStream* dst, +Status WriteSparseTensor(const SparseTensor& sparse_tensor, io::OutputStream* dst, int32_t* metadata_length, int64_t* body_length, MemoryPool* pool); diff --git a/cpp/src/arrow/sparse_tensor-test.cc b/cpp/src/arrow/sparse_tensor-test.cc index 502c746f27a..2e4cd2d0336 100644 --- a/cpp/src/arrow/sparse_tensor-test.cc +++ b/cpp/src/arrow/sparse_tensor-test.cc @@ -33,17 +33,17 @@ namespace arrow { static inline void CheckSparseIndexFormatType(SparseTensorFormat::type expected, - const SparseTensorBase& sparse_tensor) { + const SparseTensor& sparse_tensor) { ASSERT_EQ(expected, sparse_tensor.sparse_tensor_format_id()); ASSERT_EQ(expected, sparse_tensor.sparse_index()->format_id()); } TEST(TestSparseCOOTensor, CreationEmptyTensor) { std::vector shape = {2, 3, 4}; - SparseTensor st1(int64(), shape); + SparseTensorImpl st1(int64(), shape); std::vector dim_names = {"foo", "bar", "baz"}; - SparseTensor st2(int64(), shape, dim_names); + SparseTensorImpl st2(int64(), shape, 
dim_names); ASSERT_EQ(0, st1.non_zero_length()); ASSERT_EQ(0, st2.non_zero_length()); @@ -68,8 +68,8 @@ TEST(TestSparseCOOTensor, CreationFromNumericTensor) { std::vector dim_names = {"foo", "bar", "baz"}; NumericTensor tensor1(buffer, shape); NumericTensor tensor2(buffer, shape, {}, dim_names); - SparseTensor st1(tensor1); - SparseTensor st2(tensor2); + SparseTensorImpl st1(tensor1); + SparseTensorImpl st2(tensor2); CheckSparseIndexFormatType(SparseTensorFormat::COO, st1); @@ -133,8 +133,8 @@ TEST(TestSparseCOOTensor, CreationFromTensor) { std::vector dim_names = {"foo", "bar", "baz"}; Tensor tensor1(int64(), buffer, shape); Tensor tensor2(int64(), buffer, shape, {}, dim_names); - SparseTensor st1(tensor1); - SparseTensor st2(tensor2); + SparseTensorImpl st1(tensor1); + SparseTensorImpl st2(tensor2); ASSERT_EQ(12, st1.non_zero_length()); ASSERT_TRUE(st1.is_mutable()); @@ -195,8 +195,8 @@ TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) { NumericTensor tensor1(buffer, shape); NumericTensor tensor2(buffer, shape, {}, dim_names); - SparseTensor st1(tensor1); - SparseTensor st2(tensor2); + SparseTensorImpl st1(tensor1); + SparseTensorImpl st2(tensor2); CheckSparseIndexFormatType(SparseTensorFormat::CSR, st1); diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 896750c9ca5..eab7cacc211 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -307,10 +307,10 @@ SparseCSRIndex::SparseCSRIndex(const std::shared_ptr& indptr, std::string SparseCSRIndex::ToString() const { return std::string("SparseCSRIndex"); } // ---------------------------------------------------------------------- -// SparseTensorBase +// SparseTensor // Constructor with all attributes -SparseTensorBase::SparseTensorBase(const std::shared_ptr& type, +SparseTensor::SparseTensor(const std::shared_ptr& type, const std::shared_ptr& data, const std::vector& shape, const std::shared_ptr& sparse_index, @@ -323,7 +323,7 @@ 
SparseTensorBase::SparseTensorBase(const std::shared_ptr& type, DCHECK(is_tensor_supported(type->id())); } -const std::string& SparseTensorBase::dim_name(int i) const { +const std::string& SparseTensor::dim_name(int i) const { static const std::string kEmpty = ""; if (dim_names_.size() == 0) { return kEmpty; @@ -333,29 +333,29 @@ const std::string& SparseTensorBase::dim_name(int i) const { } } -int64_t SparseTensorBase::size() const { +int64_t SparseTensor::size() const { return std::accumulate(shape_.begin(), shape_.end(), 1LL, std::multiplies()); } -bool SparseTensorBase::Equals(const SparseTensorBase& other) const { +bool SparseTensor::Equals(const SparseTensor& other) const { return SparseTensorEquals(*this, other); } // ---------------------------------------------------------------------- -// SparseTensor +// SparseTensorImpl // Constructor with a dense tensor template -SparseTensor::SparseTensor(const std::shared_ptr& type, - const std::vector& shape, - const std::vector& dim_names) - : SparseTensor(nullptr, type, nullptr, shape, dim_names) {} +SparseTensorImpl::SparseTensorImpl(const std::shared_ptr& type, + const std::vector& shape, + const std::vector& dim_names) + : SparseTensorImpl(nullptr, type, nullptr, shape, dim_names) {} // Constructor with a dense tensor template template -SparseTensor::SparseTensor(const NumericTensor& tensor) - : SparseTensor(nullptr, tensor.type(), nullptr, tensor.shape(), tensor.dim_names_) { +SparseTensorImpl::SparseTensorImpl(const NumericTensor& tensor) + : SparseTensorImpl(nullptr, tensor.type(), nullptr, tensor.shape(), tensor.dim_names_) { SparseTensorConverter converter(tensor); DCHECK_OK(converter.Convert()); sparse_index_ = converter.sparse_index; @@ -364,8 +364,8 @@ SparseTensor::SparseTensor(const NumericTensor& tensor) // Constructor with a dense tensor template -SparseTensor::SparseTensor(const Tensor& tensor) - : SparseTensor(nullptr, tensor.type(), nullptr, tensor.shape(), tensor.dim_names_) { 
+SparseTensorImpl::SparseTensorImpl(const Tensor& tensor) + : SparseTensorImpl(nullptr, tensor.type(), nullptr, tensor.shape(), tensor.dim_names_) { switch (tensor.type()->id()) { case Type::UINT8: MakeSparseTensorFromTensor(tensor, &sparse_index_, @@ -420,28 +420,28 @@ SparseTensor::SparseTensor(const Tensor& tensor) // Instantiate templates #define INSTANTIATE_SPARSE_TENSOR(IndexType) \ - template class ARROW_TEMPLATE_EXPORT SparseTensor; \ - template ARROW_EXPORT SparseTensor::SparseTensor( \ + template class ARROW_TEMPLATE_EXPORT SparseTensorImpl; \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor( \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor( \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor( \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor( \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor( \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor( \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor( \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor( \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor( \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ const NumericTensor&); \ - template ARROW_EXPORT SparseTensor::SparseTensor( \ + template ARROW_EXPORT 
SparseTensorImpl::SparseTensorImpl( \ const NumericTensor&) INSTANTIATE_SPARSE_TENSOR(SparseCOOIndex); diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 8ce78c81c46..a6f69240394 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -110,12 +110,12 @@ class ARROW_EXPORT SparseCSRIndex : public SparseIndexBase { }; // ---------------------------------------------------------------------- -// SparseTensorBase class +// SparseTensor class /// \brief EXPERIMENTAL: The base class of sparse tensor container -class ARROW_EXPORT SparseTensorBase { +class ARROW_EXPORT SparseTensor { public: - virtual ~SparseTensorBase() = default; + virtual ~SparseTensor() = default; SparseTensorFormat::type sparse_tensor_format_id() const { return sparse_index_->format_id(); @@ -146,14 +146,14 @@ class ARROW_EXPORT SparseTensorBase { return sparse_index_ ? sparse_index_->non_zero_length() : 0; } - bool Equals(const SparseTensorBase& other) const; + bool Equals(const SparseTensor& other) const; protected: // Constructor with all attributes - SparseTensorBase(const std::shared_ptr& type, - const std::shared_ptr& data, const std::vector& shape, - const std::shared_ptr& sparse_index, - const std::vector& dim_names); + SparseTensor(const std::shared_ptr& type, + const std::shared_ptr& data, const std::vector& shape, + const std::shared_ptr& sparse_index, + const std::vector& dim_names); std::shared_ptr type_; std::shared_ptr data_; @@ -165,34 +165,34 @@ class ARROW_EXPORT SparseTensorBase { }; // ---------------------------------------------------------------------- -// SparseTensor class +// SparseTensorImpl class -/// \brief EXPERIMENTAL: Concrete sparse tensor classes with sparse index type +/// \brief EXPERIMENTAL: Concrete sparse tensor implementation classes with sparse index type template -class ARROW_EXPORT SparseTensor : public SparseTensorBase { +class ARROW_EXPORT SparseTensorImpl : public SparseTensor { public: - virtual 
~SparseTensor() = default; + virtual ~SparseTensorImpl() = default; // Constructor with all attributes - SparseTensor(const std::shared_ptr& sparse_index, - const std::shared_ptr& type, const std::shared_ptr& data, - const std::vector& shape, - const std::vector& dim_names) - : SparseTensorBase(type, data, shape, sparse_index, dim_names) {} + SparseTensorImpl(const std::shared_ptr& sparse_index, + const std::shared_ptr& type, const std::shared_ptr& data, + const std::vector& shape, + const std::vector& dim_names) + : SparseTensor(type, data, shape, sparse_index, dim_names) {} // Constructor for empty sparse tensor - SparseTensor(const std::shared_ptr& type, const std::vector& shape, - const std::vector& dim_names = {}); + SparseTensorImpl(const std::shared_ptr& type, const std::vector& shape, + const std::vector& dim_names = {}); // Constructor with a dense numeric tensor template - explicit SparseTensor(const NumericTensor& tensor); + explicit SparseTensorImpl(const NumericTensor& tensor); // Constructor with a dense tensor - explicit SparseTensor(const Tensor& tensor); + explicit SparseTensorImpl(const Tensor& tensor); private: - ARROW_DISALLOW_COPY_AND_ASSIGN(SparseTensor); + ARROW_DISALLOW_COPY_AND_ASSIGN(SparseTensorImpl); }; } // namespace arrow diff --git a/cpp/src/arrow/tensor.h b/cpp/src/arrow/tensor.h index e386b096037..e81f0f0dff5 100644 --- a/cpp/src/arrow/tensor.h +++ b/cpp/src/arrow/tensor.h @@ -51,7 +51,7 @@ static inline bool is_tensor_supported(Type::type type_id) { } template -class SparseTensor; +class SparseTensorImpl; class ARROW_EXPORT Tensor { public: @@ -114,7 +114,7 @@ class ARROW_EXPORT Tensor { std::vector dim_names_; template - friend class SparseTensor; + friend class SparseTensorImpl; private: ARROW_DISALLOW_COPY_AND_ASSIGN(Tensor); From c83ea6aafc3ac7ade6d67a19027a7f4303d9f77c Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 9 Jan 2019 17:42:14 +0900 Subject: [PATCH 37/40] Add type aliases of sparse tensor types --- 
cpp/src/arrow/sparse_tensor.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index a6f69240394..a60f533bf81 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -195,6 +195,13 @@ class ARROW_EXPORT SparseTensorImpl : public SparseTensor { ARROW_DISALLOW_COPY_AND_ASSIGN(SparseTensorImpl); }; +/// \brief EXPERIMENTAL: Type alias for COO sparse tensor +using SparseTensorCOO = SparseTensorImpl; + +/// \brief EXPERIMENTAL: Type alias for CSR sparse matrix +using SparseTensorCSR = SparseTensorImpl; +using SparseMatrixCSR = SparseTensorImpl; + } // namespace arrow #endif // ARROW_SPARSE_TENSOR_H From 880bbc4eb1a47992ea542e48ae67f8fe9e2ae0ac Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 9 Jan 2019 17:48:45 +0900 Subject: [PATCH 38/40] Rename too-verbose function name sparse_tensor_format_id -> format_id --- cpp/src/arrow/compare.cc | 4 ++-- cpp/src/arrow/sparse_tensor-test.cc | 2 +- cpp/src/arrow/sparse_tensor.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 326aac44814..4d218c6f544 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -824,7 +824,7 @@ struct SparseTensorEqualsImpl { template inline bool SparseTensorEqualsImplDispatch(const SparseTensorImpl& left, const SparseTensor& right) { - switch (right.sparse_tensor_format_id()) { + switch (right.format_id()) { case SparseTensorFormat::COO: { const auto& right_coo = checked_cast&>(right); return SparseTensorEqualsImpl::Compare(left, @@ -857,7 +857,7 @@ bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right) { return false; } - switch (left.sparse_tensor_format_id()) { + switch (left.format_id()) { case SparseTensorFormat::COO: { const auto& left_coo = checked_cast&>(left); return SparseTensorEqualsImplDispatch(left_coo, right); diff --git a/cpp/src/arrow/sparse_tensor-test.cc 
b/cpp/src/arrow/sparse_tensor-test.cc index 2e4cd2d0336..d48f2d0229d 100644 --- a/cpp/src/arrow/sparse_tensor-test.cc +++ b/cpp/src/arrow/sparse_tensor-test.cc @@ -34,7 +34,7 @@ namespace arrow { static inline void CheckSparseIndexFormatType(SparseTensorFormat::type expected, const SparseTensor& sparse_tensor) { - ASSERT_EQ(expected, sparse_tensor.sparse_tensor_format_id()); + ASSERT_EQ(expected, sparse_tensor.format_id()); ASSERT_EQ(expected, sparse_tensor.sparse_index()->format_id()); } diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index a60f533bf81..8ace9979060 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -117,7 +117,7 @@ class ARROW_EXPORT SparseTensor { public: virtual ~SparseTensor() = default; - SparseTensorFormat::type sparse_tensor_format_id() const { + SparseTensorFormat::type format_id() const { return sparse_index_->format_id(); } From d57e56fc6c454441b231fb8980d94f5491133a85 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 9 Jan 2019 17:52:15 +0900 Subject: [PATCH 39/40] Merge sparse_tensor_format.h into sparse_tensor.h --- cpp/src/arrow/ipc/metadata-internal.h | 2 +- cpp/src/arrow/sparse_tensor.h | 6 +++++- cpp/src/arrow/sparse_tensor_format.h | 26 -------------------------- 3 files changed, 6 insertions(+), 28 deletions(-) delete mode 100644 cpp/src/arrow/sparse_tensor_format.h diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index 74a1aef8580..4df8050cddb 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -33,7 +33,7 @@ #include "arrow/ipc/dictionary.h" // IYWU pragma: keep #include "arrow/ipc/message.h" #include "arrow/memory_pool.h" -#include "arrow/sparse_tensor_format.h" +#include "arrow/sparse_tensor.h" #include "arrow/status.h" namespace arrow { diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 8ace9979060..69505388330 100644 --- 
a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -22,7 +22,6 @@ #include #include -#include "arrow/sparse_tensor_format.h" #include "arrow/tensor.h" namespace arrow { @@ -30,6 +29,11 @@ namespace arrow { // ---------------------------------------------------------------------- // SparseIndex class +/// \brief EXPERIMENTAL: Sparse tensor format enumeration +struct SparseTensorFormat { + enum type { COO, CSR }; +}; + /// \brief EXPERIMENTAL: The base class for representing index of non-zero /// values in sparse tensor class ARROW_EXPORT SparseIndex { diff --git a/cpp/src/arrow/sparse_tensor_format.h b/cpp/src/arrow/sparse_tensor_format.h deleted file mode 100644 index 813378ff0e9..00000000000 --- a/cpp/src/arrow/sparse_tensor_format.h +++ /dev/null @@ -1,26 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_SPARSE_TENSOR_FORMAT_H -#define ARROW_SPARSE_TENSOR_FORMAT_H - -/// \brief EXPERIMENTAL: Sparse tensor format enumeration -struct SparseTensorFormat { - enum type { COO, CSR }; -}; - -#endif // ARROW_SPARSE_TENSOR_FORMAT_H From 148bff82231d9609dfc446e0541a187abf3fb607 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Wed, 9 Jan 2019 17:53:00 +0900 Subject: [PATCH 40/40] make format --- cpp/src/arrow/compare.cc | 10 ++-- cpp/src/arrow/compare.h | 3 +- cpp/src/arrow/ipc/metadata-internal.cc | 7 ++- cpp/src/arrow/ipc/metadata-internal.h | 3 +- cpp/src/arrow/ipc/reader.cc | 6 ++- cpp/src/arrow/sparse_tensor.cc | 66 +++++++++++++------------- cpp/src/arrow/sparse_tensor.h | 18 +++---- 7 files changed, 58 insertions(+), 55 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 4d218c6f544..114752934c9 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -826,15 +826,17 @@ inline bool SparseTensorEqualsImplDispatch(const SparseTensorImpl&>(right); + const auto& right_coo = + checked_cast&>(right); return SparseTensorEqualsImpl::Compare(left, - right_coo); + right_coo); } case SparseTensorFormat::CSR: { - const auto& right_csr = checked_cast&>(right); + const auto& right_csr = + checked_cast&>(right); return SparseTensorEqualsImpl::Compare(left, - right_csr); + right_csr); } default: diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 331e81bfd3a..d49d7cc0fdb 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -37,8 +37,7 @@ bool ARROW_EXPORT ArrayEquals(const Array& left, const Array& right); bool ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right); /// EXPERIMENTAL: Returns true if the given sparse tensors are exactly equal -bool ARROW_EXPORT SparseTensorEquals(const SparseTensor& left, - const SparseTensor& right); +bool ARROW_EXPORT SparseTensorEquals(const SparseTensor& left, const SparseTensor& right); /// Returns true if the arrays are approximately 
equal. For non-floating point /// types, this is equivalent to ArrayEquals(left, right) diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 8bc6623551b..da6711395f8 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -836,8 +836,8 @@ Status MakeSparseTensorIndex(FBB& fbb, const SparseIndex& sparse_index, return Status::OK(); } -Status MakeSparseTensor(FBB& fbb, const SparseTensor& sparse_tensor, - int64_t body_length, const std::vector& buffers, +Status MakeSparseTensor(FBB& fbb, const SparseTensor& sparse_tensor, int64_t body_length, + const std::vector& buffers, SparseTensorOffset* offset) { flatbuf::Type fb_type_type; Offset fb_type; @@ -872,8 +872,7 @@ Status MakeSparseTensor(FBB& fbb, const SparseTensor& sparse_tensor, return Status::OK(); } -Status WriteSparseTensorMessage(const SparseTensor& sparse_tensor, - int64_t body_length, +Status WriteSparseTensorMessage(const SparseTensor& sparse_tensor, int64_t body_length, const std::vector& buffers, std::shared_ptr* out) { FBB fbb; diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index 4df8050cddb..6562382b878 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -145,8 +145,7 @@ Status WriteRecordBatchMessage(const int64_t length, const int64_t body_length, Status WriteTensorMessage(const Tensor& tensor, const int64_t buffer_start_offset, std::shared_ptr* out); -Status WriteSparseTensorMessage(const SparseTensor& sparse_tensor, - int64_t body_length, +Status WriteSparseTensorMessage(const SparseTensor& sparse_tensor, int64_t body_length, const std::vector& buffers, std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 1207427806b..e856acafd71 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -773,7 +773,8 @@ Status MakeSparseTensorWithSparseCOOIndex( const 
std::vector& dim_names, const std::shared_ptr& sparse_index, int64_t non_zero_length, const std::shared_ptr& data, std::shared_ptr* out) { - *out = std::make_shared>(sparse_index, type, data, shape, dim_names); + *out = std::make_shared>(sparse_index, type, data, + shape, dim_names); return Status::OK(); } @@ -782,7 +783,8 @@ Status MakeSparseTensorWithSparseCSRIndex( const std::vector& dim_names, const std::shared_ptr& sparse_index, int64_t non_zero_length, const std::shared_ptr& data, std::shared_ptr* out) { - *out = std::make_shared>(sparse_index, type, data, shape, dim_names); + *out = std::make_shared>(sparse_index, type, data, + shape, dim_names); return Status::OK(); } diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index eab7cacc211..101500d3643 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -311,10 +311,10 @@ std::string SparseCSRIndex::ToString() const { return std::string("SparseCSRInde // Constructor with all attributes SparseTensor::SparseTensor(const std::shared_ptr& type, - const std::shared_ptr& data, - const std::vector& shape, - const std::shared_ptr& sparse_index, - const std::vector& dim_names) + const std::shared_ptr& data, + const std::vector& shape, + const std::shared_ptr& sparse_index, + const std::vector& dim_names) : type_(type), data_(data), shape_(shape), @@ -346,16 +346,17 @@ bool SparseTensor::Equals(const SparseTensor& other) const { // Constructor with a dense tensor template -SparseTensorImpl::SparseTensorImpl(const std::shared_ptr& type, - const std::vector& shape, - const std::vector& dim_names) +SparseTensorImpl::SparseTensorImpl( + const std::shared_ptr& type, const std::vector& shape, + const std::vector& dim_names) : SparseTensorImpl(nullptr, type, nullptr, shape, dim_names) {} // Constructor with a dense tensor template template SparseTensorImpl::SparseTensorImpl(const NumericTensor& tensor) - : SparseTensorImpl(nullptr, tensor.type(), nullptr, tensor.shape(), 
tensor.dim_names_) { + : SparseTensorImpl(nullptr, tensor.type(), nullptr, tensor.shape(), + tensor.dim_names_) { SparseTensorConverter converter(tensor); DCHECK_OK(converter.Convert()); sparse_index_ = converter.sparse_index; @@ -365,7 +366,8 @@ SparseTensorImpl::SparseTensorImpl(const NumericTensor& t // Constructor with a dense tensor template SparseTensorImpl::SparseTensorImpl(const Tensor& tensor) - : SparseTensorImpl(nullptr, tensor.type(), nullptr, tensor.shape(), tensor.dim_names_) { + : SparseTensorImpl(nullptr, tensor.type(), nullptr, tensor.shape(), + tensor.dim_names_) { switch (tensor.type()->id()) { case Type::UINT8: MakeSparseTensorFromTensor(tensor, &sparse_index_, @@ -419,29 +421,29 @@ SparseTensorImpl::SparseTensorImpl(const Tensor& tensor) // ---------------------------------------------------------------------- // Instantiate templates -#define INSTANTIATE_SPARSE_TENSOR(IndexType) \ - template class ARROW_TEMPLATE_EXPORT SparseTensorImpl; \ - template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ - const NumericTensor&); \ - template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ - const NumericTensor&); \ - template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ - const NumericTensor&); \ - template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ - const NumericTensor&); \ - template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ - const NumericTensor&); \ - template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ - const NumericTensor&); \ - template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ - const NumericTensor&); \ - template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ - const NumericTensor&); \ - template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ - const NumericTensor&); \ - template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ - const NumericTensor&); \ - template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ +#define INSTANTIATE_SPARSE_TENSOR(IndexType) \ + template class 
ARROW_TEMPLATE_EXPORT SparseTensorImpl; \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ const NumericTensor&) INSTANTIATE_SPARSE_TENSOR(SparseCOOIndex); diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 69505388330..c7693d2ec95 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -121,9 +121,7 @@ class ARROW_EXPORT SparseTensor { public: virtual ~SparseTensor() = default; - SparseTensorFormat::type format_id() const { - return sparse_index_->format_id(); - } + SparseTensorFormat::type format_id() const { return sparse_index_->format_id(); } std::shared_ptr type() const { return type_; } std::shared_ptr data() const { return data_; } @@ -154,8 +152,8 @@ class ARROW_EXPORT SparseTensor { protected: // Constructor with all attributes - SparseTensor(const std::shared_ptr& type, - const std::shared_ptr& data, const std::vector& shape, + SparseTensor(const std::shared_ptr& type, const std::shared_ptr& data, + const std::vector& shape, const std::shared_ptr& sparse_index, const std::vector& dim_names); @@ -171,7 +169,8 @@ 
class ARROW_EXPORT SparseTensor { // ---------------------------------------------------------------------- // SparseTensorImpl class -/// \brief EXPERIMENTAL: Concrete sparse tensor implementation classes with sparse index type +/// \brief EXPERIMENTAL: Concrete sparse tensor implementation classes with sparse index +/// type template class ARROW_EXPORT SparseTensorImpl : public SparseTensor { public: @@ -179,13 +178,14 @@ class ARROW_EXPORT SparseTensorImpl : public SparseTensor { // Constructor with all attributes SparseTensorImpl(const std::shared_ptr& sparse_index, - const std::shared_ptr& type, const std::shared_ptr& data, - const std::vector& shape, + const std::shared_ptr& type, + const std::shared_ptr& data, const std::vector& shape, const std::vector& dim_names) : SparseTensor(type, data, shape, sparse_index, dim_names) {} // Constructor for empty sparse tensor - SparseTensorImpl(const std::shared_ptr& type, const std::vector& shape, + SparseTensorImpl(const std::shared_ptr& type, + const std::vector& shape, const std::vector& dim_names = {}); // Constructor with a dense numeric tensor