From bf16802ef7a26c88b9f6400bc09cc600b89091cf Mon Sep 17 00:00:00 2001 From: Rok Date: Thu, 22 Oct 2020 17:34:35 +0200 Subject: [PATCH 01/39] Proposal for TensorArray. --- cpp/src/arrow/CMakeLists.txt | 3 +- cpp/src/arrow/extension/tensor_array.cc | 260 ++++++++++++++++++++++++ cpp/src/arrow/extension/tensor_array.h | 87 ++++++++ cpp/src/arrow/extension_type.cc | 4 + cpp/src/arrow/extension_type_test.cc | 141 +++++++++++++ 5 files changed, 494 insertions(+), 1 deletion(-) create mode 100644 cpp/src/arrow/extension/tensor_array.cc create mode 100644 cpp/src/arrow/extension/tensor_array.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 3825e274ee9..145adecc37a 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -558,7 +558,8 @@ if(ARROW_JSON) json/object_parser.cc json/object_writer.cc json/parser.cc - json/reader.cc) + json/reader.cc + extension/tensor_array.cc) endif() if(ARROW_ORC) diff --git a/cpp/src/arrow/extension/tensor_array.cc b/cpp/src/arrow/extension/tensor_array.cc new file mode 100644 index 00000000000..9e432a44a43 --- /dev/null +++ b/cpp/src/arrow/extension/tensor_array.cc @@ -0,0 +1,260 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension/tensor_array.h" + +#include "arrow/array/array_nested.h" +#include "arrow/array/array_primitive.h" +#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep +#include "arrow/tensor.h" +#include "arrow/util/logging.h" +#include "arrow/util/sort.h" + +#include +#include + +namespace rj = arrow::rapidjson; + +namespace arrow { +namespace extension { + +std::string FixedShapeTensorType::extension_name() const { + return "arrow.fixed_shape_tensor"; +} + +size_t FixedShapeTensorType::ndim() const { return shape_.size(); } + +std::vector FixedShapeTensorType::shape() const { return shape_; } + +std::vector FixedShapeTensorType::strides() const { + std::vector strides; + const auto& value_type = internal::checked_cast(*value_type_); + DCHECK_OK(internal::ComputeRowMajorStrides(value_type, shape_, &strides)); + internal::Permute(permutation_, &strides); + return strides; +} + +std::vector FixedShapeTensorType::permutation() const { return permutation_; } + +std::vector FixedShapeTensorType::dim_names() const { return dim_names_; } + +bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const { + if (extension_name() != other.extension_name()) { + return false; + } + const auto& other_ext = static_cast(other); + bool equals = storage_type()->Equals(other_ext.storage_type()); + equals &= shape_ == other_ext.shape(); + equals &= permutation_ == other_ext.permutation(); + equals &= dim_names_ == other_ext.dim_names(); + return equals; +} + +std::string FixedShapeTensorType::Serialize() const { + rj::Document document; + document.SetObject(); + rj::Document::AllocatorType& allocator = document.GetAllocator(); + + rj::Value shape(rj::kArrayType); + for (auto v : shape_) { + shape.PushBack(v, allocator); + } + document.AddMember(rj::Value("shape", allocator), shape, allocator); + + if (!permutation_.empty()) { + rj::Value permutation(rj::kArrayType); + for (auto v : permutation_) { + permutation.PushBack(v, allocator); + } + document.AddMember(rj::Value("permutation", allocator), permutation, allocator); + } + + if (!dim_names_.empty()) { + rj::Value dim_names(rj::kArrayType); + for (std::string v : dim_names_) { + dim_names.PushBack(rj::Value{}.SetString(v.c_str(), allocator), allocator); + } + document.AddMember(rj::Value("dim_names", allocator), dim_names, allocator); + } + + rj::StringBuffer buffer; + rj::Writer writer(buffer); + document.Accept(writer); + return buffer.GetString(); +} + +Result> FixedShapeTensorType::Deserialize( + std::shared_ptr storage_type, const std::string& serialized_data) const { + auto value_type = + internal::checked_pointer_cast(storage_type)->value_type(); + rj::Document document; + if (document.Parse(serialized_data.data(), serialized_data.length()).HasParseError() || + !document.HasMember("shape") || !document["shape"].IsArray()) { + return Status::Invalid("Invalid serialized JSON data: ", serialized_data); + } + + std::vector shape; + for (auto& x : document["shape"].GetArray()) { + shape.emplace_back(x.GetInt64()); + } + std::vector permutation; + if (document.HasMember("permutation")) { + for (auto& x : document["permutation"].GetArray()) { + permutation.emplace_back(x.GetInt64()); + } + ARROW_CHECK_EQ(shape.size(), permutation.size()) << "Invalid permutation"; + } + std::vector dim_names; + if (document.HasMember("dim_names")) { + for (auto& x : document["dim_names"].GetArray()) { + dim_names.emplace_back(x.GetString()); + } + ARROW_CHECK_EQ(shape.size(), dim_names.size()) << "Invalid dim_names"; + } + + return tensor_array(value_type, shape, permutation, dim_names); +} + +std::shared_ptr FixedShapeTensorType::MakeArray( + std::shared_ptr data) const { + return std::make_shared(data); +} + +Result> FixedShapeTensorType::MakeArray( + std::shared_ptr tensor) const { + auto permutation = internal::ArgSort(tensor->strides()); + std::reverse(permutation.begin(), permutation.end()); + if (permutation[0] != 0) { + return Status::Invalid( + "Only first-major tensors can be zero-copy converted to arrays"); + } + + auto cell_shape = tensor->shape(); + cell_shape.erase(cell_shape.begin()); + permutation.erase(permutation.begin()); + + auto ext_type = + tensor_array(tensor->type(), cell_shape, permutation, tensor->dim_names()); + + std::shared_ptr arr; + std::shared_ptr value_array; + switch (tensor->type_id()) { + case Type::UINT8: { + value_array = std::make_shared(tensor->size(), tensor->data()); + break; + } + case Type::INT8: { + value_array = std::make_shared(tensor->size(), tensor->data()); + break; + } + case Type::UINT16: { + value_array = std::make_shared(tensor->size(), tensor->data()); + break; + } + case Type::INT16: { + value_array = std::make_shared(tensor->size(), tensor->data()); + break; + } + case Type::UINT32: { + value_array = std::make_shared(tensor->size(), tensor->data()); + break; + } + case Type::INT32: { + value_array = std::make_shared(tensor->size(), tensor->data()); + break; + } + case Type::UINT64: { + value_array = std::make_shared(tensor->size(), tensor->data()); + break; + } + case Type::INT64: { + value_array = std::make_shared(tensor->size(), tensor->data()); + break; + } + case Type::HALF_FLOAT: { + value_array = std::make_shared(tensor->size(), tensor->data()); + break; + } + case Type::FLOAT: { + value_array = std::make_shared(tensor->size(), tensor->data()); + break; + } + case Type::DOUBLE: { + value_array = std::make_shared(tensor->size(), tensor->data()); + break; + } + default: { + return Status::NotImplemented("Unsupported tensor type: ", + tensor->type()->ToString()); + } + } + arr = std::make_shared(ext_type->storage_type(), tensor->shape()[0], + value_array); + auto ext_data = arr->data(); + ext_data->type = ext_type; + return MakeArray(ext_data); +} + +Result> FixedShapeTensorType::ToTensor( + std::shared_ptr arr) const { + // To convert an array of n dimensional tensors to a n+1 dimensional tensor we + // interpret the array's length as the first dimension the new tensor. Further, we + // define n+1 dimensional tensor's strides by front appending a new stride to the n + // dimensional tensor's strides. + + ARROW_DCHECK_EQ(arr->null_count(), 0) << "Null values not supported in tensors."; + auto ext_arr = internal::checked_pointer_cast( + internal::checked_pointer_cast(arr)->storage()); + + std::vector shape = shape_; + shape.insert(shape.begin(), 1, arr->length()); + + std::vector tensor_strides = strides(); + tensor_strides.insert(tensor_strides.begin(), 1, arr->length() * tensor_strides[0]); + + std::shared_ptr buffer = ext_arr->values()->data()->buffers[1]; + return *Tensor::Make(ext_arr->value_type(), buffer, shape, tensor_strides, dim_names()); +} + +std::shared_ptr FixedShapeTensorType::get_storage_type( + const std::shared_ptr& value_type, + const std::vector& shape) const { + const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), + std::multiplies<>()); + return fixed_size_list(value_type, static_cast(size)); +} + +std::shared_ptr tensor_array( + const std::shared_ptr& value_type, const std::vector& shape, + const std::vector& permutation, const std::vector& dim_names) { + ARROW_CHECK(is_tensor_supported(value_type->id())); + if (!permutation.empty()) { + ARROW_CHECK_EQ(shape.size(), permutation.size()) + << "permutation.size() == " << permutation.size() + << " must be empty or have the same length as shape.size() " << shape.size(); + } + if (!dim_names.empty()) { + ARROW_CHECK_EQ(shape.size(), dim_names.size()) + << "dim_names.size() == " << dim_names.size() + << " must be empty or have the same length as shape.size() " << shape.size(); + } + return std::make_shared(value_type, shape, permutation, + dim_names); +} + +} // namespace extension +} // namespace arrow diff --git a/cpp/src/arrow/extension/tensor_array.h b/cpp/src/arrow/extension/tensor_array.h new file mode 100644 index 00000000000..8c055013bd8 --- /dev/null +++ b/cpp/src/arrow/extension/tensor_array.h @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "arrow/extension_type.h" + +namespace arrow { +namespace extension { + +class ARROW_EXPORT FixedShapeTensor : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +/// \brief Concrete type class for constant-size Tensor data. +class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { + public: + FixedShapeTensorType(const std::shared_ptr& value_type, + const std::vector& shape, + const std::vector& permutation = {}, + const std::vector& dim_names = {}) + : ExtensionType(get_storage_type(value_type, shape)), + value_type_(value_type), + shape_(shape), + permutation_(permutation), + dim_names_(dim_names) {} + + std::string extension_name() const override; + + size_t ndim() const; + + std::vector shape() const; + + std::vector strides() const; + + std::vector permutation() const; + + std::vector dim_names() const; + + bool ExtensionEquals(const ExtensionType& other) const override; + + std::string Serialize() const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + Result> MakeArray(std::shared_ptr tensor) const; + + Result> ToTensor(std::shared_ptr arr) const; + + private: + std::shared_ptr get_storage_type(const std::shared_ptr& value_type, + const std::vector& shape) const; + std::shared_ptr storage_type_; + std::shared_ptr value_type_; + std::vector shape_; + std::vector permutation_; + std::vector dim_names_; +}; + +/// \brief Return a TensorArrayType instance. +ARROW_EXPORT std::shared_ptr tensor_array( + const std::shared_ptr& storage_type, const std::vector& shape, + const std::vector& permutation = {}, + const std::vector& dim_names = {}); + +} // namespace extension +} // namespace arrow diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index e579b691023..b7fcba62ece 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -26,6 +26,7 @@ #include "arrow/array/util.h" #include "arrow/chunked_array.h" +#include "arrow/extension/tensor_array.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" @@ -139,6 +140,9 @@ namespace internal { static void CreateGlobalRegistry() { g_registry = std::make_shared(); + + // Register canonical extension types + ARROW_CHECK_OK(g_registry->RegisterType(extension::tensor_array(int64(), {0}))); } } // namespace internal diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index 31222d74806..33af6285f17 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -23,9 +23,12 @@ #include #include +#include "arrow/testing/matchers.h" #include "arrow/array/array_nested.h" +#include "arrow/array/array_primitive.h" #include "arrow/array/util.h" +#include "arrow/extension/tensor_array.h" #include "arrow/extension_type.h" #include "arrow/io/memory.h" #include "arrow/ipc/options.h" @@ -33,6 +36,7 @@ #include "arrow/ipc/writer.h" #include "arrow/record_batch.h" #include "arrow/status.h" +#include "arrow/tensor.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" @@ -333,4 +337,141 @@ TEST_F(TestExtensionType, ValidateExtensionArray) { ASSERT_OK(ext_arr4->ValidateFull()); } +TEST_F(TestExtensionType, FixedShapeTensorType) { + using FixedShapeTensorType = extension::FixedShapeTensorType; + + std::vector shape = {3, 3, 4}; + std::vector cell_shape = {3, 4}; + auto value_type = int64(); + std::shared_ptr cell_type = fixed_size_list(value_type, 12); + + std::vector dim_names = {"x", "y"}; + std::vector strides = {96, 32, 8}; + std::vector column_major_strides = {8, 24, 72}; + std::vector neither_major_strides = {96, 8, 32}; + std::vector cell_strides = {32, 8}; + std::vector values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; + std::vector values_partial = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; + std::vector shape_partial = {2, 3, 4}; + std::string serialized = R"({"shape":[3,4],"dim_names":["x","y"]})"; + + ASSERT_OK_AND_ASSIGN(auto tensor, + Tensor::Make(value_type, Buffer::Wrap(values), shape)); + ASSERT_OK_AND_ASSIGN( + auto tensor_partial, + Tensor::Make(value_type, Buffer::Wrap(values_partial), shape_partial)); + + std::shared_ptr ext_type = + extension::tensor_array(value_type, cell_shape, {}, dim_names); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); + ASSERT_OK_AND_ASSIGN(auto ds, + ext_type->Deserialize(ext_type->storage_type(), serialized)); + std::shared_ptr deserialized = + std::reinterpret_pointer_cast(ds); + + ASSERT_TRUE(tensor->is_row_major()); + ASSERT_EQ(tensor->strides(), strides); + ASSERT_EQ(tensor_partial->strides(), strides); + + // Test ExtensionType methods + ASSERT_EQ(ext_type->extension_name(), "arrow.fixed_shape_tensor"); + ASSERT_TRUE(ext_type->ExtensionEquals(*exact_ext_type)); + ASSERT_TRUE(ext_type->storage_type()->Equals(*cell_type)); + ASSERT_EQ(ext_type->Serialize(), serialized); + ASSERT_TRUE(deserialized->ExtensionEquals(*ext_type)); + ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); + + // Test FixedShapeTensorType methods + ASSERT_EQ(exact_ext_type->ndim(), cell_shape.size()); + ASSERT_EQ(exact_ext_type->shape(), cell_shape); + ASSERT_EQ(exact_ext_type->strides(), cell_strides); + ASSERT_EQ(exact_ext_type->dim_names(), dim_names); + + // Test MakeArray(std::shared_ptr data) + std::vector> buffers = {nullptr, Buffer::Wrap(values)}; + auto arr_data = std::make_shared(value_type, values.size(), buffers, 0, 0); + auto arr = std::make_shared(arr_data); + EXPECT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type)); + auto data = fsla_arr->data(); + data->type = ext_type; + auto ext_arr = exact_ext_type->MakeArray(data); + ASSERT_EQ(ext_arr->length(), shape[0]); + ASSERT_EQ(ext_arr->null_count(), 0); + + // Test MakeArray(std::shared_ptr tensor) + EXPECT_OK_AND_ASSIGN(auto ext_arr_partial, exact_ext_type->MakeArray(tensor_partial)); + ASSERT_OK(ext_arr->ValidateFull()); + ASSERT_OK(ext_arr_partial->ValidateFull()); + + // Test ToTensor(std::shared_ptr array) + EXPECT_OK_AND_ASSIGN(auto t, exact_ext_type->ToTensor(ext_arr)); + ASSERT_EQ(t->shape(), tensor->shape()); + ASSERT_EQ(t->strides(), tensor->strides()); + ASSERT_TRUE(tensor->Equals(*t)); + + // Test slicing + auto sliced = internal::checked_pointer_cast(ext_arr->Slice(0, 2)); + auto partial = internal::checked_pointer_cast(ext_arr_partial); + ASSERT_OK(sliced->ValidateFull()); + ASSERT_TRUE(sliced->storage()->Equals(*partial->storage())); + ASSERT_EQ(sliced->length(), partial->length()); + + ASSERT_OK(UnregisterExtensionType(ext_type->extension_name())); + ASSERT_OK(RegisterExtensionType(ext_type)); + auto ext_metadata = + key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, + {"ARROW:extension:metadata", serialized}}); + auto ext_field = field("f0", exact_ext_type, true, ext_metadata); + auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); + std::shared_ptr read_batch; + RoundtripBatch(batch, &read_batch); + CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); + ASSERT_OK(UnregisterExtensionType(ext_type->extension_name())); + + auto ext_type_1 = extension::tensor_array(int64(), cell_shape, {}, dim_names); + auto ext_type_2 = extension::tensor_array(int64(), cell_shape, {}, dim_names); + auto ext_type_3 = extension::tensor_array(int32(), cell_shape, {}, dim_names); + ASSERT_TRUE(ext_type_1->ExtensionEquals(*ext_type_2)); + ASSERT_FALSE(ext_type_1->ExtensionEquals(*ext_type_3)); + + auto ext_type_4 = extension::tensor_array(int64(), {3, 4, 7}, {}, {"x", "y", "z"}); + ASSERT_EQ(ext_type_4->strides(), (std::vector{224, 56, 8})); + ext_type_4 = extension::tensor_array(int64(), {3, 4, 7}, {0, 1, 2}, {"x", "y", "z"}); + ASSERT_EQ(ext_type_4->strides(), (std::vector{224, 56, 8})); + + auto ext_type_5 = extension::tensor_array(int64(), {3, 4, 7}, {1, 0, 2}); + ASSERT_EQ(ext_type_5->strides(), (std::vector{56, 224, 8})); + ASSERT_EQ(ext_type_5->Serialize(), R"({"shape":[3,4,7],"permutation":[1,0,2]})"); + + auto ext_type_6 = extension::tensor_array(int64(), {3, 4, 7}, {1, 2, 0}, {}); + ASSERT_EQ(ext_type_6->strides(), (std::vector{56, 8, 224})); + ASSERT_EQ(ext_type_6->Serialize(), R"({"shape":[3,4,7],"permutation":[1,2,0]})"); + + auto ext_type_7 = extension::tensor_array(int64(), {3, 4, 7}, {2, 0, 1}, {}); + ASSERT_EQ(ext_type_7->strides(), (std::vector{8, 224, 56})); + ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); + + auto ext_type_8 = extension::tensor_array(int64(), {3, 4}, {0, 1}); + EXPECT_OK_AND_ASSIGN(auto ext_arr_8, ext_type_8->MakeArray(tensor)); + + ASSERT_OK_AND_ASSIGN( + auto column_major_tensor, + Tensor::Make(value_type, Buffer::Wrap(values), shape, column_major_strides)); + auto ext_type_9 = extension::tensor_array(int64(), {3, 4}, {0, 1}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr( + "Invalid: Only first-major tensors can be zero-copy converted to arrays"), + ext_type_9->MakeArray(column_major_tensor)); + ASSERT_THAT(ext_type_9->MakeArray(column_major_tensor), Raises(StatusCode::Invalid)); + + auto neither_major_tensor = std::make_shared(value_type, Buffer::Wrap(values), + shape, neither_major_strides); + auto ext_type_10 = extension::tensor_array(int64(), {3, 4}, {1, 0}); + ASSERT_OK_AND_ASSIGN(auto ext_arr_10, ext_type_10->MakeArray(neither_major_tensor)); +} + } // namespace arrow From 37ac1a7604fbcc09f90bc3874e2c7af0fa1ff7bc Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 21 Feb 2023 21:12:36 +0100 Subject: [PATCH 02/39] Put register extension behind ARROW_WITH_JSON check. --- cpp/src/arrow/CMakeLists.txt | 9 +- ...{tensor_array.cc => fixed_shape_tensor.cc} | 13 +- .../{tensor_array.h => fixed_shape_tensor.h} | 2 +- .../extension/fixed_shape_tensor_test.cc | 188 ++++++++++++++++++ cpp/src/arrow/extension_type.cc | 9 +- cpp/src/arrow/extension_type_test.cc | 141 ------------- 6 files changed, 211 insertions(+), 151 deletions(-) rename cpp/src/arrow/extension/{tensor_array.cc => fixed_shape_tensor.cc} (96%) rename cpp/src/arrow/extension/{tensor_array.h => fixed_shape_tensor.h} (97%) create mode 100644 cpp/src/arrow/extension/fixed_shape_tensor_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 145adecc37a..fb0f7f1b7a0 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -549,8 +549,10 @@ if(ARROW_IPC) endif() if(ARROW_JSON) + add_definitions(-DARROW_WITH_JSON) list(APPEND ARROW_SRCS + extension/fixed_shape_tensor.cc json/options.cc json/chunked_builder.cc json/chunker.cc @@ -558,8 +560,7 @@ if(ARROW_JSON) json/object_parser.cc json/object_writer.cc json/parser.cc - json/reader.cc - extension/tensor_array.cc) + json/reader.cc) endif() if(ARROW_ORC) @@ -806,6 +807,10 @@ if(ARROW_IPC) add_arrow_test(extension_type_test) endif() +if(ARROW_JSON) + add_arrow_test(canonical_extension_test SOURCES extension/fixed_shape_tensor_test.cc) +endif() + add_arrow_test(misc_test SOURCES datum_test.cc diff --git a/cpp/src/arrow/extension/tensor_array.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc similarity index 96% rename from cpp/src/arrow/extension/tensor_array.cc rename to cpp/src/arrow/extension/fixed_shape_tensor.cc index 9e432a44a43..6dc4bb6752e 100644 --- a/cpp/src/arrow/extension/tensor_array.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/extension/tensor_array.h" +#include "arrow/extension/fixed_shape_tensor.h" #include "arrow/array/array_nested.h" #include "arrow/array/array_primitive.h" @@ -44,7 +44,9 @@ std::vector FixedShapeTensorType::strides() const { std::vector strides; const auto& value_type = internal::checked_cast(*value_type_); DCHECK_OK(internal::ComputeRowMajorStrides(value_type, shape_, &strides)); - internal::Permute(permutation_, &strides); + if (!permutation_.empty()) { + internal::Permute(permutation_, &strides); + } return strides; } @@ -126,7 +128,7 @@ Result> FixedShapeTensorType::Deserialize( ARROW_CHECK_EQ(shape.size(), dim_names.size()) << "Invalid dim_names"; } - return tensor_array(value_type, shape, permutation, dim_names); + return fixed_shape_tensor(value_type, shape, permutation, dim_names); } std::shared_ptr FixedShapeTensorType::MakeArray( @@ -148,7 +150,7 @@ Result> FixedShapeTensorType::MakeArray( permutation.erase(permutation.begin()); auto ext_type = - tensor_array(tensor->type(), cell_shape, permutation, tensor->dim_names()); + fixed_shape_tensor(tensor->type(), cell_shape, permutation, tensor->dim_names()); std::shared_ptr arr; std::shared_ptr value_array; @@ -238,10 +240,11 @@ std::shared_ptr FixedShapeTensorType::get_storage_type( return fixed_size_list(value_type, static_cast(size)); } -std::shared_ptr tensor_array( +std::shared_ptr fixed_shape_tensor( const std::shared_ptr& value_type, const std::vector& shape, const std::vector& permutation, const std::vector& dim_names) { ARROW_CHECK(is_tensor_supported(value_type->id())); + if (!permutation.empty()) { ARROW_CHECK_EQ(shape.size(), permutation.size()) << "permutation.size() == " << permutation.size() diff --git a/cpp/src/arrow/extension/tensor_array.h b/cpp/src/arrow/extension/fixed_shape_tensor.h similarity index 97% rename from cpp/src/arrow/extension/tensor_array.h rename to cpp/src/arrow/extension/fixed_shape_tensor.h index 8c055013bd8..1268c013e4e 100644 --- a/cpp/src/arrow/extension/tensor_array.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -78,7 +78,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { }; /// \brief Return a TensorArrayType instance. -ARROW_EXPORT std::shared_ptr tensor_array( +ARROW_EXPORT std::shared_ptr fixed_shape_tensor( const std::shared_ptr& storage_type, const std::vector& shape, const std::vector& permutation = {}, const std::vector& dim_names = {}); diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc new file mode 100644 index 00000000000..fe212460f49 --- /dev/null +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension/fixed_shape_tensor.h" + +#include "arrow/testing/matchers.h" + +#include "arrow/array/array_nested.h" +#include "arrow/array/array_primitive.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" +#include "arrow/record_batch.h" +#include "arrow/tensor.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/key_value_metadata.h" + +namespace arrow { + +class TestExtensionType : public ::testing::Test {}; + +auto RoundtripBatch = [](const std::shared_ptr& batch, + std::shared_ptr* out) { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + ASSERT_OK(batch_reader->ReadNext(out)); +}; + +TEST_F(TestExtensionType, FixedShapeTensorType) { + using FixedShapeTensorType = extension::FixedShapeTensorType; + using arrow::extension::fixed_shape_tensor; + + std::vector shape = {3, 3, 4}; + std::vector cell_shape = {3, 4}; + auto value_type = int64(); + std::shared_ptr cell_type = fixed_size_list(value_type, 12); + + std::vector dim_names = {"x", "y"}; + std::vector strides = {96, 32, 8}; + std::vector column_major_strides = {8, 24, 72}; + std::vector neither_major_strides = {96, 8, 32}; + std::vector cell_strides = {32, 8}; + std::vector values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; + std::vector values_partial = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; + std::vector shape_partial = {2, 3, 4}; + std::string serialized = R"({"shape":[3,4],"dim_names":["x","y"]})"; + + ASSERT_OK_AND_ASSIGN(auto tensor, + Tensor::Make(value_type, Buffer::Wrap(values), shape)); + ASSERT_OK_AND_ASSIGN( + auto tensor_partial, + Tensor::Make(value_type, Buffer::Wrap(values_partial), shape_partial)); + + std::shared_ptr ext_type = + fixed_shape_tensor(value_type, cell_shape, {}, dim_names); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); + ASSERT_OK_AND_ASSIGN(auto ds, + ext_type->Deserialize(ext_type->storage_type(), serialized)); + std::shared_ptr deserialized = + std::reinterpret_pointer_cast(ds); + + ASSERT_TRUE(tensor->is_row_major()); + ASSERT_EQ(tensor->strides(), strides); + ASSERT_EQ(tensor_partial->strides(), strides); + + // Test ExtensionType methods + ASSERT_EQ(ext_type->extension_name(), "arrow.fixed_shape_tensor"); + ASSERT_TRUE(ext_type->ExtensionEquals(*exact_ext_type)); + ASSERT_TRUE(ext_type->storage_type()->Equals(*cell_type)); + ASSERT_EQ(ext_type->Serialize(), serialized); + ASSERT_TRUE(deserialized->ExtensionEquals(*ext_type)); + ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); + + // Test FixedShapeTensorType methods + ASSERT_EQ(exact_ext_type->ndim(), cell_shape.size()); + ASSERT_EQ(exact_ext_type->shape(), cell_shape); + ASSERT_EQ(exact_ext_type->strides(), cell_strides); + ASSERT_EQ(exact_ext_type->dim_names(), dim_names); + + // Test MakeArray(std::shared_ptr data) + std::vector> buffers = {nullptr, Buffer::Wrap(values)}; + auto arr_data = std::make_shared(value_type, values.size(), buffers, 0, 0); + auto arr = std::make_shared(arr_data); + EXPECT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type)); + auto data = fsla_arr->data(); + data->type = ext_type; + auto ext_arr = exact_ext_type->MakeArray(data); + ASSERT_EQ(ext_arr->length(), shape[0]); + ASSERT_EQ(ext_arr->null_count(), 0); + + // Test MakeArray(std::shared_ptr tensor) + EXPECT_OK_AND_ASSIGN(auto ext_arr_partial, exact_ext_type->MakeArray(tensor_partial)); + ASSERT_OK(ext_arr->ValidateFull()); + ASSERT_OK(ext_arr_partial->ValidateFull()); + + // Test ToTensor(std::shared_ptr array) + EXPECT_OK_AND_ASSIGN(auto t, exact_ext_type->ToTensor(ext_arr)); + ASSERT_EQ(t->shape(), tensor->shape()); + ASSERT_EQ(t->strides(), tensor->strides()); + ASSERT_TRUE(tensor->Equals(*t)); + + // Test slicing + auto sliced = internal::checked_pointer_cast(ext_arr->Slice(0, 2)); + auto partial = internal::checked_pointer_cast(ext_arr_partial); + ASSERT_OK(sliced->ValidateFull()); + ASSERT_TRUE(sliced->storage()->Equals(*partial->storage())); + ASSERT_EQ(sliced->length(), partial->length()); + + ASSERT_OK(UnregisterExtensionType(ext_type->extension_name())); + ASSERT_OK(RegisterExtensionType(ext_type)); + auto ext_metadata = + key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, + {"ARROW:extension:metadata", serialized}}); + auto ext_field = field("f0", exact_ext_type, true, ext_metadata); + auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); + std::shared_ptr read_batch; + RoundtripBatch(batch, &read_batch); + CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); + ASSERT_OK(UnregisterExtensionType(ext_type->extension_name())); + + auto ext_type_1 = fixed_shape_tensor(int64(), cell_shape, {}, dim_names); + auto ext_type_2 = fixed_shape_tensor(int64(), cell_shape, {}, dim_names); + auto ext_type_3 = fixed_shape_tensor(int32(), cell_shape, {}, dim_names); + ASSERT_TRUE(ext_type_1->ExtensionEquals(*ext_type_2)); + ASSERT_FALSE(ext_type_1->ExtensionEquals(*ext_type_3)); + + auto ext_type_4 = fixed_shape_tensor(int64(), {3, 4, 7}, {}, {"x", "y", "z"}); + ASSERT_EQ(ext_type_4->strides(), (std::vector{224, 56, 8})); + ext_type_4 = fixed_shape_tensor(int64(), {3, 4, 7}, {0, 1, 2}, {"x", "y", "z"}); + ASSERT_EQ(ext_type_4->strides(), (std::vector{224, 56, 8})); + + auto ext_type_5 = fixed_shape_tensor(int64(), {3, 4, 7}, {1, 0, 2}); + ASSERT_EQ(ext_type_5->strides(), (std::vector{56, 224, 8})); + ASSERT_EQ(ext_type_5->Serialize(), R"({"shape":[3,4,7],"permutation":[1,0,2]})"); + + auto ext_type_6 = fixed_shape_tensor(int64(), {3, 4, 7}, {1, 2, 0}, {}); + ASSERT_EQ(ext_type_6->strides(), (std::vector{56, 8, 224})); + ASSERT_EQ(ext_type_6->Serialize(), R"({"shape":[3,4,7],"permutation":[1,2,0]})"); + + auto ext_type_7 = fixed_shape_tensor(int64(), {3, 4, 7}, {2, 0, 1}, {}); + ASSERT_EQ(ext_type_7->strides(), (std::vector{8, 224, 56})); + ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); + + auto ext_type_8 = fixed_shape_tensor(int64(), {3, 4}, {0, 1}); + EXPECT_OK_AND_ASSIGN(auto ext_arr_8, ext_type_8->MakeArray(tensor)); + + ASSERT_OK_AND_ASSIGN( + auto column_major_tensor, + Tensor::Make(value_type, Buffer::Wrap(values), shape, column_major_strides)); + auto ext_type_9 = fixed_shape_tensor(int64(), {3, 4}, {0, 1}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr( + "Invalid: Only first-major tensors can be zero-copy converted to arrays"), + ext_type_9->MakeArray(column_major_tensor)); + ASSERT_THAT(ext_type_9->MakeArray(column_major_tensor), Raises(StatusCode::Invalid)); + + auto neither_major_tensor = std::make_shared(value_type, Buffer::Wrap(values), + shape, neither_major_strides); + auto ext_type_10 = fixed_shape_tensor(int64(), {3, 4}, {1, 0}); + ASSERT_OK_AND_ASSIGN(auto ext_arr_10, ext_type_10->MakeArray(neither_major_tensor)); +} + +} // namespace arrow diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index b7fcba62ece..44bb0afa29f 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -26,7 +26,9 @@ #include "arrow/array/util.h" #include "arrow/chunked_array.h" -#include "arrow/extension/tensor_array.h" +#ifdef ARROW_WITH_JSON +#include "arrow/extension/fixed_shape_tensor.h" +#endif #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" @@ -141,8 +143,11 @@ namespace internal { static void CreateGlobalRegistry() { g_registry = std::make_shared(); +#ifdef ARROW_WITH_JSON // Register canonical extension types - ARROW_CHECK_OK(g_registry->RegisterType(extension::tensor_array(int64(), {0}))); + ARROW_CHECK_OK( + g_registry->RegisterType(arrow::extension::fixed_shape_tensor(int64(), {}))); +#endif } } // namespace internal diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index 33af6285f17..31222d74806 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -23,12 +23,9 @@ #include #include -#include "arrow/testing/matchers.h" #include "arrow/array/array_nested.h" -#include "arrow/array/array_primitive.h" #include "arrow/array/util.h" -#include "arrow/extension/tensor_array.h" #include "arrow/extension_type.h" #include "arrow/io/memory.h" #include "arrow/ipc/options.h" @@ -36,7 +33,6 @@ #include "arrow/ipc/writer.h" #include "arrow/record_batch.h" #include "arrow/status.h" -#include "arrow/tensor.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" @@ -337,141 +333,4 @@ TEST_F(TestExtensionType, ValidateExtensionArray) { ASSERT_OK(ext_arr4->ValidateFull()); } -TEST_F(TestExtensionType, FixedShapeTensorType) { - using FixedShapeTensorType = extension::FixedShapeTensorType; - - std::vector shape = {3, 3, 4}; - std::vector cell_shape = {3, 4}; - auto value_type = int64(); - std::shared_ptr cell_type = fixed_size_list(value_type, 12); - - std::vector dim_names = {"x", "y"}; - std::vector strides = {96, 32, 8}; - std::vector column_major_strides = {8, 24, 72}; - std::vector neither_major_strides = {96, 8, 32}; - std::vector cell_strides = {32, 8}; - std::vector values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; - std::vector values_partial = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; - std::vector shape_partial = {2, 3, 4}; - std::string serialized = R"({"shape":[3,4],"dim_names":["x","y"]})"; - - ASSERT_OK_AND_ASSIGN(auto tensor, - Tensor::Make(value_type, Buffer::Wrap(values), shape)); - ASSERT_OK_AND_ASSIGN( - auto tensor_partial, - Tensor::Make(value_type, Buffer::Wrap(values_partial), shape_partial)); - - std::shared_ptr ext_type = - extension::tensor_array(value_type, cell_shape, {}, dim_names); - auto exact_ext_type = internal::checked_pointer_cast(ext_type); - ASSERT_OK_AND_ASSIGN(auto ds, - ext_type->Deserialize(ext_type->storage_type(), serialized)); - std::shared_ptr deserialized = - std::reinterpret_pointer_cast(ds); - - ASSERT_TRUE(tensor->is_row_major()); - ASSERT_EQ(tensor->strides(), strides); - ASSERT_EQ(tensor_partial->strides(), strides); - - // Test ExtensionType methods - ASSERT_EQ(ext_type->extension_name(), "arrow.fixed_shape_tensor"); - ASSERT_TRUE(ext_type->ExtensionEquals(*exact_ext_type)); - ASSERT_TRUE(ext_type->storage_type()->Equals(*cell_type)); - ASSERT_EQ(ext_type->Serialize(), serialized); - ASSERT_TRUE(deserialized->ExtensionEquals(*ext_type)); - ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); - - // Test FixedShapeTensorType methods - ASSERT_EQ(exact_ext_type->ndim(), cell_shape.size()); - ASSERT_EQ(exact_ext_type->shape(), cell_shape); - ASSERT_EQ(exact_ext_type->strides(), cell_strides); - ASSERT_EQ(exact_ext_type->dim_names(), dim_names); - - // Test MakeArray(std::shared_ptr data) - std::vector> buffers = {nullptr, Buffer::Wrap(values)}; - auto arr_data = std::make_shared(value_type, values.size(), buffers, 0, 0); - auto arr = std::make_shared(arr_data); - EXPECT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type)); - auto data = fsla_arr->data(); - data->type = ext_type; - auto ext_arr = exact_ext_type->MakeArray(data); - ASSERT_EQ(ext_arr->length(), shape[0]); - ASSERT_EQ(ext_arr->null_count(), 0); - - // Test MakeArray(std::shared_ptr tensor) - EXPECT_OK_AND_ASSIGN(auto ext_arr_partial, exact_ext_type->MakeArray(tensor_partial)); - ASSERT_OK(ext_arr->ValidateFull()); - ASSERT_OK(ext_arr_partial->ValidateFull()); - - // Test ToTensor(std::shared_ptr array) - EXPECT_OK_AND_ASSIGN(auto t, exact_ext_type->ToTensor(ext_arr)); - ASSERT_EQ(t->shape(), tensor->shape()); - ASSERT_EQ(t->strides(), tensor->strides()); - ASSERT_TRUE(tensor->Equals(*t)); - - // Test slicing - auto sliced = internal::checked_pointer_cast(ext_arr->Slice(0, 2)); - auto partial = internal::checked_pointer_cast(ext_arr_partial); - ASSERT_OK(sliced->ValidateFull()); - ASSERT_TRUE(sliced->storage()->Equals(*partial->storage())); - ASSERT_EQ(sliced->length(), partial->length()); - - ASSERT_OK(UnregisterExtensionType(ext_type->extension_name())); - ASSERT_OK(RegisterExtensionType(ext_type)); - auto ext_metadata = - key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, - {"ARROW:extension:metadata", serialized}}); - auto ext_field = field("f0", exact_ext_type, true, ext_metadata); - auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); - std::shared_ptr read_batch; - RoundtripBatch(batch, &read_batch); - CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); - ASSERT_OK(UnregisterExtensionType(ext_type->extension_name())); - - auto ext_type_1 = extension::tensor_array(int64(), cell_shape, {}, dim_names); - auto ext_type_2 = extension::tensor_array(int64(), cell_shape, {}, dim_names); - auto ext_type_3 = extension::tensor_array(int32(), cell_shape, {}, dim_names); - ASSERT_TRUE(ext_type_1->ExtensionEquals(*ext_type_2)); - ASSERT_FALSE(ext_type_1->ExtensionEquals(*ext_type_3)); - - auto ext_type_4 = extension::tensor_array(int64(), {3, 4, 7}, {}, {"x", "y", "z"}); - ASSERT_EQ(ext_type_4->strides(), (std::vector{224, 56, 8})); - ext_type_4 = extension::tensor_array(int64(), {3, 4, 7}, {0, 1, 2}, {"x", "y", "z"}); - ASSERT_EQ(ext_type_4->strides(), (std::vector{224, 56, 8})); - - auto ext_type_5 = extension::tensor_array(int64(), {3, 4, 7}, {1, 0, 2}); - ASSERT_EQ(ext_type_5->strides(), (std::vector{56, 224, 8})); - ASSERT_EQ(ext_type_5->Serialize(), R"({"shape":[3,4,7],"permutation":[1,0,2]})"); - - auto ext_type_6 = extension::tensor_array(int64(), {3, 4, 7}, {1, 2, 0}, {}); - ASSERT_EQ(ext_type_6->strides(), (std::vector{56, 8, 224})); - ASSERT_EQ(ext_type_6->Serialize(), R"({"shape":[3,4,7],"permutation":[1,2,0]})"); - - auto ext_type_7 = extension::tensor_array(int64(), {3, 4, 7}, {2, 0, 1}, {}); - ASSERT_EQ(ext_type_7->strides(), (std::vector{8, 224, 56})); - ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); - - auto ext_type_8 = extension::tensor_array(int64(), {3, 4}, {0, 1}); - EXPECT_OK_AND_ASSIGN(auto ext_arr_8, ext_type_8->MakeArray(tensor)); - - ASSERT_OK_AND_ASSIGN( - auto column_major_tensor, - Tensor::Make(value_type, Buffer::Wrap(values), shape, column_major_strides)); - auto ext_type_9 = extension::tensor_array(int64(), {3, 4}, {0, 1}); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, - testing::HasSubstr( - "Invalid: Only first-major tensors can be zero-copy converted to arrays"), - ext_type_9->MakeArray(column_major_tensor)); - ASSERT_THAT(ext_type_9->MakeArray(column_major_tensor), Raises(StatusCode::Invalid)); - - auto neither_major_tensor = std::make_shared(value_type, Buffer::Wrap(values), - shape, neither_major_strides); - auto ext_type_10 = extension::tensor_array(int64(), {3, 4}, {1, 0}); - ASSERT_OK_AND_ASSIGN(auto ext_arr_10, ext_type_10->MakeArray(neither_major_tensor)); -} - } // namespace arrow From e1d52b4799fd1bc8cfb96fd6fecaef48f9654cf7 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 2 Mar 2023 12:27:10 +0100 Subject: [PATCH 03/39] Review feedback --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 51 ++++++++++--------- cpp/src/arrow/extension/fixed_shape_tensor.h | 32 ++++++++---- .../extension/fixed_shape_tensor_test.cc | 11 ++-- 3 files changed, 55 insertions(+), 39 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 6dc4bb6752e..f101330c7ba 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -32,28 +32,6 @@ namespace rj = arrow::rapidjson; namespace arrow { namespace extension { -std::string FixedShapeTensorType::extension_name() const { - return "arrow.fixed_shape_tensor"; -} - -size_t FixedShapeTensorType::ndim() const { return shape_.size(); } - -std::vector FixedShapeTensorType::shape() const { return shape_; } - -std::vector FixedShapeTensorType::strides() const { - std::vector strides; - const auto& value_type = internal::checked_cast(*value_type_); - DCHECK_OK(internal::ComputeRowMajorStrides(value_type, shape_, &strides)); - if (!permutation_.empty()) { - internal::Permute(permutation_, &strides); - } - return strides; -} - -std::vector FixedShapeTensorType::permutation() const { return permutation_; } - -std::vector FixedShapeTensorType::dim_names() const { return dim_names_; } - bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const { if (extension_name() != other.extension_name()) { return false; @@ -101,6 +79,10 @@ std::string FixedShapeTensorType::Serialize() const { Result> FixedShapeTensorType::Deserialize( std::shared_ptr storage_type, const std::string& serialized_data) const { + if (storage_type->id() != Type::FIXED_SIZE_LIST) { + return Status::Invalid("Expected FixedSizeList storage type, got ", + storage_type->ToString()); + } auto value_type = internal::checked_pointer_cast(storage_type)->value_type(); rj::Document document; @@ -118,14 +100,18 @@ Result> FixedShapeTensorType::Deserialize( for (auto& x : document["permutation"].GetArray()) { permutation.emplace_back(x.GetInt64()); } - ARROW_CHECK_EQ(shape.size(), permutation.size()) << "Invalid permutation"; + if (shape.size() != permutation.size()) { + return Status::Invalid("Invalid permutation"); + } } std::vector dim_names; if (document.HasMember("dim_names")) { for (auto& x : document["dim_names"].GetArray()) { dim_names.emplace_back(x.GetString()); } - ARROW_CHECK_EQ(shape.size(), dim_names.size()) << "Invalid dim_names"; + if (shape.size() != dim_names.size()) { + return Status::Invalid("Invalid dim_names"); + } } return fixed_shape_tensor(value_type, shape, permutation, dim_names); @@ -148,6 +134,9 @@ Result> FixedShapeTensorType::MakeArray( auto cell_shape = tensor->shape(); cell_shape.erase(cell_shape.begin()); permutation.erase(permutation.begin()); + for (auto& x : permutation) { + x--; + } auto ext_type = fixed_shape_tensor(tensor->type(), cell_shape, permutation, tensor->dim_names()); @@ -232,7 +221,19 @@ Result> FixedShapeTensorType::ToTensor( return *Tensor::Make(ext_arr->value_type(), buffer, shape, tensor_strides, dim_names()); } -std::shared_ptr FixedShapeTensorType::get_storage_type( +const std::vector FixedShapeTensorType::ComputeStrides( + const std::shared_ptr value_type, const std::vector shape, + const std::vector permutation) const { + std::vector strides; + const auto& element_type = internal::checked_cast(*value_type); + DCHECK_OK(internal::ComputeRowMajorStrides(element_type, shape, &strides)); + if (!permutation.empty()) { + internal::Permute(permutation, &strides); + } + return strides; +} + +std::shared_ptr FixedShapeTensorType::GetStorageType( const std::shared_ptr& value_type, const std::vector& shape) const { const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 1268c013e4e..abbafe02289 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -23,7 +23,7 @@ namespace arrow { namespace extension { -class ARROW_EXPORT FixedShapeTensor : public ExtensionArray { +class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; }; @@ -35,26 +35,34 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { const std::vector& shape, const std::vector& permutation = {}, const std::vector& dim_names = {}) - : ExtensionType(get_storage_type(value_type, shape)), + : ExtensionType(GetStorageType(value_type, shape)), value_type_(value_type), shape_(shape), + strides_(ComputeStrides(value_type, shape, permutation)), permutation_(permutation), dim_names_(dim_names) {} - std::string extension_name() const override; + std::string extension_name() const override { return "arrow.fixed_shape_tensor"; } - size_t ndim() const; + // TODO: docs + const size_t ndim() const { return shape_.size(); } - std::vector shape() const; + // TODO: docs + const std::vector& shape() const { return shape_; } - std::vector strides() const; + // TODO: docs + const std::vector strides() const { return strides_; } - std::vector permutation() const; + // TODO: docs + const std::vector& permutation() const { return permutation_; } - std::vector dim_names() const; + // TODO: docs + const std::vector& dim_names() const { return dim_names_; } bool ExtensionEquals(const ExtensionType& other) const override; + bool Equals(const ExtensionType& other) const { return ExtensionEquals(other); }; + std::string Serialize() const override; Result> Deserialize( @@ -68,11 +76,15 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { Result> ToTensor(std::shared_ptr arr) const; private: - std::shared_ptr get_storage_type(const std::shared_ptr& value_type, - const std::vector& shape) const; + const std::vector ComputeStrides(const std::shared_ptr type, + const std::vector shape, + const std::vector permutation) const; + std::shared_ptr GetStorageType(const std::shared_ptr& value_type, + const std::vector& shape) const; std::shared_ptr storage_type_; std::shared_ptr value_type_; std::vector shape_; + std::vector strides_; std::vector permutation_; std::vector dim_names_; }; diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index fe212460f49..005dfb0f8c9 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -83,16 +83,19 @@ TEST_F(TestExtensionType, FixedShapeTensorType) { std::shared_ptr deserialized = std::reinterpret_pointer_cast(ds); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Invalid: Expected FixedSizeList storage type"), + ext_type->Deserialize(boolean(), serialized)); ASSERT_TRUE(tensor->is_row_major()); ASSERT_EQ(tensor->strides(), strides); ASSERT_EQ(tensor_partial->strides(), strides); // Test ExtensionType methods ASSERT_EQ(ext_type->extension_name(), "arrow.fixed_shape_tensor"); - ASSERT_TRUE(ext_type->ExtensionEquals(*exact_ext_type)); + ASSERT_TRUE(ext_type->Equals(*exact_ext_type)); ASSERT_TRUE(ext_type->storage_type()->Equals(*cell_type)); ASSERT_EQ(ext_type->Serialize(), serialized); - ASSERT_TRUE(deserialized->ExtensionEquals(*ext_type)); + ASSERT_TRUE(deserialized->Equals(*ext_type)); ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); // Test FixedShapeTensorType methods @@ -145,8 +148,8 @@ TEST_F(TestExtensionType, FixedShapeTensorType) { auto ext_type_1 = fixed_shape_tensor(int64(), cell_shape, {}, dim_names); auto ext_type_2 = fixed_shape_tensor(int64(), cell_shape, {}, dim_names); auto ext_type_3 = fixed_shape_tensor(int32(), cell_shape, {}, dim_names); - ASSERT_TRUE(ext_type_1->ExtensionEquals(*ext_type_2)); - ASSERT_FALSE(ext_type_1->ExtensionEquals(*ext_type_3)); + ASSERT_TRUE(ext_type_1->Equals(*ext_type_2)); + ASSERT_FALSE(ext_type_1->Equals(*ext_type_3)); auto ext_type_4 = fixed_shape_tensor(int64(), {3, 4, 7}, {}, {"x", "y", "z"}); ASSERT_EQ(ext_type_4->strides(), (std::vector{224, 56, 8})); From 3d450b7f4791f9beb907aeb31add8c4e5ae9d670 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 2 Mar 2023 17:14:21 +0100 Subject: [PATCH 04/39] Review feedback --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 4 + cpp/src/arrow/extension/fixed_shape_tensor.h | 34 ++- .../extension/fixed_shape_tensor_test.cc | 242 +++++++++++++----- 3 files changed, 201 insertions(+), 79 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index f101330c7ba..01cfd1a9697 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -133,6 +133,10 @@ Result> FixedShapeTensorType::MakeArray( auto cell_shape = tensor->shape(); cell_shape.erase(cell_shape.begin()); + if (cell_shape != shape_) { + return Status::Invalid("Expected cell shape does not match input tensor shape"); + } + permutation.erase(permutation.begin()); for (auto& x : permutation) { x--; diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index abbafe02289..b8f999c3f49 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -44,35 +44,51 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { std::string extension_name() const override { return "arrow.fixed_shape_tensor"; } - // TODO: docs - const size_t ndim() const { return shape_.size(); } + /// Number of dimensions of tensor elements + size_t ndim() { return shape_.size(); } - // TODO: docs + /// Shape of tensor elements const std::vector& shape() const { return shape_; } - // TODO: docs + /// Strides of tensor elements. Strides state offset in bytes between adjacent + /// elements along each dimension. const std::vector strides() const { return strides_; } - // TODO: docs + /// Permutation mapping from logical to physical memory layout of tensor elements const std::vector& permutation() const { return permutation_; } - // TODO: docs + /// Dimension names of tensor elements. Dimensions are ordered logically. const std::vector& dim_names() const { return dim_names_; } bool ExtensionEquals(const ExtensionType& other) const override; - bool Equals(const ExtensionType& other) const { return ExtensionEquals(other); }; - std::string Serialize() const override; Result> Deserialize( std::shared_ptr storage_type, const std::string& serialized_data) const override; + /// Create a FixedShapeTensorArray from ArrayData std::shared_ptr MakeArray(std::shared_ptr data) const override; + /// \brief Create a FixedShapeTensorArray from a Tensor + /// + /// This function will create a FixedShapeTensorArray from a Tensor, taking it's + /// first dimension as the "element dimension" and the remaining dimensions as the + /// "tensor dimensions". The tensor dimensions must match the FixedShapeTensorType's + /// element shape. This function assumes that the tensor's memory layout is + /// row-major. + /// + /// \param[in] tensor The Tensor to convert to a FixedShapeTensorArray Result> MakeArray(std::shared_ptr tensor) const; + /// \brief Create a Tensor from FixedShapeTensorArray + /// + /// This function will create a Tensor from a FixedShapeTensorArray, setting it's + /// first dimension as length equal to the FixedShapeTensorArray's length and the + /// remaining dimensions as the FixedShapeTensorType's element shape. + /// + /// \param[in] arr The FixedShapeTensorArray to convert to a Tensor Result> ToTensor(std::shared_ptr arr) const; private: @@ -89,7 +105,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { std::vector dim_names_; }; -/// \brief Return a TensorArrayType instance. +/// \brief Return a FixedShapeTensorType instance. ARROW_EXPORT std::shared_ptr fixed_shape_tensor( const std::shared_ptr& storage_type, const std::vector& shape, const std::vector& permutation = {}, diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 005dfb0f8c9..49ba153b31c 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -31,7 +31,40 @@ namespace arrow { -class TestExtensionType : public ::testing::Test {}; +using FixedShapeTensorType = extension::FixedShapeTensorType; +using extension::fixed_shape_tensor; + +class TestExtensionType : public ::testing::Test { + public: + void SetUp() { + shape = {3, 3, 4}; + cell_shape = {3, 4}; + value_type = int64(); + cell_type = fixed_size_list(value_type, 12); + dim_names = {"x", "y"}; + values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; + values_partial = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; + shape_partial = {2, 3, 4}; + tensor_strides = {96, 32, 8}; + cell_strides = {32, 8}; + serialized = R"({"shape":[3,4],"dim_names":["x","y"]})"; + } + + protected: + std::vector shape; + std::vector shape_partial; + std::vector cell_shape; + std::shared_ptr value_type; + std::shared_ptr cell_type; + std::vector dim_names; + std::vector values; + std::vector values_partial; + std::vector tensor_strides; + std::vector cell_strides; + std::string serialized; +}; auto RoundtripBatch = [](const std::shared_ptr& batch, std::shared_ptr* out) { @@ -47,64 +80,39 @@ auto RoundtripBatch = [](const std::shared_ptr& batch, ASSERT_OK(batch_reader->ReadNext(out)); }; -TEST_F(TestExtensionType, FixedShapeTensorType) { - using FixedShapeTensorType = extension::FixedShapeTensorType; - using arrow::extension::fixed_shape_tensor; - - std::vector shape = {3, 3, 4}; - std::vector cell_shape = {3, 4}; - auto value_type = int64(); - std::shared_ptr cell_type = fixed_size_list(value_type, 12); - - std::vector dim_names = {"x", "y"}; - std::vector strides = {96, 32, 8}; - std::vector column_major_strides = {8, 24, 72}; - std::vector neither_major_strides = {96, 8, 32}; - std::vector cell_strides = {32, 8}; - std::vector values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; - std::vector values_partial = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; - std::vector shape_partial = {2, 3, 4}; - std::string serialized = R"({"shape":[3,4],"dim_names":["x","y"]})"; - - ASSERT_OK_AND_ASSIGN(auto tensor, - Tensor::Make(value_type, Buffer::Wrap(values), shape)); - ASSERT_OK_AND_ASSIGN( - auto tensor_partial, - Tensor::Make(value_type, Buffer::Wrap(values_partial), shape_partial)); +TEST_F(TestExtensionType, CheckDummyRegistration) { + // We need a dummy registration at runtime to allow for IPC deserialization + auto ext_type = fixed_shape_tensor(int64(), {}); + auto registered_type = GetExtensionType(ext_type->extension_name()); + ASSERT_TRUE(registered_type->Equals(*ext_type)); +} - std::shared_ptr ext_type = - fixed_shape_tensor(value_type, cell_shape, {}, dim_names); +TEST_F(TestExtensionType, CreateExtensionType) { + auto ext_type = fixed_shape_tensor(value_type, cell_shape, {}, dim_names); auto exact_ext_type = internal::checked_pointer_cast(ext_type); - ASSERT_OK_AND_ASSIGN(auto ds, - ext_type->Deserialize(ext_type->storage_type(), serialized)); - std::shared_ptr deserialized = - std::reinterpret_pointer_cast(ds); - - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("Invalid: Expected FixedSizeList storage type"), - ext_type->Deserialize(boolean(), serialized)); - ASSERT_TRUE(tensor->is_row_major()); - ASSERT_EQ(tensor->strides(), strides); - ASSERT_EQ(tensor_partial->strides(), strides); // Test ExtensionType methods ASSERT_EQ(ext_type->extension_name(), "arrow.fixed_shape_tensor"); ASSERT_TRUE(ext_type->Equals(*exact_ext_type)); ASSERT_TRUE(ext_type->storage_type()->Equals(*cell_type)); ASSERT_EQ(ext_type->Serialize(), serialized); + ASSERT_OK_AND_ASSIGN(auto ds, + ext_type->Deserialize(ext_type->storage_type(), serialized)); + auto deserialized = std::reinterpret_pointer_cast(ds); ASSERT_TRUE(deserialized->Equals(*ext_type)); - ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); // Test FixedShapeTensorType methods + ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); ASSERT_EQ(exact_ext_type->ndim(), cell_shape.size()); ASSERT_EQ(exact_ext_type->shape(), cell_shape); ASSERT_EQ(exact_ext_type->strides(), cell_strides); ASSERT_EQ(exact_ext_type->dim_names(), dim_names); +} + +TEST_F(TestExtensionType, CreateFromArray) { + auto ext_type = fixed_shape_tensor(value_type, cell_shape, {}, dim_names); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); - // Test MakeArray(std::shared_ptr data) std::vector> buffers = {nullptr, Buffer::Wrap(values)}; auto arr_data = std::make_shared(value_type, values.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); @@ -114,24 +122,113 @@ TEST_F(TestExtensionType, FixedShapeTensorType) { auto ext_arr = exact_ext_type->MakeArray(data); ASSERT_EQ(ext_arr->length(), shape[0]); ASSERT_EQ(ext_arr->null_count(), 0); +} - // Test MakeArray(std::shared_ptr tensor) +TEST_F(TestExtensionType, CreateFromTensor) { + std::vector column_major_strides = {8, 24, 72}; + std::vector neither_major_strides = {96, 8, 32}; + + ASSERT_OK_AND_ASSIGN(auto tensor, + Tensor::Make(value_type, Buffer::Wrap(values), shape)); + + auto ext_type = fixed_shape_tensor(value_type, cell_shape, {}, dim_names); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); + EXPECT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); + + ASSERT_OK(ext_arr->ValidateFull()); + ASSERT_TRUE(tensor->is_row_major()); + ASSERT_EQ(tensor->strides(), tensor_strides); + ASSERT_EQ(ext_arr->length(), shape[0]); + + auto ext_type_2 = fixed_shape_tensor(int64(), {3, 4}, {0, 1}); + EXPECT_OK_AND_ASSIGN(auto ext_arr_2, ext_type_2->MakeArray(tensor)); + + ASSERT_OK_AND_ASSIGN( + auto column_major_tensor, + Tensor::Make(value_type, Buffer::Wrap(values), shape, column_major_strides)); + auto ext_type_3 = fixed_shape_tensor(int64(), {3, 4}, {0, 1}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr( + "Invalid: Only first-major tensors can be zero-copy converted to arrays"), + ext_type_3->MakeArray(column_major_tensor)); + ASSERT_THAT(ext_type_3->MakeArray(column_major_tensor), Raises(StatusCode::Invalid)); + + auto neither_major_tensor = std::make_shared(value_type, Buffer::Wrap(values), + shape, neither_major_strides); + auto ext_type_4 = fixed_shape_tensor(int64(), {3, 4}, {1, 0}); + ASSERT_OK_AND_ASSIGN(auto ext_arr_4, ext_type_4->MakeArray(neither_major_tensor)); +} + +TEST_F(TestExtensionType, RoundtripTensor) { + ASSERT_OK_AND_ASSIGN(auto tensor, + Tensor::Make(value_type, Buffer::Wrap(values), shape)); + auto ext_type = fixed_shape_tensor(value_type, cell_shape, {}, dim_names); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); + EXPECT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); + + EXPECT_OK_AND_ASSIGN(auto tensor_from_array, exact_ext_type->ToTensor(ext_arr)); + ASSERT_EQ(tensor_from_array->shape(), tensor->shape()); + ASSERT_EQ(tensor_from_array->strides(), tensor->strides()); + ASSERT_TRUE(tensor->Equals(*tensor_from_array)); +} + +TEST_F(TestExtensionType, SliceTensor) { + ASSERT_OK_AND_ASSIGN(auto tensor, + Tensor::Make(value_type, Buffer::Wrap(values), shape)); + ASSERT_OK_AND_ASSIGN( + auto tensor_partial, + Tensor::Make(value_type, Buffer::Wrap(values_partial), shape_partial)); + ASSERT_EQ(tensor->strides(), tensor_strides); + ASSERT_EQ(tensor_partial->strides(), tensor_strides); + auto ext_type = fixed_shape_tensor(value_type, cell_shape, {}, dim_names); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); + + EXPECT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); EXPECT_OK_AND_ASSIGN(auto ext_arr_partial, exact_ext_type->MakeArray(tensor_partial)); ASSERT_OK(ext_arr->ValidateFull()); ASSERT_OK(ext_arr_partial->ValidateFull()); - // Test ToTensor(std::shared_ptr array) - EXPECT_OK_AND_ASSIGN(auto t, exact_ext_type->ToTensor(ext_arr)); - ASSERT_EQ(t->shape(), tensor->shape()); - ASSERT_EQ(t->strides(), tensor->strides()); - ASSERT_TRUE(tensor->Equals(*t)); - - // Test slicing auto sliced = internal::checked_pointer_cast(ext_arr->Slice(0, 2)); auto partial = internal::checked_pointer_cast(ext_arr_partial); + + ASSERT_TRUE(sliced->Equals(*partial)); ASSERT_OK(sliced->ValidateFull()); + ASSERT_OK(partial->ValidateFull()); ASSERT_TRUE(sliced->storage()->Equals(*partial->storage())); ASSERT_EQ(sliced->length(), partial->length()); +} + +void CheckSerializationRoundtrip(const std::shared_ptr& ext_type) { + auto serialized = ext_type->Serialize(); + ASSERT_OK_AND_ASSIGN(auto deserialized, + ext_type->Deserialize(ext_type->storage_type(), serialized)); + ASSERT_TRUE(ext_type->Equals(*deserialized)); +} + +TEST_F(TestExtensionType, MetadataSerializationRoundtrip) { + CheckSerializationRoundtrip(fixed_shape_tensor(value_type, {}, {}, {})); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type, {0}, {}, {})); + + auto ext_type = fixed_shape_tensor(value_type, cell_shape, {0, 1}, dim_names); + CheckSerializationRoundtrip(ext_type); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Invalid: Expected FixedSizeList storage type"), + ext_type->Deserialize(boolean(), serialized)); +} + +TEST_F(TestExtensionType, RoudtripBatch) { + auto ext_type = fixed_shape_tensor(value_type, cell_shape, {}, dim_names); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); + + std::vector> buffers = {nullptr, Buffer::Wrap(values)}; + auto arr_data = std::make_shared(value_type, values.size(), buffers, 0, 0); + auto arr = std::make_shared(arr_data); + EXPECT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type)); + auto data = fsla_arr->data(); + data->type = ext_type; + auto ext_arr = exact_ext_type->MakeArray(data); ASSERT_OK(UnregisterExtensionType(ext_type->extension_name())); ASSERT_OK(RegisterExtensionType(ext_type)); @@ -143,7 +240,31 @@ TEST_F(TestExtensionType, FixedShapeTensorType) { std::shared_ptr read_batch; RoundtripBatch(batch, &read_batch); CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); +} + +TEST_F(TestExtensionType, RoudtripBatchFromTensor) { + auto ext_type = fixed_shape_tensor(value_type, cell_shape, {}, dim_names); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); + ASSERT_OK_AND_ASSIGN( + auto tensor, Tensor::Make(value_type, Buffer::Wrap(values), shape, {}, dim_names)); + EXPECT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); + ext_arr->data()->type = exact_ext_type; + ASSERT_OK(UnregisterExtensionType(ext_type->extension_name())); + ASSERT_OK(RegisterExtensionType(ext_type)); + auto ext_metadata = + key_value_metadata({{"ARROW:extension:name", ext_type->extension_name()}, + {"ARROW:extension:metadata", serialized}}); + auto ext_field = field("f0", ext_type, true, ext_metadata); + auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); + std::shared_ptr read_batch; + RoundtripBatch(batch, &read_batch); + CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); +} + +TEST_F(TestExtensionType, ComputeStrides) { + auto ext_type = fixed_shape_tensor(value_type, cell_shape, {}, dim_names); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); auto ext_type_1 = fixed_shape_tensor(int64(), cell_shape, {}, dim_names); auto ext_type_2 = fixed_shape_tensor(int64(), cell_shape, {}, dim_names); @@ -167,25 +288,6 @@ TEST_F(TestExtensionType, FixedShapeTensorType) { auto ext_type_7 = fixed_shape_tensor(int64(), {3, 4, 7}, {2, 0, 1}, {}); ASSERT_EQ(ext_type_7->strides(), (std::vector{8, 224, 56})); ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); - - auto ext_type_8 = fixed_shape_tensor(int64(), {3, 4}, {0, 1}); - EXPECT_OK_AND_ASSIGN(auto ext_arr_8, ext_type_8->MakeArray(tensor)); - - ASSERT_OK_AND_ASSIGN( - auto column_major_tensor, - Tensor::Make(value_type, Buffer::Wrap(values), shape, column_major_strides)); - auto ext_type_9 = fixed_shape_tensor(int64(), {3, 4}, {0, 1}); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, - testing::HasSubstr( - "Invalid: Only first-major tensors can be zero-copy converted to arrays"), - ext_type_9->MakeArray(column_major_tensor)); - ASSERT_THAT(ext_type_9->MakeArray(column_major_tensor), Raises(StatusCode::Invalid)); - - auto neither_major_tensor = std::make_shared(value_type, Buffer::Wrap(values), - shape, neither_major_strides); - auto ext_type_10 = fixed_shape_tensor(int64(), {3, 4}, {1, 0}); - ASSERT_OK_AND_ASSIGN(auto ext_arr_10, ext_type_10->MakeArray(neither_major_tensor)); } } // namespace arrow From 9667d61c268484df234345fd25d865cf50e524be Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 2 Mar 2023 20:06:03 +0100 Subject: [PATCH 05/39] Apply suggestions from code review Co-authored-by: Ben Harkins <60872452+benibus@users.noreply.github.com> --- cpp/src/arrow/extension/fixed_shape_tensor.h | 2 +- cpp/src/arrow/extension/fixed_shape_tensor_test.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index b8f999c3f49..181c16a360f 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -52,7 +52,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { /// Strides of tensor elements. Strides state offset in bytes between adjacent /// elements along each dimension. - const std::vector strides() const { return strides_; } + const std::vector& strides() const { return strides_; } /// Permutation mapping from logical to physical memory layout of tensor elements const std::vector& permutation() const { return permutation_; } diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 49ba153b31c..2a6329bca02 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -36,7 +36,7 @@ using extension::fixed_shape_tensor; class TestExtensionType : public ::testing::Test { public: - void SetUp() { + void SetUp() override { shape = {3, 3, 4}; cell_shape = {3, 4}; value_type = int64(); From 96b2cfd3d547b0395218d634745d177a220edbb7 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 2 Mar 2023 20:35:58 +0100 Subject: [PATCH 06/39] Review feedback --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 39 ++-- cpp/src/arrow/extension/fixed_shape_tensor.h | 12 +- .../extension/fixed_shape_tensor_test.cc | 176 +++++++++--------- 3 files changed, 114 insertions(+), 113 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 01cfd1a9697..11e0fd0da87 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -225,26 +225,6 @@ Result> FixedShapeTensorType::ToTensor( return *Tensor::Make(ext_arr->value_type(), buffer, shape, tensor_strides, dim_names()); } -const std::vector FixedShapeTensorType::ComputeStrides( - const std::shared_ptr value_type, const std::vector shape, - const std::vector permutation) const { - std::vector strides; - const auto& element_type = internal::checked_cast(*value_type); - DCHECK_OK(internal::ComputeRowMajorStrides(element_type, shape, &strides)); - if (!permutation.empty()) { - internal::Permute(permutation, &strides); - } - return strides; -} - -std::shared_ptr FixedShapeTensorType::GetStorageType( - const std::shared_ptr& value_type, - const std::vector& shape) const { - const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), - std::multiplies<>()); - return fixed_size_list(value_type, static_cast(size)); -} - std::shared_ptr fixed_shape_tensor( const std::shared_ptr& value_type, const std::vector& shape, const std::vector& permutation, const std::vector& dim_names) { @@ -264,5 +244,24 @@ std::shared_ptr fixed_shape_tensor( dim_names); } +const std::shared_ptr GetStorageType( + const std::shared_ptr& value_type, const std::vector& shape) { + const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), + std::multiplies<>()); + return fixed_size_list(value_type, static_cast(size)); +} + +const std::vector ComputeStrides(const std::shared_ptr& value_type, + const std::vector& shape, + const std::vector& permutation) { + std::vector strides; + const auto& element_type = internal::checked_cast(*value_type); + DCHECK_OK(internal::ComputeRowMajorStrides(element_type, shape, &strides)); + if (!permutation.empty()) { + internal::Permute(permutation, &strides); + } + return strides; +} + } // namespace extension } // namespace arrow diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 181c16a360f..d35c77c95c8 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -23,6 +23,13 @@ namespace arrow { namespace extension { +const std::shared_ptr GetStorageType( + const std::shared_ptr& value_type, const std::vector& shape); + +const std::vector ComputeStrides(const std::shared_ptr& value_type, + const std::vector& shape, + const std::vector& permutation); + class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; @@ -92,11 +99,6 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { Result> ToTensor(std::shared_ptr arr) const; private: - const std::vector ComputeStrides(const std::shared_ptr type, - const std::vector shape, - const std::vector permutation) const; - std::shared_ptr GetStorageType(const std::shared_ptr& value_type, - const std::vector& shape) const; std::shared_ptr storage_type_; std::shared_ptr value_type_; std::vector shape_; diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 2a6329bca02..eea90db3aa2 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -37,33 +37,35 @@ using extension::fixed_shape_tensor; class TestExtensionType : public ::testing::Test { public: void SetUp() override { - shape = {3, 3, 4}; - cell_shape = {3, 4}; - value_type = int64(); - cell_type = fixed_size_list(value_type, 12); - dim_names = {"x", "y"}; - values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, - 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; - values_partial = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; - shape_partial = {2, 3, 4}; - tensor_strides = {96, 32, 8}; - cell_strides = {32, 8}; - serialized = R"({"shape":[3,4],"dim_names":["x","y"]})"; + shape_ = {3, 3, 4}; + cell_shape_ = {3, 4}; + value_type_ = int64(); + cell_type_ = fixed_size_list(value_type_, 12); + dim_names_ = {"x", "y"}; + ext_type_ = fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_); + values_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; + values_partial_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; + shape_partial_ = {2, 3, 4}; + tensor_strides_ = {96, 32, 8}; + cell_strides_ = {32, 8}; + serialized_ = R"({"shape":[3,4],"dim_names":["x","y"]})"; } protected: - std::vector shape; - std::vector shape_partial; - std::vector cell_shape; - std::shared_ptr value_type; - std::shared_ptr cell_type; - std::vector dim_names; - std::vector values; - std::vector values_partial; - std::vector tensor_strides; - std::vector cell_strides; - std::string serialized; + std::vector shape_; + std::vector shape_partial_; + std::vector cell_shape_; + std::shared_ptr value_type_; + std::shared_ptr cell_type_; + std::vector dim_names_; + std::shared_ptr ext_type_; + std::vector values_; + std::vector values_partial_; + std::vector tensor_strides_; + std::vector cell_strides_; + std::string serialized_; }; auto RoundtripBatch = [](const std::shared_ptr& batch, @@ -88,39 +90,37 @@ TEST_F(TestExtensionType, CheckDummyRegistration) { } TEST_F(TestExtensionType, CreateExtensionType) { - auto ext_type = fixed_shape_tensor(value_type, cell_shape, {}, dim_names); - auto exact_ext_type = internal::checked_pointer_cast(ext_type); + auto exact_ext_type = internal::checked_pointer_cast(ext_type_); // Test ExtensionType methods - ASSERT_EQ(ext_type->extension_name(), "arrow.fixed_shape_tensor"); - ASSERT_TRUE(ext_type->Equals(*exact_ext_type)); - ASSERT_TRUE(ext_type->storage_type()->Equals(*cell_type)); - ASSERT_EQ(ext_type->Serialize(), serialized); + ASSERT_EQ(ext_type_->extension_name(), "arrow.fixed_shape_tensor"); + ASSERT_TRUE(ext_type_->Equals(*exact_ext_type)); + ASSERT_TRUE(ext_type_->storage_type()->Equals(*cell_type_)); + ASSERT_EQ(ext_type_->Serialize(), serialized_); ASSERT_OK_AND_ASSIGN(auto ds, - ext_type->Deserialize(ext_type->storage_type(), serialized)); + ext_type_->Deserialize(ext_type_->storage_type(), serialized_)); auto deserialized = std::reinterpret_pointer_cast(ds); - ASSERT_TRUE(deserialized->Equals(*ext_type)); + ASSERT_TRUE(deserialized->Equals(*ext_type_)); // Test FixedShapeTensorType methods ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); - ASSERT_EQ(exact_ext_type->ndim(), cell_shape.size()); - ASSERT_EQ(exact_ext_type->shape(), cell_shape); - ASSERT_EQ(exact_ext_type->strides(), cell_strides); - ASSERT_EQ(exact_ext_type->dim_names(), dim_names); + ASSERT_EQ(exact_ext_type->ndim(), cell_shape_.size()); + ASSERT_EQ(exact_ext_type->shape(), cell_shape_); + ASSERT_EQ(exact_ext_type->strides(), cell_strides_); + ASSERT_EQ(exact_ext_type->dim_names(), dim_names_); } TEST_F(TestExtensionType, CreateFromArray) { - auto ext_type = fixed_shape_tensor(value_type, cell_shape, {}, dim_names); - auto exact_ext_type = internal::checked_pointer_cast(ext_type); + auto exact_ext_type = internal::checked_pointer_cast(ext_type_); - std::vector> buffers = {nullptr, Buffer::Wrap(values)}; - auto arr_data = std::make_shared(value_type, values.size(), buffers, 0, 0); + std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; + auto arr_data = std::make_shared(value_type_, values_.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); - EXPECT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type)); + EXPECT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); auto data = fsla_arr->data(); - data->type = ext_type; + data->type = ext_type_; auto ext_arr = exact_ext_type->MakeArray(data); - ASSERT_EQ(ext_arr->length(), shape[0]); + ASSERT_EQ(ext_arr->length(), shape_[0]); ASSERT_EQ(ext_arr->null_count(), 0); } @@ -129,23 +129,22 @@ TEST_F(TestExtensionType, CreateFromTensor) { std::vector neither_major_strides = {96, 8, 32}; ASSERT_OK_AND_ASSIGN(auto tensor, - Tensor::Make(value_type, Buffer::Wrap(values), shape)); + Tensor::Make(value_type_, Buffer::Wrap(values_), shape_)); - auto ext_type = fixed_shape_tensor(value_type, cell_shape, {}, dim_names); - auto exact_ext_type = internal::checked_pointer_cast(ext_type); + auto exact_ext_type = internal::checked_pointer_cast(ext_type_); EXPECT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); ASSERT_OK(ext_arr->ValidateFull()); ASSERT_TRUE(tensor->is_row_major()); - ASSERT_EQ(tensor->strides(), tensor_strides); - ASSERT_EQ(ext_arr->length(), shape[0]); + ASSERT_EQ(tensor->strides(), tensor_strides_); + ASSERT_EQ(ext_arr->length(), shape_[0]); auto ext_type_2 = fixed_shape_tensor(int64(), {3, 4}, {0, 1}); EXPECT_OK_AND_ASSIGN(auto ext_arr_2, ext_type_2->MakeArray(tensor)); ASSERT_OK_AND_ASSIGN( auto column_major_tensor, - Tensor::Make(value_type, Buffer::Wrap(values), shape, column_major_strides)); + Tensor::Make(value_type_, Buffer::Wrap(values_), shape_, column_major_strides)); auto ext_type_3 = fixed_shape_tensor(int64(), {3, 4}, {0, 1}); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, @@ -154,17 +153,16 @@ TEST_F(TestExtensionType, CreateFromTensor) { ext_type_3->MakeArray(column_major_tensor)); ASSERT_THAT(ext_type_3->MakeArray(column_major_tensor), Raises(StatusCode::Invalid)); - auto neither_major_tensor = std::make_shared(value_type, Buffer::Wrap(values), - shape, neither_major_strides); + auto neither_major_tensor = std::make_shared(value_type_, Buffer::Wrap(values_), + shape_, neither_major_strides); auto ext_type_4 = fixed_shape_tensor(int64(), {3, 4}, {1, 0}); ASSERT_OK_AND_ASSIGN(auto ext_arr_4, ext_type_4->MakeArray(neither_major_tensor)); } TEST_F(TestExtensionType, RoundtripTensor) { ASSERT_OK_AND_ASSIGN(auto tensor, - Tensor::Make(value_type, Buffer::Wrap(values), shape)); - auto ext_type = fixed_shape_tensor(value_type, cell_shape, {}, dim_names); - auto exact_ext_type = internal::checked_pointer_cast(ext_type); + Tensor::Make(value_type_, Buffer::Wrap(values_), shape_)); + auto exact_ext_type = internal::checked_pointer_cast(ext_type_); EXPECT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); EXPECT_OK_AND_ASSIGN(auto tensor_from_array, exact_ext_type->ToTensor(ext_arr)); @@ -175,14 +173,14 @@ TEST_F(TestExtensionType, RoundtripTensor) { TEST_F(TestExtensionType, SliceTensor) { ASSERT_OK_AND_ASSIGN(auto tensor, - Tensor::Make(value_type, Buffer::Wrap(values), shape)); + Tensor::Make(value_type_, Buffer::Wrap(values_), shape_)); ASSERT_OK_AND_ASSIGN( auto tensor_partial, - Tensor::Make(value_type, Buffer::Wrap(values_partial), shape_partial)); - ASSERT_EQ(tensor->strides(), tensor_strides); - ASSERT_EQ(tensor_partial->strides(), tensor_strides); - auto ext_type = fixed_shape_tensor(value_type, cell_shape, {}, dim_names); - auto exact_ext_type = internal::checked_pointer_cast(ext_type); + Tensor::Make(value_type_, Buffer::Wrap(values_partial_), shape_partial_)); + ASSERT_EQ(tensor->strides(), tensor_strides_); + ASSERT_EQ(tensor_partial->strides(), tensor_strides_); + auto ext_type = fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_); + auto exact_ext_type = internal::checked_pointer_cast(ext_type_); EXPECT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); EXPECT_OK_AND_ASSIGN(auto ext_arr_partial, exact_ext_type->MakeArray(tensor_partial)); @@ -207,34 +205,38 @@ void CheckSerializationRoundtrip(const std::shared_ptr& ext_type) } TEST_F(TestExtensionType, MetadataSerializationRoundtrip) { - CheckSerializationRoundtrip(fixed_shape_tensor(value_type, {}, {}, {})); - CheckSerializationRoundtrip(fixed_shape_tensor(value_type, {0}, {}, {})); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {}, {}, {})); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {0}, {}, {})); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {1}, {0}, {"x"})); + CheckSerializationRoundtrip( + fixed_shape_tensor(value_type_, {256, 256, 3}, {0, 1, 2}, {"H", "W", "C"})); + CheckSerializationRoundtrip( + fixed_shape_tensor(value_type_, {256, 256, 3}, {2, 0, 1}, {"C", "H", "W"})); - auto ext_type = fixed_shape_tensor(value_type, cell_shape, {0, 1}, dim_names); - CheckSerializationRoundtrip(ext_type); + auto ext_type = fixed_shape_tensor(value_type_, cell_shape_, {0, 1}, dim_names_); + CheckSerializationRoundtrip(ext_type_); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr("Invalid: Expected FixedSizeList storage type"), - ext_type->Deserialize(boolean(), serialized)); + ext_type->Deserialize(boolean(), serialized_)); } TEST_F(TestExtensionType, RoudtripBatch) { - auto ext_type = fixed_shape_tensor(value_type, cell_shape, {}, dim_names); - auto exact_ext_type = internal::checked_pointer_cast(ext_type); + auto exact_ext_type = internal::checked_pointer_cast(ext_type_); - std::vector> buffers = {nullptr, Buffer::Wrap(values)}; - auto arr_data = std::make_shared(value_type, values.size(), buffers, 0, 0); + std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; + auto arr_data = std::make_shared(value_type_, values_.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); - EXPECT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type)); + EXPECT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); auto data = fsla_arr->data(); - data->type = ext_type; + data->type = ext_type_; auto ext_arr = exact_ext_type->MakeArray(data); - ASSERT_OK(UnregisterExtensionType(ext_type->extension_name())); - ASSERT_OK(RegisterExtensionType(ext_type)); + ASSERT_OK(UnregisterExtensionType(ext_type_->extension_name())); + ASSERT_OK(RegisterExtensionType(ext_type_)); auto ext_metadata = key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, - {"ARROW:extension:metadata", serialized}}); + {"ARROW:extension:metadata", serialized_}}); auto ext_field = field("f0", exact_ext_type, true, ext_metadata); auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); std::shared_ptr read_batch; @@ -243,19 +245,18 @@ TEST_F(TestExtensionType, RoudtripBatch) { } TEST_F(TestExtensionType, RoudtripBatchFromTensor) { - auto ext_type = fixed_shape_tensor(value_type, cell_shape, {}, dim_names); - auto exact_ext_type = internal::checked_pointer_cast(ext_type); - ASSERT_OK_AND_ASSIGN( - auto tensor, Tensor::Make(value_type, Buffer::Wrap(values), shape, {}, dim_names)); + auto exact_ext_type = internal::checked_pointer_cast(ext_type_); + ASSERT_OK_AND_ASSIGN(auto tensor, Tensor::Make(value_type_, Buffer::Wrap(values_), + shape_, {}, dim_names_)); EXPECT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); ext_arr->data()->type = exact_ext_type; - ASSERT_OK(UnregisterExtensionType(ext_type->extension_name())); - ASSERT_OK(RegisterExtensionType(ext_type)); + ASSERT_OK(UnregisterExtensionType(ext_type_->extension_name())); + ASSERT_OK(RegisterExtensionType(ext_type_)); auto ext_metadata = - key_value_metadata({{"ARROW:extension:name", ext_type->extension_name()}, - {"ARROW:extension:metadata", serialized}}); - auto ext_field = field("f0", ext_type, true, ext_metadata); + key_value_metadata({{"ARROW:extension:name", ext_type_->extension_name()}, + {"ARROW:extension:metadata", serialized_}}); + auto ext_field = field("f0", ext_type_, true, ext_metadata); auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); std::shared_ptr read_batch; RoundtripBatch(batch, &read_batch); @@ -263,12 +264,11 @@ TEST_F(TestExtensionType, RoudtripBatchFromTensor) { } TEST_F(TestExtensionType, ComputeStrides) { - auto ext_type = fixed_shape_tensor(value_type, cell_shape, {}, dim_names); - auto exact_ext_type = internal::checked_pointer_cast(ext_type); + auto exact_ext_type = internal::checked_pointer_cast(ext_type_); - auto ext_type_1 = fixed_shape_tensor(int64(), cell_shape, {}, dim_names); - auto ext_type_2 = fixed_shape_tensor(int64(), cell_shape, {}, dim_names); - auto ext_type_3 = fixed_shape_tensor(int32(), cell_shape, {}, dim_names); + auto ext_type_1 = fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_); + auto ext_type_2 = fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_); + auto ext_type_3 = fixed_shape_tensor(int32(), cell_shape_, {}, dim_names_); ASSERT_TRUE(ext_type_1->Equals(*ext_type_2)); ASSERT_FALSE(ext_type_1->Equals(*ext_type_3)); From 30a190476848c828c52852fb201b517d8e9835b9 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 3 Mar 2023 10:46:55 +0100 Subject: [PATCH 07/39] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- cpp/src/arrow/extension/fixed_shape_tensor.h | 6 +++--- cpp/src/arrow/extension/fixed_shape_tensor_test.cc | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index d35c77c95c8..9d32a2d0fa3 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -64,7 +64,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { /// Permutation mapping from logical to physical memory layout of tensor elements const std::vector& permutation() const { return permutation_; } - /// Dimension names of tensor elements. Dimensions are ordered logically. + /// Dimension names of tensor elements. Dimensions are ordered physically. const std::vector& dim_names() const { return dim_names_; } bool ExtensionEquals(const ExtensionType& other) const override; @@ -80,7 +80,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { /// \brief Create a FixedShapeTensorArray from a Tensor /// - /// This function will create a FixedShapeTensorArray from a Tensor, taking it's + /// This function will create a FixedShapeTensorArray from a Tensor, taking its /// first dimension as the "element dimension" and the remaining dimensions as the /// "tensor dimensions". The tensor dimensions must match the FixedShapeTensorType's /// element shape. This function assumes that the tensor's memory layout is @@ -91,7 +91,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { /// \brief Create a Tensor from FixedShapeTensorArray /// - /// This function will create a Tensor from a FixedShapeTensorArray, setting it's + /// This function will create a Tensor from a FixedShapeTensorArray, setting its /// first dimension as length equal to the FixedShapeTensorArray's length and the /// remaining dimensions as the FixedShapeTensorType's element shape. /// diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index eea90db3aa2..a943906a3ff 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -95,6 +95,7 @@ TEST_F(TestExtensionType, CreateExtensionType) { // Test ExtensionType methods ASSERT_EQ(ext_type_->extension_name(), "arrow.fixed_shape_tensor"); ASSERT_TRUE(ext_type_->Equals(*exact_ext_type)); + ASSERT_FALSE(ext_type_->Equals(*cell_type_)); ASSERT_TRUE(ext_type_->storage_type()->Equals(*cell_type_)); ASSERT_EQ(ext_type_->Serialize(), serialized_); ASSERT_OK_AND_ASSIGN(auto ds, @@ -285,7 +286,7 @@ TEST_F(TestExtensionType, ComputeStrides) { ASSERT_EQ(ext_type_6->strides(), (std::vector{56, 8, 224})); ASSERT_EQ(ext_type_6->Serialize(), R"({"shape":[3,4,7],"permutation":[1,2,0]})"); - auto ext_type_7 = fixed_shape_tensor(int64(), {3, 4, 7}, {2, 0, 1}, {}); + auto ext_type_7 = fixed_shape_tensor(int32(), {3, 4, 7}, {2, 0, 1}, {}); ASSERT_EQ(ext_type_7->strides(), (std::vector{8, 224, 56})); ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); } From fb19d033cf933b94995a9a4234ebc50d7ea1480f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 3 Mar 2023 12:10:09 +0100 Subject: [PATCH 08/39] Review feedback --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 45 ++++++------ cpp/src/arrow/extension/fixed_shape_tensor.h | 11 +-- .../extension/fixed_shape_tensor_test.cc | 71 +++++++++++-------- cpp/src/arrow/extension_type.cc | 4 +- 4 files changed, 70 insertions(+), 61 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 11e0fd0da87..01bca800c8e 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -32,6 +32,18 @@ namespace rj = arrow::rapidjson; namespace arrow { namespace extension { +const std::vector& FixedShapeTensorType::strides() { + if (this->strides_.empty()) { + const auto& element_type = + internal::checked_cast(*value_type_); + DCHECK_OK(internal::ComputeRowMajorStrides(element_type, shape_, &strides_)); + if (!permutation_.empty()) { + internal::Permute(permutation_, &strides_); + } + } + return this->strides_; +} + bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const { if (extension_name() != other.extension_name()) { return false; @@ -142,8 +154,8 @@ Result> FixedShapeTensorType::MakeArray( x--; } - auto ext_type = - fixed_shape_tensor(tensor->type(), cell_shape, permutation, tensor->dim_names()); + auto ext_type = internal::checked_pointer_cast( + fixed_shape_tensor(tensor->type(), cell_shape, permutation, tensor->dim_names())); std::shared_ptr arr; std::shared_ptr value_array; @@ -205,31 +217,32 @@ Result> FixedShapeTensorType::MakeArray( } Result> FixedShapeTensorType::ToTensor( - std::shared_ptr arr) const { + std::shared_ptr arr) { // To convert an array of n dimensional tensors to a n+1 dimensional tensor we // interpret the array's length as the first dimension the new tensor. Further, we // define n+1 dimensional tensor's strides by front appending a new stride to the n // dimensional tensor's strides. + ARROW_CHECK(is_tensor_supported(this->value_type_->id())); + ARROW_DCHECK_EQ(arr->null_count(), 0) << "Null values not supported in tensors."; auto ext_arr = internal::checked_pointer_cast( internal::checked_pointer_cast(arr)->storage()); - std::vector shape = shape_; + std::vector shape = this->shape(); shape.insert(shape.begin(), 1, arr->length()); - std::vector tensor_strides = strides(); + std::vector tensor_strides = this->strides(); tensor_strides.insert(tensor_strides.begin(), 1, arr->length() * tensor_strides[0]); std::shared_ptr buffer = ext_arr->values()->data()->buffers[1]; return *Tensor::Make(ext_arr->value_type(), buffer, shape, tensor_strides, dim_names()); } -std::shared_ptr fixed_shape_tensor( - const std::shared_ptr& value_type, const std::vector& shape, - const std::vector& permutation, const std::vector& dim_names) { - ARROW_CHECK(is_tensor_supported(value_type->id())); - +std::shared_ptr fixed_shape_tensor(const std::shared_ptr& value_type, + const std::vector& shape, + const std::vector& permutation, + const std::vector& dim_names) { if (!permutation.empty()) { ARROW_CHECK_EQ(shape.size(), permutation.size()) << "permutation.size() == " << permutation.size() @@ -251,17 +264,5 @@ const std::shared_ptr GetStorageType( return fixed_size_list(value_type, static_cast(size)); } -const std::vector ComputeStrides(const std::shared_ptr& value_type, - const std::vector& shape, - const std::vector& permutation) { - std::vector strides; - const auto& element_type = internal::checked_cast(*value_type); - DCHECK_OK(internal::ComputeRowMajorStrides(element_type, shape, &strides)); - if (!permutation.empty()) { - internal::Permute(permutation, &strides); - } - return strides; -} - } // namespace extension } // namespace arrow diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 9d32a2d0fa3..a54e82e147e 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -26,10 +26,6 @@ namespace extension { const std::shared_ptr GetStorageType( const std::shared_ptr& value_type, const std::vector& shape); -const std::vector ComputeStrides(const std::shared_ptr& value_type, - const std::vector& shape, - const std::vector& permutation); - class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; @@ -45,7 +41,6 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { : ExtensionType(GetStorageType(value_type, shape)), value_type_(value_type), shape_(shape), - strides_(ComputeStrides(value_type, shape, permutation)), permutation_(permutation), dim_names_(dim_names) {} @@ -59,7 +54,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { /// Strides of tensor elements. Strides state offset in bytes between adjacent /// elements along each dimension. - const std::vector& strides() const { return strides_; } + const std::vector& strides(); /// Permutation mapping from logical to physical memory layout of tensor elements const std::vector& permutation() const { return permutation_; } @@ -96,7 +91,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { /// remaining dimensions as the FixedShapeTensorType's element shape. /// /// \param[in] arr The FixedShapeTensorArray to convert to a Tensor - Result> ToTensor(std::shared_ptr arr) const; + Result> ToTensor(std::shared_ptr arr); private: std::shared_ptr storage_type_; @@ -108,7 +103,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { }; /// \brief Return a FixedShapeTensorType instance. -ARROW_EXPORT std::shared_ptr fixed_shape_tensor( +ARROW_EXPORT std::shared_ptr fixed_shape_tensor( const std::shared_ptr& storage_type, const std::vector& shape, const std::vector& permutation = {}, const std::vector& dim_names = {}); diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index a943906a3ff..6147817d109 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -42,7 +42,8 @@ class TestExtensionType : public ::testing::Test { value_type_ = int64(); cell_type_ = fixed_size_list(value_type_, 12); dim_names_ = {"x", "y"}; - ext_type_ = fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_); + ext_type_ = internal::checked_pointer_cast( + fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_)); values_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; values_partial_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, @@ -83,10 +84,9 @@ auto RoundtripBatch = [](const std::shared_ptr& batch, }; TEST_F(TestExtensionType, CheckDummyRegistration) { - // We need a dummy registration at runtime to allow for IPC deserialization - auto ext_type = fixed_shape_tensor(int64(), {}); - auto registered_type = GetExtensionType(ext_type->extension_name()); - ASSERT_TRUE(registered_type->Equals(*ext_type)); + // We need a registered dummy type at runtime to allow for IPC deserialization + auto registered_type = GetExtensionType("arrow.fixed_shape_tensor"); + ASSERT_TRUE(registered_type->type_id == Type::EXTENSION); } TEST_F(TestExtensionType, CreateExtensionType) { @@ -140,13 +140,15 @@ TEST_F(TestExtensionType, CreateFromTensor) { ASSERT_EQ(tensor->strides(), tensor_strides_); ASSERT_EQ(ext_arr->length(), shape_[0]); - auto ext_type_2 = fixed_shape_tensor(int64(), {3, 4}, {0, 1}); + auto ext_type_2 = internal::checked_pointer_cast( + fixed_shape_tensor(int64(), {3, 4}, {0, 1})); EXPECT_OK_AND_ASSIGN(auto ext_arr_2, ext_type_2->MakeArray(tensor)); ASSERT_OK_AND_ASSIGN( auto column_major_tensor, Tensor::Make(value_type_, Buffer::Wrap(values_), shape_, column_major_strides)); - auto ext_type_3 = fixed_shape_tensor(int64(), {3, 4}, {0, 1}); + auto ext_type_3 = internal::checked_pointer_cast( + fixed_shape_tensor(int64(), {3, 4}, {0, 1})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr( @@ -156,7 +158,8 @@ TEST_F(TestExtensionType, CreateFromTensor) { auto neither_major_tensor = std::make_shared(value_type_, Buffer::Wrap(values_), shape_, neither_major_strides); - auto ext_type_4 = fixed_shape_tensor(int64(), {3, 4}, {1, 0}); + auto ext_type_4 = internal::checked_pointer_cast( + fixed_shape_tensor(int64(), {3, 4}, {1, 0})); ASSERT_OK_AND_ASSIGN(auto ext_arr_4, ext_type_4->MakeArray(neither_major_tensor)); } @@ -206,15 +209,19 @@ void CheckSerializationRoundtrip(const std::shared_ptr& ext_type) } TEST_F(TestExtensionType, MetadataSerializationRoundtrip) { - CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {}, {}, {})); - CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {0}, {}, {})); - CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {1}, {0}, {"x"})); - CheckSerializationRoundtrip( - fixed_shape_tensor(value_type_, {256, 256, 3}, {0, 1, 2}, {"H", "W", "C"})); - CheckSerializationRoundtrip( - fixed_shape_tensor(value_type_, {256, 256, 3}, {2, 0, 1}, {"C", "H", "W"})); - - auto ext_type = fixed_shape_tensor(value_type_, cell_shape_, {0, 1}, dim_names_); + CheckSerializationRoundtrip(internal::checked_pointer_cast( + fixed_shape_tensor(value_type_, {}, {}, {}))); + CheckSerializationRoundtrip(internal::checked_pointer_cast( + fixed_shape_tensor(value_type_, {0}, {}, {}))); + CheckSerializationRoundtrip(internal::checked_pointer_cast( + fixed_shape_tensor(value_type_, {1}, {0}, {"x"}))); + CheckSerializationRoundtrip(internal::checked_pointer_cast( + fixed_shape_tensor(value_type_, {256, 256, 3}, {0, 1, 2}, {"H", "W", "C"}))); + CheckSerializationRoundtrip(internal::checked_pointer_cast( + fixed_shape_tensor(value_type_, {256, 256, 3}, {2, 0, 1}, {"C", "H", "W"}))); + + auto ext_type = internal::checked_pointer_cast( + fixed_shape_tensor(value_type_, cell_shape_, {0, 1}, dim_names_)); CheckSerializationRoundtrip(ext_type_); EXPECT_RAISES_WITH_MESSAGE_THAT( @@ -233,8 +240,6 @@ TEST_F(TestExtensionType, RoudtripBatch) { data->type = ext_type_; auto ext_arr = exact_ext_type->MakeArray(data); - ASSERT_OK(UnregisterExtensionType(ext_type_->extension_name())); - ASSERT_OK(RegisterExtensionType(ext_type_)); auto ext_metadata = key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, {"ARROW:extension:metadata", serialized_}}); @@ -252,8 +257,6 @@ TEST_F(TestExtensionType, RoudtripBatchFromTensor) { EXPECT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); ext_arr->data()->type = exact_ext_type; - ASSERT_OK(UnregisterExtensionType(ext_type_->extension_name())); - ASSERT_OK(RegisterExtensionType(ext_type_)); auto ext_metadata = key_value_metadata({{"ARROW:extension:name", ext_type_->extension_name()}, {"ARROW:extension:metadata", serialized_}}); @@ -267,27 +270,35 @@ TEST_F(TestExtensionType, RoudtripBatchFromTensor) { TEST_F(TestExtensionType, ComputeStrides) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); - auto ext_type_1 = fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_); - auto ext_type_2 = fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_); - auto ext_type_3 = fixed_shape_tensor(int32(), cell_shape_, {}, dim_names_); + auto ext_type_1 = internal::checked_pointer_cast( + fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_)); + auto ext_type_2 = internal::checked_pointer_cast( + fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_)); + auto ext_type_3 = internal::checked_pointer_cast( + fixed_shape_tensor(int32(), cell_shape_, {}, dim_names_)); ASSERT_TRUE(ext_type_1->Equals(*ext_type_2)); ASSERT_FALSE(ext_type_1->Equals(*ext_type_3)); - auto ext_type_4 = fixed_shape_tensor(int64(), {3, 4, 7}, {}, {"x", "y", "z"}); + auto ext_type_4 = internal::checked_pointer_cast( + fixed_shape_tensor(int64(), {3, 4, 7}, {}, {"x", "y", "z"})); ASSERT_EQ(ext_type_4->strides(), (std::vector{224, 56, 8})); - ext_type_4 = fixed_shape_tensor(int64(), {3, 4, 7}, {0, 1, 2}, {"x", "y", "z"}); + ext_type_4 = internal::checked_pointer_cast( + fixed_shape_tensor(int64(), {3, 4, 7}, {0, 1, 2}, {"x", "y", "z"})); ASSERT_EQ(ext_type_4->strides(), (std::vector{224, 56, 8})); - auto ext_type_5 = fixed_shape_tensor(int64(), {3, 4, 7}, {1, 0, 2}); + auto ext_type_5 = internal::checked_pointer_cast( + fixed_shape_tensor(int64(), {3, 4, 7}, {1, 0, 2})); ASSERT_EQ(ext_type_5->strides(), (std::vector{56, 224, 8})); ASSERT_EQ(ext_type_5->Serialize(), R"({"shape":[3,4,7],"permutation":[1,0,2]})"); - auto ext_type_6 = fixed_shape_tensor(int64(), {3, 4, 7}, {1, 2, 0}, {}); + auto ext_type_6 = internal::checked_pointer_cast( + fixed_shape_tensor(int64(), {3, 4, 7}, {1, 2, 0}, {})); ASSERT_EQ(ext_type_6->strides(), (std::vector{56, 8, 224})); ASSERT_EQ(ext_type_6->Serialize(), R"({"shape":[3,4,7],"permutation":[1,2,0]})"); - auto ext_type_7 = fixed_shape_tensor(int32(), {3, 4, 7}, {2, 0, 1}, {}); - ASSERT_EQ(ext_type_7->strides(), (std::vector{8, 224, 56})); + auto ext_type_7 = internal::checked_pointer_cast( + fixed_shape_tensor(int32(), {3, 4, 7}, {2, 0, 1}, {})); + ASSERT_EQ(ext_type_7->strides(), (std::vector{4, 112, 28})); ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); } diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 44bb0afa29f..122944d0272 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -145,8 +145,10 @@ static void CreateGlobalRegistry() { #ifdef ARROW_WITH_JSON // Register canonical extension types + auto ext_type = checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); + ARROW_CHECK_OK( - g_registry->RegisterType(arrow::extension::fixed_shape_tensor(int64(), {}))); + g_registry->RegisterType(ext_type)); #endif } From cf546bf3717ce8adc65865a91866e74d74023fb5 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 3 Mar 2023 15:06:44 +0100 Subject: [PATCH 09/39] Review feedback --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 22 +++++-- .../extension/fixed_shape_tensor_test.cc | 58 ++++++++++++------- cpp/src/arrow/extension_type.cc | 6 +- 3 files changed, 57 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 01bca800c8e..65fee350333 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -49,11 +49,23 @@ bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const { return false; } const auto& other_ext = static_cast(other); - bool equals = storage_type()->Equals(other_ext.storage_type()); - equals &= shape_ == other_ext.shape(); - equals &= permutation_ == other_ext.permutation(); - equals &= dim_names_ == other_ext.dim_names(); - return equals; + + auto is_permutation_trivial = [](const std::vector& permutation) { + for (size_t i = 1; i < permutation.size(); ++i) { + if (permutation[i - 1] + 1 != permutation[i]) { + return false; + } + } + return true; + }; + const bool permutation_equivalent = + (permutation_ == other_ext.permutation()) || + ((permutation_.empty() && is_permutation_trivial(other_ext.permutation())) && + (is_permutation_trivial(permutation_) || other_ext.permutation().empty())); + + return storage_type()->Equals(other_ext.storage_type()) && + shape_ == other_ext.shape() && dim_names_ == other_ext.dim_names() && + permutation_equivalent; } std::string FixedShapeTensorType::Serialize() const { diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 6147817d109..16495f6a2da 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -109,6 +109,10 @@ TEST_F(TestExtensionType, CreateExtensionType) { ASSERT_EQ(exact_ext_type->shape(), cell_shape_); ASSERT_EQ(exact_ext_type->strides(), cell_strides_); ASSERT_EQ(exact_ext_type->dim_names(), dim_names_); + + // TODO: Test invalid constructor input + // ASSERT_DEATH(fixed_shape_tensor(value_type_, cell_shape_, {0}), ""); + // ASSERT_DEATH(fixed_shape_tensor(value_type_, cell_shape_, {}, {"x"}), ""); } TEST_F(TestExtensionType, CreateFromArray) { @@ -201,32 +205,44 @@ TEST_F(TestExtensionType, SliceTensor) { ASSERT_EQ(sliced->length(), partial->length()); } -void CheckSerializationRoundtrip(const std::shared_ptr& ext_type) { - auto serialized = ext_type->Serialize(); +void CheckSerializationRoundtrip(const std::shared_ptr& ext_type) { + auto fst_type = internal::checked_pointer_cast(ext_type); + auto serialized = fst_type->Serialize(); ASSERT_OK_AND_ASSIGN(auto deserialized, - ext_type->Deserialize(ext_type->storage_type(), serialized)); - ASSERT_TRUE(ext_type->Equals(*deserialized)); + fst_type->Deserialize(fst_type->storage_type(), serialized)); + ASSERT_TRUE(fst_type->Equals(*deserialized)); +} + +void CheckDeserializationRaises(const std::shared_ptr& storage_type, + const std::string& serialized, + const std::string& expected_message) { + auto fst_type = internal::checked_pointer_cast( + fixed_shape_tensor(int64(), {3, 4})); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr(expected_message), + fst_type->Deserialize(storage_type, serialized)); } TEST_F(TestExtensionType, MetadataSerializationRoundtrip) { - CheckSerializationRoundtrip(internal::checked_pointer_cast( - fixed_shape_tensor(value_type_, {}, {}, {}))); - CheckSerializationRoundtrip(internal::checked_pointer_cast( - fixed_shape_tensor(value_type_, {0}, {}, {}))); - CheckSerializationRoundtrip(internal::checked_pointer_cast( - fixed_shape_tensor(value_type_, {1}, {0}, {"x"}))); - CheckSerializationRoundtrip(internal::checked_pointer_cast( - fixed_shape_tensor(value_type_, {256, 256, 3}, {0, 1, 2}, {"H", "W", "C"}))); - CheckSerializationRoundtrip(internal::checked_pointer_cast( - fixed_shape_tensor(value_type_, {256, 256, 3}, {2, 0, 1}, {"C", "H", "W"}))); - - auto ext_type = internal::checked_pointer_cast( - fixed_shape_tensor(value_type_, cell_shape_, {0, 1}, dim_names_)); CheckSerializationRoundtrip(ext_type_); - - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("Invalid: Expected FixedSizeList storage type"), - ext_type->Deserialize(boolean(), serialized_)); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {}, {}, {})); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {0}, {}, {})); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {1}, {0}, {"x"})); + CheckSerializationRoundtrip( + fixed_shape_tensor(value_type_, {256, 256, 3}, {0, 1, 2}, {"H", "W", "C"})); + CheckSerializationRoundtrip( + fixed_shape_tensor(value_type_, {256, 256, 3}, {2, 0, 1}, {"C", "H", "W"})); + + auto storage_type = fixed_size_list(int64(), 12); + CheckDeserializationRaises(boolean(), R"({"shape":[3,4]})", + "Expected FixedSizeList storage type, got bool"); + CheckDeserializationRaises(storage_type, R"({"dim_names":["x","y"]})", + "Invalid serialized JSON data"); + CheckDeserializationRaises(storage_type, R"({"shape":(3,4)})", + "Invalid serialized JSON data"); + CheckDeserializationRaises(storage_type, R"({"shape":[3,4],"permutation":[1,0,2]})", + "Invalid permutation"); + CheckDeserializationRaises(storage_type, R"({"shape":[3],"dim_names":["x","y"]})", + "Invalid dim_names"); } TEST_F(TestExtensionType, RoudtripBatch) { diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 122944d0272..542de7a09ee 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -145,10 +145,10 @@ static void CreateGlobalRegistry() { #ifdef ARROW_WITH_JSON // Register canonical extension types - auto ext_type = checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); + auto ext_type = + checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); - ARROW_CHECK_OK( - g_registry->RegisterType(ext_type)); + ARROW_CHECK_OK(g_registry->RegisterType(ext_type)); #endif } From 0b46adb1a6e523874e1f826cffd0579a4ca38a45 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 3 Mar 2023 22:12:47 +0100 Subject: [PATCH 10/39] Review feedback --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 60 +++++++++++-------- cpp/src/arrow/extension/fixed_shape_tensor.h | 10 ++++ .../extension/fixed_shape_tensor_test.cc | 9 ++- 3 files changed, 52 insertions(+), 27 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 65fee350333..91e3fe3804b 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -32,18 +32,6 @@ namespace rj = arrow::rapidjson; namespace arrow { namespace extension { -const std::vector& FixedShapeTensorType::strides() { - if (this->strides_.empty()) { - const auto& element_type = - internal::checked_cast(*value_type_); - DCHECK_OK(internal::ComputeRowMajorStrides(element_type, shape_, &strides_)); - if (!permutation_.empty()) { - internal::Permute(permutation_, &strides_); - } - } - return this->strides_; -} - bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const { if (extension_name() != other.extension_name()) { return false; @@ -251,24 +239,48 @@ Result> FixedShapeTensorType::ToTensor( return *Tensor::Make(ext_arr->value_type(), buffer, shape, tensor_strides, dim_names()); } -std::shared_ptr fixed_shape_tensor(const std::shared_ptr& value_type, - const std::vector& shape, - const std::vector& permutation, - const std::vector& dim_names) { - if (!permutation.empty()) { - ARROW_CHECK_EQ(shape.size(), permutation.size()) - << "permutation.size() == " << permutation.size() - << " must be empty or have the same length as shape.size() " << shape.size(); +Result> FixedShapeTensorType::Make( + const std::shared_ptr& value_type, const std::vector& shape, + const std::vector& permutation, const std::vector& dim_names) { + if (!permutation.empty() && shape.size() != permutation.size()) { + return Status::Invalid("permutation size must match shape size. Expected: ", + shape.size(), " Got: ", permutation.size()); } - if (!dim_names.empty()) { - ARROW_CHECK_EQ(shape.size(), dim_names.size()) - << "dim_names.size() == " << dim_names.size() - << " must be empty or have the same length as shape.size() " << shape.size(); + if (!dim_names.empty() && shape.size() != dim_names.size()) { + return Status::Invalid("dim_names size must match shape size. Expected: ", + shape.size(), " Got: ", dim_names.size()); } return std::make_shared(value_type, shape, permutation, dim_names); } +Status FixedShapeTensorType::ComputeStrides(const FixedShapeTensorType& type, + std::vector& strides) { + auto element_type = internal::checked_pointer_cast(type.value_type_); + ARROW_RETURN_NOT_OK( + internal::ComputeRowMajorStrides(*element_type.get(), type.shape(), &strides)); + if (!type.permutation().empty()) { + internal::Permute(type.permutation(), &strides); + } + return Status::OK(); +} + +const std::vector& FixedShapeTensorType::strides() { + if (strides_.empty()) { + ARROW_CHECK_OK(ComputeStrides(*this, strides_)); + } + return strides_; +} + +std::shared_ptr fixed_shape_tensor(const std::shared_ptr& value_type, + const std::vector& shape, + const std::vector& permutation, + const std::vector& dim_names) { + auto maybe_type = FixedShapeTensorType::Make(value_type, shape, permutation, dim_names); + ARROW_DCHECK_OK(maybe_type.status()); + return maybe_type.MoveValueUnsafe(); +} + const std::shared_ptr GetStorageType( const std::shared_ptr& value_type, const std::vector& shape) { const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index a54e82e147e..aa13200e6b4 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -93,6 +93,16 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { /// \param[in] arr The FixedShapeTensorArray to convert to a Tensor Result> ToTensor(std::shared_ptr arr); + /// \brief Create a FixedShapeTensorType instance + static Result> Make( + const std::shared_ptr& value_type, const std::vector& shape, + const std::vector& permutation = {}, + const std::vector& dim_names = {}); + + /// \brief Compute strides of FixedShapeTensorType + static Status ComputeStrides(const FixedShapeTensorType& type, + std::vector& strides); + private: std::shared_ptr storage_type_; std::shared_ptr value_type_; diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 16495f6a2da..d17905d65f7 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -110,9 +110,12 @@ TEST_F(TestExtensionType, CreateExtensionType) { ASSERT_EQ(exact_ext_type->strides(), cell_strides_); ASSERT_EQ(exact_ext_type->dim_names(), dim_names_); - // TODO: Test invalid constructor input - // ASSERT_DEATH(fixed_shape_tensor(value_type_, cell_shape_, {0}), ""); - // ASSERT_DEATH(fixed_shape_tensor(value_type_, cell_shape_, {}, {"x"}), ""); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Invalid: permutation size must match shape size."), + FixedShapeTensorType::Make(value_type_, cell_shape_, {0})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Invalid: dim_names size must match shape size."), + FixedShapeTensorType::Make(value_type_, cell_shape_, {}, {"x"})); } TEST_F(TestExtensionType, CreateFromArray) { From fa4aa3d41a7d2cd18f3799e6cdaff0026a38e5c3 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 6 Mar 2023 12:42:07 +0100 Subject: [PATCH 11/39] ComputeStrides returns Result --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 19 +++++++++++++------ cpp/src/arrow/extension/fixed_shape_tensor.h | 3 +-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 91e3fe3804b..7ce4d74fb4f 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -254,20 +254,27 @@ Result> FixedShapeTensorType::Make( dim_names); } -Status FixedShapeTensorType::ComputeStrides(const FixedShapeTensorType& type, - std::vector& strides) { - auto element_type = internal::checked_pointer_cast(type.value_type_); +Result> FixedShapeTensorType::ComputeStrides( + const FixedShapeTensorType& type) { + std::vector strides; + auto element_type = + internal::checked_pointer_cast(type.storage_type()); + auto value_type = + internal::checked_pointer_cast(element_type->value_type()); + ARROW_RETURN_NOT_OK( - internal::ComputeRowMajorStrides(*element_type.get(), type.shape(), &strides)); + internal::ComputeRowMajorStrides(*value_type.get(), type.shape(), &strides)); if (!type.permutation().empty()) { internal::Permute(type.permutation(), &strides); } - return Status::OK(); + return strides; } const std::vector& FixedShapeTensorType::strides() { if (strides_.empty()) { - ARROW_CHECK_OK(ComputeStrides(*this, strides_)); + auto maybe_strides = ComputeStrides(*this); + ARROW_CHECK_OK(maybe_strides.status()); + strides_ = maybe_strides.MoveValueUnsafe(); } return strides_; } diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index aa13200e6b4..342c788b96f 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -100,8 +100,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { const std::vector& dim_names = {}); /// \brief Compute strides of FixedShapeTensorType - static Status ComputeStrides(const FixedShapeTensorType& type, - std::vector& strides); + static Result> ComputeStrides(const FixedShapeTensorType& type); private: std::shared_ptr storage_type_; From 0852fc6159b9670be7809fddb8949efe7b4186e2 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 7 Mar 2023 15:09:07 +0100 Subject: [PATCH 12/39] Review feeback --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 26 ++++++++--- .../extension/fixed_shape_tensor_test.cc | 44 +++++++++++++++---- 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 7ce4d74fb4f..6a0e2b35433 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -21,6 +21,7 @@ #include "arrow/array/array_primitive.h" #include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep #include "arrow/tensor.h" +#include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" #include "arrow/util/sort.h" @@ -155,7 +156,7 @@ Result> FixedShapeTensorType::MakeArray( } auto ext_type = internal::checked_pointer_cast( - fixed_shape_tensor(tensor->type(), cell_shape, permutation, tensor->dim_names())); + fixed_shape_tensor(tensor->type(), cell_shape, permutation, dim_names())); std::shared_ptr arr; std::shared_ptr value_array; @@ -223,20 +224,33 @@ Result> FixedShapeTensorType::ToTensor( // define n+1 dimensional tensor's strides by front appending a new stride to the n // dimensional tensor's strides. - ARROW_CHECK(is_tensor_supported(this->value_type_->id())); - - ARROW_DCHECK_EQ(arr->null_count(), 0) << "Null values not supported in tensors."; + ARROW_RETURN_IF(arr->null_count() > 0, + Status::Invalid("Null values not supported in tensors.")); auto ext_arr = internal::checked_pointer_cast( internal::checked_pointer_cast(arr)->storage()); std::vector shape = this->shape(); shape.insert(shape.begin(), 1, arr->length()); + int64_t major_stride; std::vector tensor_strides = this->strides(); - tensor_strides.insert(tensor_strides.begin(), 1, arr->length() * tensor_strides[0]); + if (internal::MultiplyWithOverflow(arr->length(), tensor_strides[0], &major_stride)) { + return Status::Invalid("Overflow in tensor strides"); + } + tensor_strides.insert(tensor_strides.begin(), 1, major_stride); + + std::vector dim_names; + if (!this->dim_names().empty()) { + dim_names = this->dim_names(); + dim_names.insert(dim_names.begin(), 1, ""); + } else { + dim_names = {}; + } std::shared_ptr buffer = ext_arr->values()->data()->buffers[1]; - return *Tensor::Make(ext_arr->value_type(), buffer, shape, tensor_strides, dim_names()); + ARROW_ASSIGN_OR_RAISE(auto tensor, Tensor::Make(ext_arr->value_type(), buffer, shape, + tensor_strides, dim_names)); + return tensor; } Result> FixedShapeTensorType::Make( diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index d17905d65f7..26e27700d2c 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -124,7 +124,7 @@ TEST_F(TestExtensionType, CreateFromArray) { std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; auto arr_data = std::make_shared(value_type_, values_.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); - EXPECT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); auto data = fsla_arr->data(); data->type = ext_type_; auto ext_arr = exact_ext_type->MakeArray(data); @@ -140,7 +140,7 @@ TEST_F(TestExtensionType, CreateFromTensor) { Tensor::Make(value_type_, Buffer::Wrap(values_), shape_)); auto exact_ext_type = internal::checked_pointer_cast(ext_type_); - EXPECT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); + ASSERT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); ASSERT_OK(ext_arr->ValidateFull()); ASSERT_TRUE(tensor->is_row_major()); @@ -149,7 +149,7 @@ TEST_F(TestExtensionType, CreateFromTensor) { auto ext_type_2 = internal::checked_pointer_cast( fixed_shape_tensor(int64(), {3, 4}, {0, 1})); - EXPECT_OK_AND_ASSIGN(auto ext_arr_2, ext_type_2->MakeArray(tensor)); + ASSERT_OK_AND_ASSIGN(auto ext_arr_2, ext_type_2->MakeArray(tensor)); ASSERT_OK_AND_ASSIGN( auto column_major_tensor, @@ -168,15 +168,41 @@ TEST_F(TestExtensionType, CreateFromTensor) { auto ext_type_4 = internal::checked_pointer_cast( fixed_shape_tensor(int64(), {3, 4}, {1, 0})); ASSERT_OK_AND_ASSIGN(auto ext_arr_4, ext_type_4->MakeArray(neither_major_tensor)); + + auto ext_type_5 = internal::checked_pointer_cast( + fixed_shape_tensor(binary(), {1, 2})); + auto arr = ArrayFromJSON(binary(), R"(["abc", "def"])"); + + ASSERT_OK_AND_ASSIGN(auto fsla_arr, + FixedSizeListArray::FromArrays(arr, fixed_size_list(binary(), 1))); + auto data = fsla_arr->data(); + data->type = ext_type_5; + auto ext_arr_5 = ext_type_5->MakeArray(data); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("binary is not valid data type for a tensor"), + exact_ext_type->ToTensor(ext_arr_5)); + + auto ext_type_6 = internal::checked_pointer_cast( + fixed_shape_tensor(int64(), {1, 2})); + auto arr_with_null = ArrayFromJSON(int64(), "[0, null]"); + ASSERT_OK_AND_ASSIGN(auto fsla_arr_6, FixedSizeListArray::FromArrays( + arr_with_null, fixed_size_list(int64(), 1))); + auto data6 = fsla_arr_6->data(); + data6->type = ext_type_6; + data6->null_count = 1; + + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Null values not supported in tensors."), + ext_type_6->ToTensor(ext_type_6->MakeArray(data6))); } TEST_F(TestExtensionType, RoundtripTensor) { ASSERT_OK_AND_ASSIGN(auto tensor, Tensor::Make(value_type_, Buffer::Wrap(values_), shape_)); auto exact_ext_type = internal::checked_pointer_cast(ext_type_); - EXPECT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); + ASSERT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); - EXPECT_OK_AND_ASSIGN(auto tensor_from_array, exact_ext_type->ToTensor(ext_arr)); + ASSERT_OK_AND_ASSIGN(auto tensor_from_array, exact_ext_type->ToTensor(ext_arr)); ASSERT_EQ(tensor_from_array->shape(), tensor->shape()); ASSERT_EQ(tensor_from_array->strides(), tensor->strides()); ASSERT_TRUE(tensor->Equals(*tensor_from_array)); @@ -193,8 +219,8 @@ TEST_F(TestExtensionType, SliceTensor) { auto ext_type = fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_); auto exact_ext_type = internal::checked_pointer_cast(ext_type_); - EXPECT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); - EXPECT_OK_AND_ASSIGN(auto ext_arr_partial, exact_ext_type->MakeArray(tensor_partial)); + ASSERT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); + ASSERT_OK_AND_ASSIGN(auto ext_arr_partial, exact_ext_type->MakeArray(tensor_partial)); ASSERT_OK(ext_arr->ValidateFull()); ASSERT_OK(ext_arr_partial->ValidateFull()); @@ -254,7 +280,7 @@ TEST_F(TestExtensionType, RoudtripBatch) { std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; auto arr_data = std::make_shared(value_type_, values_.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); - EXPECT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); auto data = fsla_arr->data(); data->type = ext_type_; auto ext_arr = exact_ext_type->MakeArray(data); @@ -273,7 +299,7 @@ TEST_F(TestExtensionType, RoudtripBatchFromTensor) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); ASSERT_OK_AND_ASSIGN(auto tensor, Tensor::Make(value_type_, Buffer::Wrap(values_), shape_, {}, dim_names_)); - EXPECT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); + ASSERT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); ext_arr->data()->type = exact_ext_type; auto ext_metadata = From d116b460901b1477e147d92c41482b07d115879a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 9 Mar 2023 14:06:06 +0100 Subject: [PATCH 13/39] Add Equals tests --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 10 +++++----- .../extension/fixed_shape_tensor_test.cc | 20 +++++++++++++++++++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 6a0e2b35433..bf0a87c998e 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -48,12 +48,12 @@ bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const { return true; }; const bool permutation_equivalent = - (permutation_ == other_ext.permutation()) || - ((permutation_.empty() && is_permutation_trivial(other_ext.permutation())) && - (is_permutation_trivial(permutation_) || other_ext.permutation().empty())); + ((permutation_ == other_ext.permutation()) || + (permutation_.empty() && is_permutation_trivial(other_ext.permutation())) || + (is_permutation_trivial(permutation_) && other_ext.permutation().empty())); - return storage_type()->Equals(other_ext.storage_type()) && - shape_ == other_ext.shape() && dim_names_ == other_ext.dim_names() && + return (storage_type()->Equals(other_ext.storage_type())) && + (shape_ == other_ext.shape()) && (dim_names_ == other_ext.dim_names()) && permutation_equivalent; } diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 26e27700d2c..fea8460c6b7 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -118,6 +118,26 @@ TEST_F(TestExtensionType, CreateExtensionType) { FixedShapeTensorType::Make(value_type_, cell_shape_, {}, {"x"})); } +TEST_F(TestExtensionType, EqualsCases) { + auto ext_type_permutation_1 = fixed_shape_tensor(int64(), {3, 4}, {0, 1}, {"x", "y"}); + auto ext_type_permutation_2 = fixed_shape_tensor(int64(), {3, 4}, {1, 0}, {"x", "y"}); + auto ext_type_no_permutation = fixed_shape_tensor(int64(), {3, 4}, {}, {"x", "y"}); + + ASSERT_TRUE(ext_type_permutation_1->Equals(ext_type_permutation_1)); + + ASSERT_FALSE(fixed_shape_tensor(int32(), {3, 4}, {}, {"x", "y"}) + ->Equals(ext_type_no_permutation)); + ASSERT_FALSE(fixed_shape_tensor(int64(), {2, 4}, {}, {"x", "y"}) + ->Equals(ext_type_no_permutation)); + ASSERT_FALSE(fixed_shape_tensor(int64(), {3, 4}, {}, {"H", "W"}) + ->Equals(ext_type_no_permutation)); + + ASSERT_TRUE(ext_type_no_permutation->Equals(ext_type_permutation_1)); + ASSERT_TRUE(ext_type_permutation_1->Equals(ext_type_no_permutation)); + ASSERT_FALSE(ext_type_permutation_1->Equals(ext_type_permutation_2)); + ASSERT_FALSE(ext_type_permutation_2->Equals(ext_type_permutation_1)); +} + TEST_F(TestExtensionType, CreateFromArray) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); From 80f6b31d069a78c3cb9b4b01f953fbb54e20f81f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 9 Mar 2023 14:55:07 +0100 Subject: [PATCH 14/39] Add FromTensor --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 22 ++++++++--------- cpp/src/arrow/extension/fixed_shape_tensor.h | 21 ++++++++-------- .../extension/fixed_shape_tensor_test.cc | 24 +++++++++++-------- 3 files changed, 35 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index bf0a87c998e..8484bf7b6e8 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -135,28 +135,28 @@ std::shared_ptr FixedShapeTensorType::MakeArray( return std::make_shared(data); } -Result> FixedShapeTensorType::MakeArray( - std::shared_ptr tensor) const { +Result> FixedShapeTensorArray::FromTensor( + const std::shared_ptr& tensor) { + auto cell_shape = tensor->shape(); + cell_shape.erase(cell_shape.begin()); + + std::vector dim_names; + std::copy(tensor->dim_names().begin() + 1, tensor->dim_names().end(), + std::back_inserter(dim_names)); + auto permutation = internal::ArgSort(tensor->strides()); std::reverse(permutation.begin(), permutation.end()); if (permutation[0] != 0) { return Status::Invalid( "Only first-major tensors can be zero-copy converted to arrays"); } - - auto cell_shape = tensor->shape(); - cell_shape.erase(cell_shape.begin()); - if (cell_shape != shape_) { - return Status::Invalid("Expected cell shape does not match input tensor shape"); - } - permutation.erase(permutation.begin()); for (auto& x : permutation) { x--; } auto ext_type = internal::checked_pointer_cast( - fixed_shape_tensor(tensor->type(), cell_shape, permutation, dim_names())); + fixed_shape_tensor(tensor->type(), cell_shape, permutation, dim_names)); std::shared_ptr arr; std::shared_ptr value_array; @@ -214,7 +214,7 @@ Result> FixedShapeTensorType::MakeArray( value_array); auto ext_data = arr->data(); ext_data->type = ext_type; - return MakeArray(ext_data); + return ext_type->MakeArray(ext_data); } Result> FixedShapeTensorType::ToTensor( diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 342c788b96f..cf110a8fa07 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -29,6 +29,16 @@ const std::shared_ptr GetStorageType( class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; + + /// \brief Create a FixedShapeTensorArray from a Tensor + /// + /// This function will create a FixedShapeTensorArray from a Tensor, taking its + /// first dimension as the "element dimension" and the remaining dimensions as the + /// "tensor dimensions". If Tensor provides strides, they will be used to determine + /// dimension permutation. Otherwise, row-major permutation will be assumed. + /// + /// \param[in] tensor The Tensor to convert to a FixedShapeTensorArray + static Result> FromTensor(const std::shared_ptr& tensor); }; /// \brief Concrete type class for constant-size Tensor data. @@ -73,17 +83,6 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { /// Create a FixedShapeTensorArray from ArrayData std::shared_ptr MakeArray(std::shared_ptr data) const override; - /// \brief Create a FixedShapeTensorArray from a Tensor - /// - /// This function will create a FixedShapeTensorArray from a Tensor, taking its - /// first dimension as the "element dimension" and the remaining dimensions as the - /// "tensor dimensions". The tensor dimensions must match the FixedShapeTensorType's - /// element shape. This function assumes that the tensor's memory layout is - /// row-major. - /// - /// \param[in] tensor The Tensor to convert to a FixedShapeTensorArray - Result> MakeArray(std::shared_ptr tensor) const; - /// \brief Create a Tensor from FixedShapeTensorArray /// /// This function will create a Tensor from a FixedShapeTensorArray, setting its diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index fea8460c6b7..2ad40cdb7c2 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -33,6 +33,7 @@ namespace arrow { using FixedShapeTensorType = extension::FixedShapeTensorType; using extension::fixed_shape_tensor; +using extension::FixedShapeTensorArray; class TestExtensionType : public ::testing::Test { public: @@ -160,7 +161,7 @@ TEST_F(TestExtensionType, CreateFromTensor) { Tensor::Make(value_type_, Buffer::Wrap(values_), shape_)); auto exact_ext_type = internal::checked_pointer_cast(ext_type_); - ASSERT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); + ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); ASSERT_OK(ext_arr->ValidateFull()); ASSERT_TRUE(tensor->is_row_major()); @@ -169,7 +170,7 @@ TEST_F(TestExtensionType, CreateFromTensor) { auto ext_type_2 = internal::checked_pointer_cast( fixed_shape_tensor(int64(), {3, 4}, {0, 1})); - ASSERT_OK_AND_ASSIGN(auto ext_arr_2, ext_type_2->MakeArray(tensor)); + ASSERT_OK_AND_ASSIGN(auto ext_arr_2, FixedShapeTensorArray::FromTensor(tensor)); ASSERT_OK_AND_ASSIGN( auto column_major_tensor, @@ -180,14 +181,16 @@ TEST_F(TestExtensionType, CreateFromTensor) { Invalid, testing::HasSubstr( "Invalid: Only first-major tensors can be zero-copy converted to arrays"), - ext_type_3->MakeArray(column_major_tensor)); - ASSERT_THAT(ext_type_3->MakeArray(column_major_tensor), Raises(StatusCode::Invalid)); + FixedShapeTensorArray::FromTensor(column_major_tensor)); + ASSERT_THAT(FixedShapeTensorArray::FromTensor(column_major_tensor), + Raises(StatusCode::Invalid)); auto neither_major_tensor = std::make_shared(value_type_, Buffer::Wrap(values_), shape_, neither_major_strides); auto ext_type_4 = internal::checked_pointer_cast( fixed_shape_tensor(int64(), {3, 4}, {1, 0})); - ASSERT_OK_AND_ASSIGN(auto ext_arr_4, ext_type_4->MakeArray(neither_major_tensor)); + ASSERT_OK_AND_ASSIGN(auto ext_arr_4, + FixedShapeTensorArray::FromTensor(neither_major_tensor)); auto ext_type_5 = internal::checked_pointer_cast( fixed_shape_tensor(binary(), {1, 2})); @@ -220,7 +223,7 @@ TEST_F(TestExtensionType, RoundtripTensor) { ASSERT_OK_AND_ASSIGN(auto tensor, Tensor::Make(value_type_, Buffer::Wrap(values_), shape_)); auto exact_ext_type = internal::checked_pointer_cast(ext_type_); - ASSERT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); + ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); ASSERT_OK_AND_ASSIGN(auto tensor_from_array, exact_ext_type->ToTensor(ext_arr)); ASSERT_EQ(tensor_from_array->shape(), tensor->shape()); @@ -239,8 +242,9 @@ TEST_F(TestExtensionType, SliceTensor) { auto ext_type = fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_); auto exact_ext_type = internal::checked_pointer_cast(ext_type_); - ASSERT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); - ASSERT_OK_AND_ASSIGN(auto ext_arr_partial, exact_ext_type->MakeArray(tensor_partial)); + ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); + ASSERT_OK_AND_ASSIGN(auto ext_arr_partial, + FixedShapeTensorArray::FromTensor(tensor_partial)); ASSERT_OK(ext_arr->ValidateFull()); ASSERT_OK(ext_arr_partial->ValidateFull()); @@ -318,8 +322,8 @@ TEST_F(TestExtensionType, RoudtripBatch) { TEST_F(TestExtensionType, RoudtripBatchFromTensor) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); ASSERT_OK_AND_ASSIGN(auto tensor, Tensor::Make(value_type_, Buffer::Wrap(values_), - shape_, {}, dim_names_)); - ASSERT_OK_AND_ASSIGN(auto ext_arr, exact_ext_type->MakeArray(tensor)); + shape_, {}, {"n", "x", "y"})); + ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); ext_arr->data()->type = exact_ext_type; auto ext_metadata = From d1b6e638cc0e1e8ada5a51484a602e4033457e7a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 9 Mar 2023 15:04:48 +0100 Subject: [PATCH 15/39] Review feedback. --- cpp/src/arrow/extension/fixed_shape_tensor.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index cf110a8fa07..ec37d44db79 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -63,7 +63,8 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { const std::vector& shape() const { return shape_; } /// Strides of tensor elements. Strides state offset in bytes between adjacent - /// elements along each dimension. + /// elements along each dimension. In case permutation is non-empty strides are + /// computed according to logical layout. const std::vector& strides(); /// Permutation mapping from logical to physical memory layout of tensor elements From 20049e34f1e749c18433b18104420dcc2cd5244c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 9 Mar 2023 19:13:15 +0100 Subject: [PATCH 16/39] Change to vector copy --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 8484bf7b6e8..416266c1710 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -141,8 +141,9 @@ Result> FixedShapeTensorArray::FromTensor( cell_shape.erase(cell_shape.begin()); std::vector dim_names; - std::copy(tensor->dim_names().begin() + 1, tensor->dim_names().end(), - std::back_inserter(dim_names)); + for (size_t i = 1; i < tensor->dim_names().size(); ++i) { + dim_names.emplace_back(tensor->dim_names()[i]); + } auto permutation = internal::ArgSort(tensor->strides()); std::reverse(permutation.begin(), permutation.end()); From 9830232092a91dac15c114e4d6e1878027b8feef Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 14 Mar 2023 02:43:00 +0100 Subject: [PATCH 17/39] Tensor roundtrip --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 27 +++++-- .../extension/fixed_shape_tensor_test.cc | 73 ++++++++++++++++--- 2 files changed, 83 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 416266c1710..9d1f519779b 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -53,7 +53,7 @@ bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const { (is_permutation_trivial(permutation_) && other_ext.permutation().empty())); return (storage_type()->Equals(other_ext.storage_type())) && - (shape_ == other_ext.shape()) && (dim_names_ == other_ext.dim_names()) && + (this->shape() == other_ext.shape()) && (dim_names_ == other_ext.dim_names()) && permutation_equivalent; } @@ -155,6 +155,7 @@ Result> FixedShapeTensorArray::FromTensor( for (auto& x : permutation) { x--; } + internal::Permute(permutation, &cell_shape); auto ext_type = internal::checked_pointer_cast( fixed_shape_tensor(tensor->type(), cell_shape, permutation, dim_names)); @@ -227,17 +228,25 @@ Result> FixedShapeTensorType::ToTensor( ARROW_RETURN_IF(arr->null_count() > 0, Status::Invalid("Null values not supported in tensors.")); + auto ext_arr = internal::checked_pointer_cast( internal::checked_pointer_cast(arr)->storage()); + ARROW_RETURN_IF(!is_fixed_width(ext_arr->value_type()->id()), + Status::Invalid(ext_arr->value_type()->ToString(), + " is not valid data type for a tensor")); std::vector shape = this->shape(); shape.insert(shape.begin(), 1, arr->length()); int64_t major_stride; std::vector tensor_strides = this->strides(); - if (internal::MultiplyWithOverflow(arr->length(), tensor_strides[0], &major_stride)) { + if (internal::MultiplyWithOverflow(ext_arr->value_type()->byte_width(), + ext_arr->list_type()->list_size(), &major_stride)) { return Status::Invalid("Overflow in tensor strides"); } + if (!this->permutation().empty()) { + internal::Permute(this->permutation(), &tensor_strides); + } tensor_strides.insert(tensor_strides.begin(), 1, major_stride); std::vector dim_names; @@ -277,11 +286,12 @@ Result> FixedShapeTensorType::ComputeStrides( auto value_type = internal::checked_pointer_cast(element_type->value_type()); - ARROW_RETURN_NOT_OK( - internal::ComputeRowMajorStrides(*value_type.get(), type.shape(), &strides)); + auto shape = type.shape(); if (!type.permutation().empty()) { - internal::Permute(type.permutation(), &strides); + internal::Permute(type.permutation(), &shape); } + ARROW_RETURN_NOT_OK( + internal::ComputeRowMajorStrides(*value_type.get(), shape, &strides)); return strides; } @@ -298,7 +308,12 @@ std::shared_ptr fixed_shape_tensor(const std::shared_ptr& va const std::vector& shape, const std::vector& permutation, const std::vector& dim_names) { - auto maybe_type = FixedShapeTensorType::Make(value_type, shape, permutation, dim_names); + std::vector shape_ = std::move(shape); + if (!permutation.empty()) { + internal::Permute(permutation, &shape_); + } + auto maybe_type = + FixedShapeTensorType::Make(value_type, shape_, permutation, dim_names); ARROW_DCHECK_OK(maybe_type.status()); return maybe_type.MoveValueUnsafe(); } diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 2ad40cdb7c2..38bd8fe10d0 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -219,18 +219,69 @@ TEST_F(TestExtensionType, CreateFromTensor) { ext_type_6->ToTensor(ext_type_6->MakeArray(data6))); } -TEST_F(TestExtensionType, RoundtripTensor) { - ASSERT_OK_AND_ASSIGN(auto tensor, - Tensor::Make(value_type_, Buffer::Wrap(values_), shape_)); - auto exact_ext_type = internal::checked_pointer_cast(ext_type_); +void CheckTensorRoundtrip(const std::shared_ptr& tensor, + std::shared_ptr expected_ext_type) { + auto ext_type = internal::checked_pointer_cast(expected_ext_type); ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); - - ASSERT_OK_AND_ASSIGN(auto tensor_from_array, exact_ext_type->ToTensor(ext_arr)); - ASSERT_EQ(tensor_from_array->shape(), tensor->shape()); - ASSERT_EQ(tensor_from_array->strides(), tensor->strides()); + auto generated_ext_type = + internal::checked_pointer_cast(ext_arr->type()); + + // Check that generated type is equal to the expected type + ASSERT_EQ(generated_ext_type->type_name(), ext_type->type_name()); + ASSERT_EQ(generated_ext_type->shape(), ext_type->shape()); + ASSERT_EQ(generated_ext_type->dim_names(), ext_type->dim_names()); + ASSERT_EQ(generated_ext_type->permutation(), ext_type->permutation()); + ASSERT_TRUE(generated_ext_type->storage_type()->Equals(*ext_type->storage_type())); + ASSERT_TRUE(generated_ext_type->Equals(ext_type)); + + // Check Tensor roundtrip + ASSERT_OK_AND_ASSIGN(auto tensor_from_array, generated_ext_type->ToTensor(ext_arr)); + ASSERT_EQ(tensor->type(), tensor_from_array->type()); + ASSERT_EQ(tensor->shape(), tensor_from_array->shape()); + for (size_t i = 1; i < tensor->dim_names().size(); i++) { + ASSERT_EQ(tensor->dim_names()[i], tensor_from_array->dim_names()[i]); + } + ASSERT_EQ(tensor->strides(), tensor_from_array->strides()); + ASSERT_TRUE(tensor->data()->Equals(*tensor_from_array->data())); ASSERT_TRUE(tensor->Equals(*tensor_from_array)); } +TEST_F(TestExtensionType, RoundtripTensor) { + auto values = Buffer::Wrap(values_); + ASSERT_OK_AND_ASSIGN(auto tensor1, Tensor::Make(value_type_, values, {3, 3, 4}, + {96, 32, 8}, {"", "y", "z"})); + ASSERT_OK_AND_ASSIGN(auto tensor2, + Tensor::Make(value_type_, values, {3, 3, 4}, {96, 8, 24})); + ASSERT_OK_AND_ASSIGN(auto tensor3, + Tensor::Make(value_type_, values, {3, 4, 3}, {96, 24, 8})); + ASSERT_OK_AND_ASSIGN(auto tensor4, + Tensor::Make(value_type_, values, {3, 4, 3}, {96, 8, 32})); + ASSERT_OK_AND_ASSIGN(auto tensor5, + Tensor::Make(value_type_, values, {6, 2, 3}, {48, 24, 8})); + ASSERT_OK_AND_ASSIGN(auto tensor6, + Tensor::Make(value_type_, values, {6, 2, 3}, {48, 8, 16})); + ASSERT_OK_AND_ASSIGN(auto tensor7, + Tensor::Make(value_type_, values, {2, 3, 6}, {144, 48, 8})); + ASSERT_OK_AND_ASSIGN(auto tensor8, + Tensor::Make(value_type_, values, {2, 3, 6}, {144, 8, 24})); + ASSERT_OK_AND_ASSIGN(auto tensor9, + Tensor::Make(value_type_, values, {2, 3, 2, 3}, {144, 48, 24, 8})); + ASSERT_OK_AND_ASSIGN(auto tensor10, + Tensor::Make(value_type_, values, {2, 3, 2, 3}, {144, 8, 24, 48})); + + CheckTensorRoundtrip(tensor1, + fixed_shape_tensor(value_type_, {3, 4}, {0, 1}, {"y", "z"})); + CheckTensorRoundtrip(tensor2, fixed_shape_tensor(value_type_, {4, 3}, {1, 0}, {})); + CheckTensorRoundtrip(tensor3, fixed_shape_tensor(value_type_, {4, 3}, {0, 1})); + CheckTensorRoundtrip(tensor4, fixed_shape_tensor(value_type_, {3, 4}, {1, 0})); + CheckTensorRoundtrip(tensor5, fixed_shape_tensor(value_type_, {2, 3}, {0, 1})); + CheckTensorRoundtrip(tensor6, fixed_shape_tensor(value_type_, {3, 2}, {1, 0})); + CheckTensorRoundtrip(tensor7, fixed_shape_tensor(value_type_, {3, 6}, {0, 1})); + CheckTensorRoundtrip(tensor8, fixed_shape_tensor(value_type_, {6, 3}, {1, 0})); + CheckTensorRoundtrip(tensor9, fixed_shape_tensor(value_type_, {3, 2, 3}, {0, 1, 2})); + CheckTensorRoundtrip(tensor10, fixed_shape_tensor(value_type_, {3, 2, 3}, {2, 1, 0})); +} + TEST_F(TestExtensionType, SliceTensor) { ASSERT_OK_AND_ASSIGN(auto tensor, Tensor::Make(value_type_, Buffer::Wrap(values_), shape_)); @@ -357,17 +408,17 @@ TEST_F(TestExtensionType, ComputeStrides) { auto ext_type_5 = internal::checked_pointer_cast( fixed_shape_tensor(int64(), {3, 4, 7}, {1, 0, 2})); - ASSERT_EQ(ext_type_5->strides(), (std::vector{56, 224, 8})); + ASSERT_EQ(ext_type_5->strides(), (std::vector{168, 56, 8})); ASSERT_EQ(ext_type_5->Serialize(), R"({"shape":[3,4,7],"permutation":[1,0,2]})"); auto ext_type_6 = internal::checked_pointer_cast( fixed_shape_tensor(int64(), {3, 4, 7}, {1, 2, 0}, {})); - ASSERT_EQ(ext_type_6->strides(), (std::vector{56, 8, 224})); + ASSERT_EQ(ext_type_6->strides(), (std::vector{168, 24, 8})); ASSERT_EQ(ext_type_6->Serialize(), R"({"shape":[3,4,7],"permutation":[1,2,0]})"); auto ext_type_7 = internal::checked_pointer_cast( fixed_shape_tensor(int32(), {3, 4, 7}, {2, 0, 1}, {})); - ASSERT_EQ(ext_type_7->strides(), (std::vector{4, 112, 28})); + ASSERT_EQ(ext_type_7->strides(), (std::vector{48, 16, 4})); ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); } From 5f13efe1d48793797c6c4181c0a1efaff2cf18b4 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 17 Mar 2023 18:31:39 +0100 Subject: [PATCH 18/39] Change to strides() docstring --- cpp/src/arrow/extension/fixed_shape_tensor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index ec37d44db79..bb6686e15f7 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -64,7 +64,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { /// Strides of tensor elements. Strides state offset in bytes between adjacent /// elements along each dimension. In case permutation is non-empty strides are - /// computed according to logical layout. + /// computed from permuted tensor element's shape. const std::vector& strides(); /// Permutation mapping from logical to physical memory layout of tensor elements From 8ffadeea6aaeb882674abe0170e91010241eb0e5 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 17 Mar 2023 20:08:24 +0100 Subject: [PATCH 19/39] Element shape not permuted --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 8 +------- cpp/src/arrow/extension/fixed_shape_tensor_test.cc | 8 ++++---- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 9d1f519779b..999ba2bea56 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -155,7 +155,6 @@ Result> FixedShapeTensorArray::FromTensor( for (auto& x : permutation) { x--; } - internal::Permute(permutation, &cell_shape); auto ext_type = internal::checked_pointer_cast( fixed_shape_tensor(tensor->type(), cell_shape, permutation, dim_names)); @@ -308,12 +307,7 @@ std::shared_ptr fixed_shape_tensor(const std::shared_ptr& va const std::vector& shape, const std::vector& permutation, const std::vector& dim_names) { - std::vector shape_ = std::move(shape); - if (!permutation.empty()) { - internal::Permute(permutation, &shape_); - } - auto maybe_type = - FixedShapeTensorType::Make(value_type, shape_, permutation, dim_names); + auto maybe_type = FixedShapeTensorType::Make(value_type, shape, permutation, dim_names); ARROW_DCHECK_OK(maybe_type.status()); return maybe_type.MoveValueUnsafe(); } diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 38bd8fe10d0..74f79faf487 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -271,13 +271,13 @@ TEST_F(TestExtensionType, RoundtripTensor) { CheckTensorRoundtrip(tensor1, fixed_shape_tensor(value_type_, {3, 4}, {0, 1}, {"y", "z"})); - CheckTensorRoundtrip(tensor2, fixed_shape_tensor(value_type_, {4, 3}, {1, 0}, {})); + CheckTensorRoundtrip(tensor2, fixed_shape_tensor(value_type_, {3, 4}, {1, 0}, {})); CheckTensorRoundtrip(tensor3, fixed_shape_tensor(value_type_, {4, 3}, {0, 1})); - CheckTensorRoundtrip(tensor4, fixed_shape_tensor(value_type_, {3, 4}, {1, 0})); + CheckTensorRoundtrip(tensor4, fixed_shape_tensor(value_type_, {4, 3}, {1, 0})); CheckTensorRoundtrip(tensor5, fixed_shape_tensor(value_type_, {2, 3}, {0, 1})); - CheckTensorRoundtrip(tensor6, fixed_shape_tensor(value_type_, {3, 2}, {1, 0})); + CheckTensorRoundtrip(tensor6, fixed_shape_tensor(value_type_, {2, 3}, {1, 0})); CheckTensorRoundtrip(tensor7, fixed_shape_tensor(value_type_, {3, 6}, {0, 1})); - CheckTensorRoundtrip(tensor8, fixed_shape_tensor(value_type_, {6, 3}, {1, 0})); + CheckTensorRoundtrip(tensor8, fixed_shape_tensor(value_type_, {3, 6}, {1, 0})); CheckTensorRoundtrip(tensor9, fixed_shape_tensor(value_type_, {3, 2, 3}, {0, 1, 2})); CheckTensorRoundtrip(tensor10, fixed_shape_tensor(value_type_, {3, 2, 3}, {2, 1, 0})); } From 47408861c0a858019ffe4686ae2c906f77a90bb2 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 28 Mar 2023 14:09:47 +0200 Subject: [PATCH 20/39] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 5 +++-- cpp/src/arrow/extension/fixed_shape_tensor.h | 2 +- cpp/src/arrow/extension/fixed_shape_tensor_test.cc | 2 ++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 999ba2bea56..d0d09291ac8 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -225,8 +225,9 @@ Result> FixedShapeTensorType::ToTensor( // define n+1 dimensional tensor's strides by front appending a new stride to the n // dimensional tensor's strides. - ARROW_RETURN_IF(arr->null_count() > 0, - Status::Invalid("Null values not supported in tensors.")); + if (arr->null_count() > 0) { + return Status::Invalid("Null values not supported in tensors."); + } auto ext_arr = internal::checked_pointer_cast( internal::checked_pointer_cast(arr)->storage()); diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index bb6686e15f7..d6260475dcd 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -35,7 +35,7 @@ class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { /// This function will create a FixedShapeTensorArray from a Tensor, taking its /// first dimension as the "element dimension" and the remaining dimensions as the /// "tensor dimensions". If Tensor provides strides, they will be used to determine - /// dimension permutation. Otherwise, row-major permutation will be assumed. + /// dimension permutation. Otherwise, row-major layout (i.e. no permutation) will be assumed. /// /// \param[in] tensor The Tensor to convert to a FixedShapeTensorArray static Result> FromTensor(const std::shared_ptr& tensor); diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 74f79faf487..15ac0e74f9c 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -135,6 +135,8 @@ TEST_F(TestExtensionType, EqualsCases) { ASSERT_TRUE(ext_type_no_permutation->Equals(ext_type_permutation_1)); ASSERT_TRUE(ext_type_permutation_1->Equals(ext_type_no_permutation)); + ASSERT_FALSE(ext_type_no_permutation->Equals(ext_type_permutation_2)); + ASSERT_FALSE(ext_type_permutation_2->Equals(ext_type_no_permutation)); ASSERT_FALSE(ext_type_permutation_1->Equals(ext_type_permutation_2)); ASSERT_FALSE(ext_type_permutation_2->Equals(ext_type_permutation_1)); } From 789fc5b5c8c386ba942043ff35d094dd5b84ff46 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 28 Mar 2023 22:45:46 +0200 Subject: [PATCH 21/39] ToTensor as method of FixedShapeTensorArray --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 48 ++++++++----------- cpp/src/arrow/extension/fixed_shape_tensor.h | 24 +++++----- .../extension/fixed_shape_tensor_test.cc | 26 ++++------ 3 files changed, 42 insertions(+), 56 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index d0d09291ac8..b54657e015c 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -135,7 +135,7 @@ std::shared_ptr FixedShapeTensorType::MakeArray( return std::make_shared(data); } -Result> FixedShapeTensorArray::FromTensor( +Result> FixedShapeTensorArray::FromTensor( const std::shared_ptr& tensor) { auto cell_shape = tensor->shape(); cell_shape.erase(cell_shape.begin()); @@ -159,7 +159,6 @@ Result> FixedShapeTensorArray::FromTensor( auto ext_type = internal::checked_pointer_cast( fixed_shape_tensor(tensor->type(), cell_shape, permutation, dim_names)); - std::shared_ptr arr; std::shared_ptr value_array; switch (tensor->type_id()) { case Type::UINT8: { @@ -211,55 +210,50 @@ Result> FixedShapeTensorArray::FromTensor( tensor->type()->ToString()); } } - arr = std::make_shared(ext_type->storage_type(), tensor->shape()[0], - value_array); - auto ext_data = arr->data(); - ext_data->type = ext_type; - return ext_type->MakeArray(ext_data); + auto cell_size = tensor->size() / tensor->shape()[0]; + ARROW_ASSIGN_OR_RAISE(std::shared_ptr arr, + FixedSizeListArray::FromArrays(value_array, cell_size)); + std::shared_ptr ext_arr = ExtensionType::WrapArray(ext_type, arr); + return std::reinterpret_pointer_cast(ext_arr); } -Result> FixedShapeTensorType::ToTensor( - std::shared_ptr arr) { +const Result> FixedShapeTensorArray::ToTensor() const { // To convert an array of n dimensional tensors to a n+1 dimensional tensor we // interpret the array's length as the first dimension the new tensor. Further, we // define n+1 dimensional tensor's strides by front appending a new stride to the n // dimensional tensor's strides. - if (arr->null_count() > 0) { - return Status::Invalid("Null values not supported in tensors."); - } - - auto ext_arr = internal::checked_pointer_cast( - internal::checked_pointer_cast(arr)->storage()); - ARROW_RETURN_IF(!is_fixed_width(ext_arr->value_type()->id()), + auto ext_arr = internal::checked_pointer_cast(this->storage()); + auto ext_type = internal::checked_pointer_cast(this->type()); + ARROW_RETURN_IF(!is_fixed_width(*ext_arr->value_type()), Status::Invalid(ext_arr->value_type()->ToString(), " is not valid data type for a tensor")); - - std::vector shape = this->shape(); - shape.insert(shape.begin(), 1, arr->length()); + std::vector shape = ext_type->shape(); + shape.insert(shape.begin(), 1, this->length()); int64_t major_stride; - std::vector tensor_strides = this->strides(); + std::vector tensor_strides = ext_type->strides(); if (internal::MultiplyWithOverflow(ext_arr->value_type()->byte_width(), ext_arr->list_type()->list_size(), &major_stride)) { return Status::Invalid("Overflow in tensor strides"); } - if (!this->permutation().empty()) { - internal::Permute(this->permutation(), &tensor_strides); + if (!ext_type->permutation().empty()) { + internal::Permute(ext_type->permutation(), &tensor_strides); } tensor_strides.insert(tensor_strides.begin(), 1, major_stride); std::vector dim_names; - if (!this->dim_names().empty()) { - dim_names = this->dim_names(); + if (!ext_type->dim_names().empty()) { + dim_names = ext_type->dim_names(); dim_names.insert(dim_names.begin(), 1, ""); } else { dim_names = {}; } - std::shared_ptr buffer = ext_arr->values()->data()->buffers[1]; - ARROW_ASSIGN_OR_RAISE(auto tensor, Tensor::Make(ext_arr->value_type(), buffer, shape, - tensor_strides, dim_names)); + ARROW_ASSIGN_OR_RAISE(auto buffers, ext_arr->Flatten()); + ARROW_ASSIGN_OR_RAISE( + auto tensor, Tensor::Make(ext_arr->value_type(), buffers->data()->buffers[1], shape, + tensor_strides, dim_names)); return tensor; } diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index d6260475dcd..6179a282bf8 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -32,13 +32,22 @@ class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { /// \brief Create a FixedShapeTensorArray from a Tensor /// - /// This function will create a FixedShapeTensorArray from a Tensor, taking its + /// This method will create a FixedShapeTensorArray from a Tensor, taking its /// first dimension as the "element dimension" and the remaining dimensions as the /// "tensor dimensions". If Tensor provides strides, they will be used to determine - /// dimension permutation. Otherwise, row-major layout (i.e. no permutation) will be assumed. + /// dimension permutation. Otherwise, row-major layout (i.e. no permutation) will be + /// assumed. /// /// \param[in] tensor The Tensor to convert to a FixedShapeTensorArray - static Result> FromTensor(const std::shared_ptr& tensor); + static Result> FromTensor( + const std::shared_ptr& tensor); + + /// \brief Create a Tensor from FixedShapeTensorArray + /// + /// This method will create a Tensor from a FixedShapeTensorArray, setting its + /// first dimension as length equal to the FixedShapeTensorArray's length and the + /// remaining dimensions as the FixedShapeTensorType's element shape. + const Result> ToTensor() const; }; /// \brief Concrete type class for constant-size Tensor data. @@ -84,15 +93,6 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { /// Create a FixedShapeTensorArray from ArrayData std::shared_ptr MakeArray(std::shared_ptr data) const override; - /// \brief Create a Tensor from FixedShapeTensorArray - /// - /// This function will create a Tensor from a FixedShapeTensorArray, setting its - /// first dimension as length equal to the FixedShapeTensorArray's length and the - /// remaining dimensions as the FixedShapeTensorType's element shape. - /// - /// \param[in] arr The FixedShapeTensorArray to convert to a Tensor - Result> ToTensor(std::shared_ptr arr); - /// \brief Create a FixedShapeTensorType instance static Result> Make( const std::shared_ptr& value_type, const std::vector& shape, diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 15ac0e74f9c..002893aa020 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -195,30 +195,22 @@ TEST_F(TestExtensionType, CreateFromTensor) { FixedShapeTensorArray::FromTensor(neither_major_tensor)); auto ext_type_5 = internal::checked_pointer_cast( - fixed_shape_tensor(binary(), {1, 2})); + fixed_shape_tensor(binary(), {1, 3})); auto arr = ArrayFromJSON(binary(), R"(["abc", "def"])"); ASSERT_OK_AND_ASSIGN(auto fsla_arr, - FixedSizeListArray::FromArrays(arr, fixed_size_list(binary(), 1))); - auto data = fsla_arr->data(); - data->type = ext_type_5; - auto ext_arr_5 = ext_type_5->MakeArray(data); + FixedSizeListArray::FromArrays(arr, fixed_size_list(binary(), 2))); + auto ext_arr_5 = std::reinterpret_pointer_cast( + ExtensionType::WrapArray(ext_type_5, fsla_arr)); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr("binary is not valid data type for a tensor"), - exact_ext_type->ToTensor(ext_arr_5)); + ext_arr_5->ToTensor()); auto ext_type_6 = internal::checked_pointer_cast( fixed_shape_tensor(int64(), {1, 2})); - auto arr_with_null = ArrayFromJSON(int64(), "[0, null]"); + auto arr_with_null = ArrayFromJSON(int64(), "[1, 0, null, null, 1, 2]"); ASSERT_OK_AND_ASSIGN(auto fsla_arr_6, FixedSizeListArray::FromArrays( - arr_with_null, fixed_size_list(int64(), 1))); - auto data6 = fsla_arr_6->data(); - data6->type = ext_type_6; - data6->null_count = 1; - - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("Null values not supported in tensors."), - ext_type_6->ToTensor(ext_type_6->MakeArray(data6))); + arr_with_null, fixed_size_list(int64(), 2))); } void CheckTensorRoundtrip(const std::shared_ptr& tensor, @@ -226,7 +218,7 @@ void CheckTensorRoundtrip(const std::shared_ptr& tensor, auto ext_type = internal::checked_pointer_cast(expected_ext_type); ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); auto generated_ext_type = - internal::checked_pointer_cast(ext_arr->type()); + internal::checked_cast(ext_arr->extension_type()); // Check that generated type is equal to the expected type ASSERT_EQ(generated_ext_type->type_name(), ext_type->type_name()); @@ -237,7 +229,7 @@ void CheckTensorRoundtrip(const std::shared_ptr& tensor, ASSERT_TRUE(generated_ext_type->Equals(ext_type)); // Check Tensor roundtrip - ASSERT_OK_AND_ASSIGN(auto tensor_from_array, generated_ext_type->ToTensor(ext_arr)); + ASSERT_OK_AND_ASSIGN(auto tensor_from_array, ext_arr->ToTensor()); ASSERT_EQ(tensor->type(), tensor_from_array->type()); ASSERT_EQ(tensor->shape(), tensor_from_array->shape()); for (size_t i = 1; i < tensor->dim_names().size(); i++) { From 4cbae67a099ec585c1bde977746a87170592594a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 28 Mar 2023 22:56:58 +0200 Subject: [PATCH 22/39] Remove ARROW_WITH_JSON flag --- cpp/src/arrow/CMakeLists.txt | 1 - cpp/src/arrow/extension_type.cc | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index fb0f7f1b7a0..9df85ce303d 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -549,7 +549,6 @@ if(ARROW_IPC) endif() if(ARROW_JSON) - add_definitions(-DARROW_WITH_JSON) list(APPEND ARROW_SRCS extension/fixed_shape_tensor.cc diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 542de7a09ee..91543dfb19e 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -26,7 +26,7 @@ #include "arrow/array/util.h" #include "arrow/chunked_array.h" -#ifdef ARROW_WITH_JSON +#ifdef ARROW_JSON #include "arrow/extension/fixed_shape_tensor.h" #endif #include "arrow/status.h" @@ -143,7 +143,7 @@ namespace internal { static void CreateGlobalRegistry() { g_registry = std::make_shared(); -#ifdef ARROW_WITH_JSON +#ifdef ARROW_JSON // Register canonical extension types auto ext_type = checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); From 40c6e2025b1e3efc4831251dcf700bbcdaa19243 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 28 Mar 2023 23:54:29 +0200 Subject: [PATCH 23/39] GetStorageType to anonymous namespace --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 7 ------- cpp/src/arrow/extension/fixed_shape_tensor.h | 10 ++++++++-- cpp/src/arrow/extension/fixed_shape_tensor_test.cc | 2 ++ 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index b54657e015c..dae13f10a36 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -307,12 +307,5 @@ std::shared_ptr fixed_shape_tensor(const std::shared_ptr& va return maybe_type.MoveValueUnsafe(); } -const std::shared_ptr GetStorageType( - const std::shared_ptr& value_type, const std::vector& shape) { - const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), - std::multiplies<>()); - return fixed_size_list(value_type, static_cast(size)); -} - } // namespace extension } // namespace arrow diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 6179a282bf8..052fa25b7b2 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -23,8 +23,14 @@ namespace arrow { namespace extension { +namespace { const std::shared_ptr GetStorageType( - const std::shared_ptr& value_type, const std::vector& shape); + const std::shared_ptr& value_type, const std::vector& shape) { + const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), + std::multiplies<>()); + return fixed_size_list(value_type, static_cast(size)); +} +} // namespace class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { public: @@ -99,10 +105,10 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { const std::vector& permutation = {}, const std::vector& dim_names = {}); + private: /// \brief Compute strides of FixedShapeTensorType static Result> ComputeStrides(const FixedShapeTensorType& type); - private: std::shared_ptr storage_type_; std::shared_ptr value_type_; std::vector shape_; diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 002893aa020..18ab551ce0b 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +#ifdef ARROW_JSON #include "arrow/extension/fixed_shape_tensor.h" #include "arrow/testing/matchers.h" @@ -417,3 +418,4 @@ TEST_F(TestExtensionType, ComputeStrides) { } } // namespace arrow +#endif From ff19cc063c5d587f3f1e440942a7c78f542ef55c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 29 Mar 2023 00:06:41 +0200 Subject: [PATCH 24/39] Review feedback --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 2 +- cpp/src/arrow/extension/fixed_shape_tensor.h | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index dae13f10a36..4b89d6c6601 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -210,7 +210,7 @@ Result> FixedShapeTensorArray::FromTensor tensor->type()->ToString()); } } - auto cell_size = tensor->size() / tensor->shape()[0]; + auto cell_size = static_cast(tensor->size() / tensor->shape()[0]); ARROW_ASSIGN_OR_RAISE(std::shared_ptr arr, FixedSizeListArray::FromArrays(value_array, cell_size)); std::shared_ptr ext_arr = ExtensionType::WrapArray(ext_type, arr); diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 052fa25b7b2..186b0b4a7c4 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -38,11 +38,11 @@ class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { /// \brief Create a FixedShapeTensorArray from a Tensor /// - /// This method will create a FixedShapeTensorArray from a Tensor, taking its - /// first dimension as the "element dimension" and the remaining dimensions as the - /// "tensor dimensions". If Tensor provides strides, they will be used to determine - /// dimension permutation. Otherwise, row-major layout (i.e. no permutation) will be - /// assumed. + /// This method will create a FixedShapeTensorArray from a Tensor, taking its first + /// dimension as the number of elements in the resulting array and the remaining + /// dimensions as the shape of the individual tensors. If Tensor provides strides, + /// they will be used to determine dimension permutation. Otherwise, row-major layout + /// (i.e. no permutation) will be assumed. /// /// \param[in] tensor The Tensor to convert to a FixedShapeTensorArray static Result> FromTensor( @@ -52,7 +52,7 @@ class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { /// /// This method will create a Tensor from a FixedShapeTensorArray, setting its /// first dimension as length equal to the FixedShapeTensorArray's length and the - /// remaining dimensions as the FixedShapeTensorType's element shape. + /// remaining dimensions as the FixedShapeTensorType's shape. const Result> ToTensor() const; }; From fa50377d17f7b42f28d3e8e7bb55369627be996e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 29 Mar 2023 00:18:45 +0200 Subject: [PATCH 25/39] Removing GetStorageType --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 6 ++++-- cpp/src/arrow/extension/fixed_shape_tensor.h | 13 ++----------- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 4b89d6c6601..32c5d74a2e7 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -268,8 +268,10 @@ Result> FixedShapeTensorType::Make( return Status::Invalid("dim_names size must match shape size. Expected: ", shape.size(), " Got: ", dim_names.size()); } - return std::make_shared(value_type, shape, permutation, - dim_names); + const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), + std::multiplies<>()); + return std::make_shared(value_type, static_cast(size), + shape, permutation, dim_names); } Result> FixedShapeTensorType::ComputeStrides( diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 186b0b4a7c4..32c3dcdf4ba 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -23,15 +23,6 @@ namespace arrow { namespace extension { -namespace { -const std::shared_ptr GetStorageType( - const std::shared_ptr& value_type, const std::vector& shape) { - const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), - std::multiplies<>()); - return fixed_size_list(value_type, static_cast(size)); -} -} // namespace - class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; @@ -59,11 +50,11 @@ class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { /// \brief Concrete type class for constant-size Tensor data. class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { public: - FixedShapeTensorType(const std::shared_ptr& value_type, + FixedShapeTensorType(const std::shared_ptr& value_type, const int32_t& size, const std::vector& shape, const std::vector& permutation = {}, const std::vector& dim_names = {}) - : ExtensionType(GetStorageType(value_type, shape)), + : ExtensionType(fixed_size_list(value_type, size)), value_type_(value_type), shape_(shape), permutation_(permutation), From ddf0219dc82151b796148109f47fb39277702cc3 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 30 Mar 2023 03:49:21 +0200 Subject: [PATCH 26/39] Refactor ComputeStrides --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 97 ++++++++++++------- cpp/src/arrow/extension/fixed_shape_tensor.h | 9 +- .../extension/fixed_shape_tensor_test.cc | 77 +++++++-------- 3 files changed, 109 insertions(+), 74 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 32c5d74a2e7..ff2d61a232b 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -145,15 +145,14 @@ Result> FixedShapeTensorArray::FromTensor dim_names.emplace_back(tensor->dim_names()[i]); } - auto permutation = internal::ArgSort(tensor->strides()); - std::reverse(permutation.begin(), permutation.end()); + auto permutation = internal::ArgSort(tensor->strides(), std::greater<>()); if (permutation[0] != 0) { return Status::Invalid( "Only first-major tensors can be zero-copy converted to arrays"); } permutation.erase(permutation.begin()); - for (auto& x : permutation) { - x--; + for (size_t i = 0; i < permutation.size(); ++i) { + --permutation[i]; } auto ext_type = internal::checked_pointer_cast( @@ -217,11 +216,49 @@ Result> FixedShapeTensorArray::FromTensor return std::reinterpret_pointer_cast(ext_arr); } +Status FixedShapeTensorType::ComputeStrides(const FixedWidthType& type, + const std::vector& shape, + const std::vector& permutation, + std::vector* strides) { + if (permutation.empty()) { + return internal::ComputeRowMajorStrides(type, shape, strides); + } + + const int byte_width = type.byte_width(); + + int64_t remaining = 0; + if (!shape.empty() && shape.front() > 0) { + remaining = byte_width; + for (auto i : permutation) { + if (i > 0) { + if (internal::MultiplyWithOverflow(remaining, shape[i], &remaining)) { + return Status::Invalid( + "Strides computed from shape would not fit in 64-bit integer"); + } + } + } + } + + if (remaining == 0) { + strides->assign(shape.size(), byte_width); + return Status::OK(); + } + + strides->push_back(remaining); + for (auto i : permutation) { + if (i > 0) { + remaining /= shape[i]; + strides->push_back(remaining); + } + } + internal::Permute(permutation, strides); + + return Status::OK(); +} + const Result> FixedShapeTensorArray::ToTensor() const { // To convert an array of n dimensional tensors to a n+1 dimensional tensor we - // interpret the array's length as the first dimension the new tensor. Further, we - // define n+1 dimensional tensor's strides by front appending a new stride to the n - // dimensional tensor's strides. + // interpret the array's length as the first dimension the new tensor. auto ext_arr = internal::checked_pointer_cast(this->storage()); auto ext_type = internal::checked_pointer_cast(this->type()); @@ -231,16 +268,15 @@ const Result> FixedShapeTensorArray::ToTensor() const { std::vector shape = ext_type->shape(); shape.insert(shape.begin(), 1, this->length()); - int64_t major_stride; - std::vector tensor_strides = ext_type->strides(); - if (internal::MultiplyWithOverflow(ext_arr->value_type()->byte_width(), - ext_arr->list_type()->list_size(), &major_stride)) { - return Status::Invalid("Overflow in tensor strides"); + std::vector tensor_strides; + auto value_type = internal::checked_pointer_cast(ext_arr->value_type()); + auto permutation = ext_type->permutation(); + for (size_t i = 0; i < permutation.size(); ++i) { + ++permutation[i]; } - if (!ext_type->permutation().empty()) { - internal::Permute(ext_type->permutation(), &tensor_strides); - } - tensor_strides.insert(tensor_strides.begin(), 1, major_stride); + permutation.insert(permutation.begin(), 1, 0); + ARROW_RETURN_NOT_OK(FixedShapeTensorType::ComputeStrides(*value_type.get(), shape, + permutation, &tensor_strides)); std::vector dim_names; if (!ext_type->dim_names().empty()) { @@ -274,28 +310,23 @@ Result> FixedShapeTensorType::Make( shape, permutation, dim_names); } -Result> FixedShapeTensorType::ComputeStrides( - const FixedShapeTensorType& type) { - std::vector strides; - auto element_type = - internal::checked_pointer_cast(type.storage_type()); - auto value_type = - internal::checked_pointer_cast(element_type->value_type()); - - auto shape = type.shape(); - if (!type.permutation().empty()) { - internal::Permute(type.permutation(), &shape); +const std::vector FixedShapeTensorType::shape() const { + std::vector shape = shape_; + if (!permutation_.empty()) { + std::vector reverse_permutation = permutation_; + internal::Permute(permutation_, &reverse_permutation); + internal::Permute(reverse_permutation, &shape); } - ARROW_RETURN_NOT_OK( - internal::ComputeRowMajorStrides(*value_type.get(), shape, &strides)); - return strides; + return shape; } const std::vector& FixedShapeTensorType::strides() { if (strides_.empty()) { - auto maybe_strides = ComputeStrides(*this); - ARROW_CHECK_OK(maybe_strides.status()); - strides_ = maybe_strides.MoveValueUnsafe(); + auto value_type = internal::checked_pointer_cast(this->value_type_); + std::vector tensor_strides; + ARROW_CHECK_OK(ComputeStrides(*value_type.get(), this->shape(), this->permutation(), + &tensor_strides)); + strides_ = tensor_strides; } return strides_; } diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 32c3dcdf4ba..e55df900121 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -66,7 +66,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { size_t ndim() { return shape_.size(); } /// Shape of tensor elements - const std::vector& shape() const { return shape_; } + const std::vector shape() const; /// Strides of tensor elements. Strides state offset in bytes between adjacent /// elements along each dimension. In case permutation is non-empty strides are @@ -96,10 +96,13 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { const std::vector& permutation = {}, const std::vector& dim_names = {}); - private: /// \brief Compute strides of FixedShapeTensorType - static Result> ComputeStrides(const FixedShapeTensorType& type); + static Status ComputeStrides(const FixedWidthType& type, + const std::vector& shape, + const std::vector& permutation, + std::vector* strides); + private: std::shared_ptr storage_type_; std::shared_ptr value_type_; std::vector shape_; diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 18ab551ce0b..2e9ce4a308f 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -214,8 +214,8 @@ TEST_F(TestExtensionType, CreateFromTensor) { arr_with_null, fixed_size_list(int64(), 2))); } -void CheckTensorRoundtrip(const std::shared_ptr& tensor, - std::shared_ptr expected_ext_type) { +void CheckFromTensorType(const std::shared_ptr& tensor, + std::shared_ptr expected_ext_type) { auto ext_type = internal::checked_pointer_cast(expected_ext_type); ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); auto generated_ext_type = @@ -228,9 +228,30 @@ void CheckTensorRoundtrip(const std::shared_ptr& tensor, ASSERT_EQ(generated_ext_type->permutation(), ext_type->permutation()); ASSERT_TRUE(generated_ext_type->storage_type()->Equals(*ext_type->storage_type())); ASSERT_TRUE(generated_ext_type->Equals(ext_type)); +} + +TEST_F(TestExtensionType, TestFromTensorType) { + auto values = Buffer::Wrap(values_); + auto shapes = + std::vector>{{3, 3, 4}, {3, 3, 4}, {3, 4, 3}, {3, 4, 3}}; + auto strides = std::vector>{ + {96, 32, 8}, {96, 8, 24}, {96, 24, 8}, {96, 8, 32}}; + auto cell_shapes = std::vector>{{3, 4}, {3, 4}, {4, 3}, {4, 3}}; + auto permutations = std::vector>{{0, 1}, {1, 0}, {0, 1}, {1, 0}}; + + for (size_t i = 0; i < shapes.size(); i++) { + ASSERT_OK_AND_ASSIGN(auto tensor, + Tensor::Make(value_type_, values, shapes[i], strides[i])); + ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); + auto ext_type = fixed_shape_tensor(value_type_, cell_shapes[i], permutations[i]); + CheckFromTensorType(tensor, ext_type); + } +} - // Check Tensor roundtrip +void CheckTensorRoundtrip(const std::shared_ptr& tensor) { + ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); ASSERT_OK_AND_ASSIGN(auto tensor_from_array, ext_arr->ToTensor()); + ASSERT_EQ(tensor->type(), tensor_from_array->type()); ASSERT_EQ(tensor->shape(), tensor_from_array->shape()); for (size_t i = 1; i < tensor->dim_names().size(); i++) { @@ -243,38 +264,18 @@ void CheckTensorRoundtrip(const std::shared_ptr& tensor, TEST_F(TestExtensionType, RoundtripTensor) { auto values = Buffer::Wrap(values_); - ASSERT_OK_AND_ASSIGN(auto tensor1, Tensor::Make(value_type_, values, {3, 3, 4}, - {96, 32, 8}, {"", "y", "z"})); - ASSERT_OK_AND_ASSIGN(auto tensor2, - Tensor::Make(value_type_, values, {3, 3, 4}, {96, 8, 24})); - ASSERT_OK_AND_ASSIGN(auto tensor3, - Tensor::Make(value_type_, values, {3, 4, 3}, {96, 24, 8})); - ASSERT_OK_AND_ASSIGN(auto tensor4, - Tensor::Make(value_type_, values, {3, 4, 3}, {96, 8, 32})); - ASSERT_OK_AND_ASSIGN(auto tensor5, - Tensor::Make(value_type_, values, {6, 2, 3}, {48, 24, 8})); - ASSERT_OK_AND_ASSIGN(auto tensor6, - Tensor::Make(value_type_, values, {6, 2, 3}, {48, 8, 16})); - ASSERT_OK_AND_ASSIGN(auto tensor7, - Tensor::Make(value_type_, values, {2, 3, 6}, {144, 48, 8})); - ASSERT_OK_AND_ASSIGN(auto tensor8, - Tensor::Make(value_type_, values, {2, 3, 6}, {144, 8, 24})); - ASSERT_OK_AND_ASSIGN(auto tensor9, - Tensor::Make(value_type_, values, {2, 3, 2, 3}, {144, 48, 24, 8})); - ASSERT_OK_AND_ASSIGN(auto tensor10, - Tensor::Make(value_type_, values, {2, 3, 2, 3}, {144, 8, 24, 48})); - - CheckTensorRoundtrip(tensor1, - fixed_shape_tensor(value_type_, {3, 4}, {0, 1}, {"y", "z"})); - CheckTensorRoundtrip(tensor2, fixed_shape_tensor(value_type_, {3, 4}, {1, 0}, {})); - CheckTensorRoundtrip(tensor3, fixed_shape_tensor(value_type_, {4, 3}, {0, 1})); - CheckTensorRoundtrip(tensor4, fixed_shape_tensor(value_type_, {4, 3}, {1, 0})); - CheckTensorRoundtrip(tensor5, fixed_shape_tensor(value_type_, {2, 3}, {0, 1})); - CheckTensorRoundtrip(tensor6, fixed_shape_tensor(value_type_, {2, 3}, {1, 0})); - CheckTensorRoundtrip(tensor7, fixed_shape_tensor(value_type_, {3, 6}, {0, 1})); - CheckTensorRoundtrip(tensor8, fixed_shape_tensor(value_type_, {3, 6}, {1, 0})); - CheckTensorRoundtrip(tensor9, fixed_shape_tensor(value_type_, {3, 2, 3}, {0, 1, 2})); - CheckTensorRoundtrip(tensor10, fixed_shape_tensor(value_type_, {3, 2, 3}, {2, 1, 0})); + + auto shapes = std::vector>{ + {3, 3, 4}, {3, 3, 4}, {3, 4, 3}, {3, 4, 3}, {6, 2, 3}, + {6, 2, 3}, {2, 3, 6}, {2, 3, 6}, {2, 3, 2, 3}, {2, 3, 2, 3}}; + auto strides = std::vector>{ + {96, 32, 8}, {96, 8, 24}, {96, 24, 8}, {96, 8, 32}, {48, 24, 8}, + {48, 8, 16}, {144, 48, 8}, {144, 8, 24}, {144, 48, 24, 8}, {144, 8, 24, 48}}; + for (size_t i = 0; i < shapes.size(); i++) { + ASSERT_OK_AND_ASSIGN(auto tensor, + Tensor::Make(value_type_, values, shapes[i], strides[i])); + CheckTensorRoundtrip(tensor); + } } TEST_F(TestExtensionType, SliceTensor) { @@ -403,17 +404,17 @@ TEST_F(TestExtensionType, ComputeStrides) { auto ext_type_5 = internal::checked_pointer_cast( fixed_shape_tensor(int64(), {3, 4, 7}, {1, 0, 2})); - ASSERT_EQ(ext_type_5->strides(), (std::vector{168, 56, 8})); + ASSERT_EQ(ext_type_5->strides(), (std::vector{56, 224, 8})); ASSERT_EQ(ext_type_5->Serialize(), R"({"shape":[3,4,7],"permutation":[1,0,2]})"); auto ext_type_6 = internal::checked_pointer_cast( fixed_shape_tensor(int64(), {3, 4, 7}, {1, 2, 0}, {})); - ASSERT_EQ(ext_type_6->strides(), (std::vector{168, 24, 8})); + ASSERT_EQ(ext_type_6->strides(), (std::vector{32, 8, 96})); ASSERT_EQ(ext_type_6->Serialize(), R"({"shape":[3,4,7],"permutation":[1,2,0]})"); auto ext_type_7 = internal::checked_pointer_cast( fixed_shape_tensor(int32(), {3, 4, 7}, {2, 0, 1}, {})); - ASSERT_EQ(ext_type_7->strides(), (std::vector{48, 16, 4})); + ASSERT_EQ(ext_type_7->strides(), (std::vector{4, 84, 28})); ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); } From bba80ff0c200efdfaefcf3cb5aae51f4704db068 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 30 Mar 2023 13:18:18 +0200 Subject: [PATCH 27/39] CMake change --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/extension/CMakeLists.txt | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 cpp/src/arrow/extension/CMakeLists.txt diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 9df85ce303d..60eb1115f34 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -888,6 +888,7 @@ endif() if(ARROW_JSON) add_subdirectory(json) + add_subdirectory(extension) endif() if(ARROW_ORC) diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt new file mode 100644 index 00000000000..f123adafc56 --- /dev/null +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +add_arrow_test(test + SOURCES + fixed_shape_tensor_test.cc + PREFIX + "arrow-fixed-shape-tensor") + +arrow_install_all_headers("arrow/extension") \ No newline at end of file From 905910ca9f144cccab226a16e84bc6a188b69067 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 30 Mar 2023 13:29:20 +0200 Subject: [PATCH 28/39] Removing FromTensor/ToTensor --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 180 ---------------- cpp/src/arrow/extension/fixed_shape_tensor.h | 33 +-- .../extension/fixed_shape_tensor_test.cc | 202 ------------------ 3 files changed, 1 insertion(+), 414 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index ff2d61a232b..b5036f27cd8 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -20,7 +20,6 @@ #include "arrow/array/array_nested.h" #include "arrow/array/array_primitive.h" #include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep -#include "arrow/tensor.h" #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" #include "arrow/util/sort.h" @@ -135,164 +134,6 @@ std::shared_ptr FixedShapeTensorType::MakeArray( return std::make_shared(data); } -Result> FixedShapeTensorArray::FromTensor( - const std::shared_ptr& tensor) { - auto cell_shape = tensor->shape(); - cell_shape.erase(cell_shape.begin()); - - std::vector dim_names; - for (size_t i = 1; i < tensor->dim_names().size(); ++i) { - dim_names.emplace_back(tensor->dim_names()[i]); - } - - auto permutation = internal::ArgSort(tensor->strides(), std::greater<>()); - if (permutation[0] != 0) { - return Status::Invalid( - "Only first-major tensors can be zero-copy converted to arrays"); - } - permutation.erase(permutation.begin()); - for (size_t i = 0; i < permutation.size(); ++i) { - --permutation[i]; - } - - auto ext_type = internal::checked_pointer_cast( - fixed_shape_tensor(tensor->type(), cell_shape, permutation, dim_names)); - - std::shared_ptr value_array; - switch (tensor->type_id()) { - case Type::UINT8: { - value_array = std::make_shared(tensor->size(), tensor->data()); - break; - } - case Type::INT8: { - value_array = std::make_shared(tensor->size(), tensor->data()); - break; - } - case Type::UINT16: { - value_array = std::make_shared(tensor->size(), tensor->data()); - break; - } - case Type::INT16: { - value_array = std::make_shared(tensor->size(), tensor->data()); - break; - } - case Type::UINT32: { - value_array = std::make_shared(tensor->size(), tensor->data()); - break; - } - case Type::INT32: { - value_array = std::make_shared(tensor->size(), tensor->data()); - break; - } - case Type::UINT64: { - value_array = std::make_shared(tensor->size(), tensor->data()); - break; - } - case Type::INT64: { - value_array = std::make_shared(tensor->size(), tensor->data()); - break; - } - case Type::HALF_FLOAT: { - value_array = std::make_shared(tensor->size(), tensor->data()); - break; - } - case Type::FLOAT: { - value_array = std::make_shared(tensor->size(), tensor->data()); - break; - } - case Type::DOUBLE: { - value_array = std::make_shared(tensor->size(), tensor->data()); - break; - } - default: { - return Status::NotImplemented("Unsupported tensor type: ", - tensor->type()->ToString()); - } - } - auto cell_size = static_cast(tensor->size() / tensor->shape()[0]); - ARROW_ASSIGN_OR_RAISE(std::shared_ptr arr, - FixedSizeListArray::FromArrays(value_array, cell_size)); - std::shared_ptr ext_arr = ExtensionType::WrapArray(ext_type, arr); - return std::reinterpret_pointer_cast(ext_arr); -} - -Status FixedShapeTensorType::ComputeStrides(const FixedWidthType& type, - const std::vector& shape, - const std::vector& permutation, - std::vector* strides) { - if (permutation.empty()) { - return internal::ComputeRowMajorStrides(type, shape, strides); - } - - const int byte_width = type.byte_width(); - - int64_t remaining = 0; - if (!shape.empty() && shape.front() > 0) { - remaining = byte_width; - for (auto i : permutation) { - if (i > 0) { - if (internal::MultiplyWithOverflow(remaining, shape[i], &remaining)) { - return Status::Invalid( - "Strides computed from shape would not fit in 64-bit integer"); - } - } - } - } - - if (remaining == 0) { - strides->assign(shape.size(), byte_width); - return Status::OK(); - } - - strides->push_back(remaining); - for (auto i : permutation) { - if (i > 0) { - remaining /= shape[i]; - strides->push_back(remaining); - } - } - internal::Permute(permutation, strides); - - return Status::OK(); -} - -const Result> FixedShapeTensorArray::ToTensor() const { - // To convert an array of n dimensional tensors to a n+1 dimensional tensor we - // interpret the array's length as the first dimension the new tensor. - - auto ext_arr = internal::checked_pointer_cast(this->storage()); - auto ext_type = internal::checked_pointer_cast(this->type()); - ARROW_RETURN_IF(!is_fixed_width(*ext_arr->value_type()), - Status::Invalid(ext_arr->value_type()->ToString(), - " is not valid data type for a tensor")); - std::vector shape = ext_type->shape(); - shape.insert(shape.begin(), 1, this->length()); - - std::vector tensor_strides; - auto value_type = internal::checked_pointer_cast(ext_arr->value_type()); - auto permutation = ext_type->permutation(); - for (size_t i = 0; i < permutation.size(); ++i) { - ++permutation[i]; - } - permutation.insert(permutation.begin(), 1, 0); - ARROW_RETURN_NOT_OK(FixedShapeTensorType::ComputeStrides(*value_type.get(), shape, - permutation, &tensor_strides)); - - std::vector dim_names; - if (!ext_type->dim_names().empty()) { - dim_names = ext_type->dim_names(); - dim_names.insert(dim_names.begin(), 1, ""); - } else { - dim_names = {}; - } - - ARROW_ASSIGN_OR_RAISE(auto buffers, ext_arr->Flatten()); - ARROW_ASSIGN_OR_RAISE( - auto tensor, Tensor::Make(ext_arr->value_type(), buffers->data()->buffers[1], shape, - tensor_strides, dim_names)); - return tensor; -} - Result> FixedShapeTensorType::Make( const std::shared_ptr& value_type, const std::vector& shape, const std::vector& permutation, const std::vector& dim_names) { @@ -310,27 +151,6 @@ Result> FixedShapeTensorType::Make( shape, permutation, dim_names); } -const std::vector FixedShapeTensorType::shape() const { - std::vector shape = shape_; - if (!permutation_.empty()) { - std::vector reverse_permutation = permutation_; - internal::Permute(permutation_, &reverse_permutation); - internal::Permute(reverse_permutation, &shape); - } - return shape; -} - -const std::vector& FixedShapeTensorType::strides() { - if (strides_.empty()) { - auto value_type = internal::checked_pointer_cast(this->value_type_); - std::vector tensor_strides; - ARROW_CHECK_OK(ComputeStrides(*value_type.get(), this->shape(), this->permutation(), - &tensor_strides)); - strides_ = tensor_strides; - } - return strides_; -} - std::shared_ptr fixed_shape_tensor(const std::shared_ptr& value_type, const std::vector& shape, const std::vector& permutation, diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index e55df900121..7a4141d6f1e 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -26,25 +26,6 @@ namespace extension { class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; - - /// \brief Create a FixedShapeTensorArray from a Tensor - /// - /// This method will create a FixedShapeTensorArray from a Tensor, taking its first - /// dimension as the number of elements in the resulting array and the remaining - /// dimensions as the shape of the individual tensors. If Tensor provides strides, - /// they will be used to determine dimension permutation. Otherwise, row-major layout - /// (i.e. no permutation) will be assumed. - /// - /// \param[in] tensor The Tensor to convert to a FixedShapeTensorArray - static Result> FromTensor( - const std::shared_ptr& tensor); - - /// \brief Create a Tensor from FixedShapeTensorArray - /// - /// This method will create a Tensor from a FixedShapeTensorArray, setting its - /// first dimension as length equal to the FixedShapeTensorArray's length and the - /// remaining dimensions as the FixedShapeTensorType's shape. - const Result> ToTensor() const; }; /// \brief Concrete type class for constant-size Tensor data. @@ -66,12 +47,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { size_t ndim() { return shape_.size(); } /// Shape of tensor elements - const std::vector shape() const; - - /// Strides of tensor elements. Strides state offset in bytes between adjacent - /// elements along each dimension. In case permutation is non-empty strides are - /// computed from permuted tensor element's shape. - const std::vector& strides(); + const std::vector shape() const { return shape_; } /// Permutation mapping from logical to physical memory layout of tensor elements const std::vector& permutation() const { return permutation_; } @@ -96,17 +72,10 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { const std::vector& permutation = {}, const std::vector& dim_names = {}); - /// \brief Compute strides of FixedShapeTensorType - static Status ComputeStrides(const FixedWidthType& type, - const std::vector& shape, - const std::vector& permutation, - std::vector* strides); - private: std::shared_ptr storage_type_; std::shared_ptr value_type_; std::vector shape_; - std::vector strides_; std::vector permutation_; std::vector dim_names_; }; diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 2e9ce4a308f..b6ec8dded7f 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -109,7 +109,6 @@ TEST_F(TestExtensionType, CreateExtensionType) { ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); ASSERT_EQ(exact_ext_type->ndim(), cell_shape_.size()); ASSERT_EQ(exact_ext_type->shape(), cell_shape_); - ASSERT_EQ(exact_ext_type->strides(), cell_strides_); ASSERT_EQ(exact_ext_type->dim_names(), dim_names_); EXPECT_RAISES_WITH_MESSAGE_THAT( @@ -156,155 +155,6 @@ TEST_F(TestExtensionType, CreateFromArray) { ASSERT_EQ(ext_arr->null_count(), 0); } -TEST_F(TestExtensionType, CreateFromTensor) { - std::vector column_major_strides = {8, 24, 72}; - std::vector neither_major_strides = {96, 8, 32}; - - ASSERT_OK_AND_ASSIGN(auto tensor, - Tensor::Make(value_type_, Buffer::Wrap(values_), shape_)); - - auto exact_ext_type = internal::checked_pointer_cast(ext_type_); - ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); - - ASSERT_OK(ext_arr->ValidateFull()); - ASSERT_TRUE(tensor->is_row_major()); - ASSERT_EQ(tensor->strides(), tensor_strides_); - ASSERT_EQ(ext_arr->length(), shape_[0]); - - auto ext_type_2 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), {3, 4}, {0, 1})); - ASSERT_OK_AND_ASSIGN(auto ext_arr_2, FixedShapeTensorArray::FromTensor(tensor)); - - ASSERT_OK_AND_ASSIGN( - auto column_major_tensor, - Tensor::Make(value_type_, Buffer::Wrap(values_), shape_, column_major_strides)); - auto ext_type_3 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), {3, 4}, {0, 1})); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, - testing::HasSubstr( - "Invalid: Only first-major tensors can be zero-copy converted to arrays"), - FixedShapeTensorArray::FromTensor(column_major_tensor)); - ASSERT_THAT(FixedShapeTensorArray::FromTensor(column_major_tensor), - Raises(StatusCode::Invalid)); - - auto neither_major_tensor = std::make_shared(value_type_, Buffer::Wrap(values_), - shape_, neither_major_strides); - auto ext_type_4 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), {3, 4}, {1, 0})); - ASSERT_OK_AND_ASSIGN(auto ext_arr_4, - FixedShapeTensorArray::FromTensor(neither_major_tensor)); - - auto ext_type_5 = internal::checked_pointer_cast( - fixed_shape_tensor(binary(), {1, 3})); - auto arr = ArrayFromJSON(binary(), R"(["abc", "def"])"); - - ASSERT_OK_AND_ASSIGN(auto fsla_arr, - FixedSizeListArray::FromArrays(arr, fixed_size_list(binary(), 2))); - auto ext_arr_5 = std::reinterpret_pointer_cast( - ExtensionType::WrapArray(ext_type_5, fsla_arr)); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("binary is not valid data type for a tensor"), - ext_arr_5->ToTensor()); - - auto ext_type_6 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), {1, 2})); - auto arr_with_null = ArrayFromJSON(int64(), "[1, 0, null, null, 1, 2]"); - ASSERT_OK_AND_ASSIGN(auto fsla_arr_6, FixedSizeListArray::FromArrays( - arr_with_null, fixed_size_list(int64(), 2))); -} - -void CheckFromTensorType(const std::shared_ptr& tensor, - std::shared_ptr expected_ext_type) { - auto ext_type = internal::checked_pointer_cast(expected_ext_type); - ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); - auto generated_ext_type = - internal::checked_cast(ext_arr->extension_type()); - - // Check that generated type is equal to the expected type - ASSERT_EQ(generated_ext_type->type_name(), ext_type->type_name()); - ASSERT_EQ(generated_ext_type->shape(), ext_type->shape()); - ASSERT_EQ(generated_ext_type->dim_names(), ext_type->dim_names()); - ASSERT_EQ(generated_ext_type->permutation(), ext_type->permutation()); - ASSERT_TRUE(generated_ext_type->storage_type()->Equals(*ext_type->storage_type())); - ASSERT_TRUE(generated_ext_type->Equals(ext_type)); -} - -TEST_F(TestExtensionType, TestFromTensorType) { - auto values = Buffer::Wrap(values_); - auto shapes = - std::vector>{{3, 3, 4}, {3, 3, 4}, {3, 4, 3}, {3, 4, 3}}; - auto strides = std::vector>{ - {96, 32, 8}, {96, 8, 24}, {96, 24, 8}, {96, 8, 32}}; - auto cell_shapes = std::vector>{{3, 4}, {3, 4}, {4, 3}, {4, 3}}; - auto permutations = std::vector>{{0, 1}, {1, 0}, {0, 1}, {1, 0}}; - - for (size_t i = 0; i < shapes.size(); i++) { - ASSERT_OK_AND_ASSIGN(auto tensor, - Tensor::Make(value_type_, values, shapes[i], strides[i])); - ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); - auto ext_type = fixed_shape_tensor(value_type_, cell_shapes[i], permutations[i]); - CheckFromTensorType(tensor, ext_type); - } -} - -void CheckTensorRoundtrip(const std::shared_ptr& tensor) { - ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); - ASSERT_OK_AND_ASSIGN(auto tensor_from_array, ext_arr->ToTensor()); - - ASSERT_EQ(tensor->type(), tensor_from_array->type()); - ASSERT_EQ(tensor->shape(), tensor_from_array->shape()); - for (size_t i = 1; i < tensor->dim_names().size(); i++) { - ASSERT_EQ(tensor->dim_names()[i], tensor_from_array->dim_names()[i]); - } - ASSERT_EQ(tensor->strides(), tensor_from_array->strides()); - ASSERT_TRUE(tensor->data()->Equals(*tensor_from_array->data())); - ASSERT_TRUE(tensor->Equals(*tensor_from_array)); -} - -TEST_F(TestExtensionType, RoundtripTensor) { - auto values = Buffer::Wrap(values_); - - auto shapes = std::vector>{ - {3, 3, 4}, {3, 3, 4}, {3, 4, 3}, {3, 4, 3}, {6, 2, 3}, - {6, 2, 3}, {2, 3, 6}, {2, 3, 6}, {2, 3, 2, 3}, {2, 3, 2, 3}}; - auto strides = std::vector>{ - {96, 32, 8}, {96, 8, 24}, {96, 24, 8}, {96, 8, 32}, {48, 24, 8}, - {48, 8, 16}, {144, 48, 8}, {144, 8, 24}, {144, 48, 24, 8}, {144, 8, 24, 48}}; - for (size_t i = 0; i < shapes.size(); i++) { - ASSERT_OK_AND_ASSIGN(auto tensor, - Tensor::Make(value_type_, values, shapes[i], strides[i])); - CheckTensorRoundtrip(tensor); - } -} - -TEST_F(TestExtensionType, SliceTensor) { - ASSERT_OK_AND_ASSIGN(auto tensor, - Tensor::Make(value_type_, Buffer::Wrap(values_), shape_)); - ASSERT_OK_AND_ASSIGN( - auto tensor_partial, - Tensor::Make(value_type_, Buffer::Wrap(values_partial_), shape_partial_)); - ASSERT_EQ(tensor->strides(), tensor_strides_); - ASSERT_EQ(tensor_partial->strides(), tensor_strides_); - auto ext_type = fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_); - auto exact_ext_type = internal::checked_pointer_cast(ext_type_); - - ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); - ASSERT_OK_AND_ASSIGN(auto ext_arr_partial, - FixedShapeTensorArray::FromTensor(tensor_partial)); - ASSERT_OK(ext_arr->ValidateFull()); - ASSERT_OK(ext_arr_partial->ValidateFull()); - - auto sliced = internal::checked_pointer_cast(ext_arr->Slice(0, 2)); - auto partial = internal::checked_pointer_cast(ext_arr_partial); - - ASSERT_TRUE(sliced->Equals(*partial)); - ASSERT_OK(sliced->ValidateFull()); - ASSERT_OK(partial->ValidateFull()); - ASSERT_TRUE(sliced->storage()->Equals(*partial->storage())); - ASSERT_EQ(sliced->length(), partial->length()); -} - void CheckSerializationRoundtrip(const std::shared_ptr& ext_type) { auto fst_type = internal::checked_pointer_cast(ext_type); auto serialized = fst_type->Serialize(); @@ -366,57 +216,5 @@ TEST_F(TestExtensionType, RoudtripBatch) { CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); } -TEST_F(TestExtensionType, RoudtripBatchFromTensor) { - auto exact_ext_type = internal::checked_pointer_cast(ext_type_); - ASSERT_OK_AND_ASSIGN(auto tensor, Tensor::Make(value_type_, Buffer::Wrap(values_), - shape_, {}, {"n", "x", "y"})); - ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); - ext_arr->data()->type = exact_ext_type; - - auto ext_metadata = - key_value_metadata({{"ARROW:extension:name", ext_type_->extension_name()}, - {"ARROW:extension:metadata", serialized_}}); - auto ext_field = field("f0", ext_type_, true, ext_metadata); - auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); - std::shared_ptr read_batch; - RoundtripBatch(batch, &read_batch); - CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); -} - -TEST_F(TestExtensionType, ComputeStrides) { - auto exact_ext_type = internal::checked_pointer_cast(ext_type_); - - auto ext_type_1 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_)); - auto ext_type_2 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_)); - auto ext_type_3 = internal::checked_pointer_cast( - fixed_shape_tensor(int32(), cell_shape_, {}, dim_names_)); - ASSERT_TRUE(ext_type_1->Equals(*ext_type_2)); - ASSERT_FALSE(ext_type_1->Equals(*ext_type_3)); - - auto ext_type_4 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), {3, 4, 7}, {}, {"x", "y", "z"})); - ASSERT_EQ(ext_type_4->strides(), (std::vector{224, 56, 8})); - ext_type_4 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), {3, 4, 7}, {0, 1, 2}, {"x", "y", "z"})); - ASSERT_EQ(ext_type_4->strides(), (std::vector{224, 56, 8})); - - auto ext_type_5 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), {3, 4, 7}, {1, 0, 2})); - ASSERT_EQ(ext_type_5->strides(), (std::vector{56, 224, 8})); - ASSERT_EQ(ext_type_5->Serialize(), R"({"shape":[3,4,7],"permutation":[1,0,2]})"); - - auto ext_type_6 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), {3, 4, 7}, {1, 2, 0}, {})); - ASSERT_EQ(ext_type_6->strides(), (std::vector{32, 8, 96})); - ASSERT_EQ(ext_type_6->Serialize(), R"({"shape":[3,4,7],"permutation":[1,2,0]})"); - - auto ext_type_7 = internal::checked_pointer_cast( - fixed_shape_tensor(int32(), {3, 4, 7}, {2, 0, 1}, {})); - ASSERT_EQ(ext_type_7->strides(), (std::vector{4, 84, 28})); - ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); -} - } // namespace arrow #endif From 40a84816692ed6e9615c355e5b46ac168d0f0fa0 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 30 Mar 2023 13:49:12 +0200 Subject: [PATCH 29/39] linting --- cpp/src/arrow/extension/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index f123adafc56..c15c42874d4 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -21,4 +21,4 @@ add_arrow_test(test PREFIX "arrow-fixed-shape-tensor") -arrow_install_all_headers("arrow/extension") \ No newline at end of file +arrow_install_all_headers("arrow/extension") From b07532ba6cf5a1f4c1905407eca386c78be1026a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 30 Mar 2023 16:15:39 +0200 Subject: [PATCH 30/39] Update cpp/src/arrow/extension/fixed_shape_tensor.cc Co-authored-by: Joris Van den Bossche --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index b5036f27cd8..acba49e077d 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -131,6 +131,8 @@ Result> FixedShapeTensorType::Deserialize( std::shared_ptr FixedShapeTensorType::MakeArray( std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("arrow.fixed_shape_tensor", static_cast(*data->type).extension_name()); return std::make_shared(data); } From db3230c772c740fd3e87d50253b9233e506768d3 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 30 Mar 2023 18:08:14 +0200 Subject: [PATCH 31/39] Review feedback --- cpp/src/arrow/CMakeLists.txt | 4 --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 6 ++++- cpp/src/arrow/extension/fixed_shape_tensor.h | 5 ++-- .../extension/fixed_shape_tensor_test.cc | 25 ++++++++++++------- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 60eb1115f34..2321552b977 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -806,10 +806,6 @@ if(ARROW_IPC) add_arrow_test(extension_type_test) endif() -if(ARROW_JSON) - add_arrow_test(canonical_extension_test SOURCES extension/fixed_shape_tensor_test.cc) -endif() - add_arrow_test(misc_test SOURCES datum_test.cc diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index acba49e077d..8b0ed43df5c 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +#include +#include + #include "arrow/extension/fixed_shape_tensor.h" #include "arrow/array/array_nested.h" @@ -132,7 +135,8 @@ Result> FixedShapeTensorType::Deserialize( std::shared_ptr FixedShapeTensorType::MakeArray( std::shared_ptr data) const { DCHECK_EQ(data->type->id(), Type::EXTENSION); - DCHECK_EQ("arrow.fixed_shape_tensor", static_cast(*data->type).extension_name()); + DCHECK_EQ("arrow.fixed_shape_tensor", + static_cast(*data->type).extension_name()); return std::make_shared(data); } diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 7a4141d6f1e..e3d78f782a1 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -15,9 +15,6 @@ // specific language governing permissions and limitations // under the License. -#include -#include - #include "arrow/extension_type.h" namespace arrow { @@ -29,6 +26,8 @@ class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { }; /// \brief Concrete type class for constant-size Tensor data. +/// This is a canonical arrow extension extension type. +/// See: https://arrow.apache.org/docs/format/CanonicalExtensions.html class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { public: FixedShapeTensorType(const std::shared_ptr& value_type, const int32_t& size, diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index b6ec8dded7f..87c42c61ba9 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -51,8 +51,6 @@ class TestExtensionType : public ::testing::Test { values_partial_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; shape_partial_ = {2, 3, 4}; - tensor_strides_ = {96, 32, 8}; - cell_strides_ = {32, 8}; serialized_ = R"({"shape":[3,4],"dim_names":["x","y"]})"; } @@ -66,8 +64,6 @@ class TestExtensionType : public ::testing::Test { std::shared_ptr ext_type_; std::vector values_; std::vector values_partial_; - std::vector tensor_strides_; - std::vector cell_strides_; std::string serialized_; }; @@ -206,14 +202,25 @@ TEST_F(TestExtensionType, RoudtripBatch) { data->type = ext_type_; auto ext_arr = exact_ext_type->MakeArray(data); - auto ext_metadata = - key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, - {"ARROW:extension:metadata", serialized_}}); - auto ext_field = field("f0", exact_ext_type, true, ext_metadata); - auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); + // Pass extension array, expect getting back extension array std::shared_ptr read_batch; + auto ext_field = + field(/*name=*/"f0", /*type=*/exact_ext_type, /*nullable=*/true, /*metadata=*/{}); + auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); RoundtripBatch(batch, &read_batch); CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); + + // Pass extension metadata and storage array, expect getting back extension array + ASSERT_OK_AND_ASSIGN(fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); + std::shared_ptr read_batch2; + auto ext_metadata = + key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, + {"ARROW:extension:metadata", serialized_}}); + ext_field = field(/*name=*/"f0", /*type=*/cell_type_, /*nullable=*/true, + /*metadata=*/ext_metadata); + auto batch2 = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {fsla_arr}); + RoundtripBatch(batch2, &read_batch2); + CompareBatch(*batch, *read_batch2, /*compare_metadata=*/false); } } // namespace arrow From 0d1e9828ac490ede7eb5e140a4bfc858edf5b69c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 31 Mar 2023 00:48:30 +0200 Subject: [PATCH 32/39] ARROW_WITH_JSON and changes to batch roundtrip test --- cpp/src/arrow/extension/CMakeLists.txt | 1 + .../extension/fixed_shape_tensor_test.cc | 27 +++++++------------ cpp/src/arrow/extension_type.cc | 4 +-- 3 files changed, 13 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index c15c42874d4..d3c62e89436 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +add_definitions(-DARROW_WITH_JSON) add_arrow_test(test SOURCES fixed_shape_tensor_test.cc diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 87c42c61ba9..e227dabb21e 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#ifdef ARROW_JSON +#ifdef ARROW_WITH_JSON #include "arrow/extension/fixed_shape_tensor.h" #include "arrow/testing/matchers.h" @@ -144,9 +144,7 @@ TEST_F(TestExtensionType, CreateFromArray) { auto arr_data = std::make_shared(value_type_, values_.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); - auto data = fsla_arr->data(); - data->type = ext_type_; - auto ext_arr = exact_ext_type->MakeArray(data); + auto ext_arr = ExtensionType::WrapArray(ext_type_, fsla_arr); ASSERT_EQ(ext_arr->length(), shape_[0]); ASSERT_EQ(ext_arr->null_count(), 0); } @@ -198,29 +196,24 @@ TEST_F(TestExtensionType, RoudtripBatch) { auto arr_data = std::make_shared(value_type_, values_.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); - auto data = fsla_arr->data(); - data->type = ext_type_; - auto ext_arr = exact_ext_type->MakeArray(data); + auto ext_arr = ExtensionType::WrapArray(ext_type_, fsla_arr); // Pass extension array, expect getting back extension array std::shared_ptr read_batch; - auto ext_field = - field(/*name=*/"f0", /*type=*/exact_ext_type, /*nullable=*/true, /*metadata=*/{}); - auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); + auto ext_field = field(/*name=*/"f0", /*type=*/cell_type_); + auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {fsla_arr}); RoundtripBatch(batch, &read_batch); CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); // Pass extension metadata and storage array, expect getting back extension array - ASSERT_OK_AND_ASSIGN(fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); - std::shared_ptr read_batch2; auto ext_metadata = - key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, - {"ARROW:extension:metadata", serialized_}}); + key_value_metadata({{"ARROW:extension:name", "ARROW:extension:metadata"}, + {exact_ext_type->extension_name(), serialized_}}); ext_field = field(/*name=*/"f0", /*type=*/cell_type_, /*nullable=*/true, /*metadata=*/ext_metadata); - auto batch2 = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {fsla_arr}); - RoundtripBatch(batch2, &read_batch2); - CompareBatch(*batch, *read_batch2, /*compare_metadata=*/false); + batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {fsla_arr}); + RoundtripBatch(batch, &read_batch); + CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); } } // namespace arrow diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 91543dfb19e..542de7a09ee 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -26,7 +26,7 @@ #include "arrow/array/util.h" #include "arrow/chunked_array.h" -#ifdef ARROW_JSON +#ifdef ARROW_WITH_JSON #include "arrow/extension/fixed_shape_tensor.h" #endif #include "arrow/status.h" @@ -143,7 +143,7 @@ namespace internal { static void CreateGlobalRegistry() { g_registry = std::make_shared(); -#ifdef ARROW_JSON +#ifdef ARROW_WITH_JSON // Register canonical extension types auto ext_type = checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); From 2241059cd2d2c032b1d3f5abe48f35b281465369 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 31 Mar 2023 13:50:36 +0200 Subject: [PATCH 33/39] ARROW_WITH_JSON -> ARROW_JSON --- cpp/src/arrow/extension/CMakeLists.txt | 2 +- cpp/src/arrow/extension/fixed_shape_tensor_test.cc | 2 -- cpp/src/arrow/extension_type.cc | 4 ++-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index d3c62e89436..732e8f98cae 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -add_definitions(-DARROW_WITH_JSON) +add_definitions(-DARROW_JSON) add_arrow_test(test SOURCES fixed_shape_tensor_test.cc diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index e227dabb21e..12a697723bb 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. -#ifdef ARROW_WITH_JSON #include "arrow/extension/fixed_shape_tensor.h" #include "arrow/testing/matchers.h" @@ -217,4 +216,3 @@ TEST_F(TestExtensionType, RoudtripBatch) { } } // namespace arrow -#endif diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 542de7a09ee..91543dfb19e 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -26,7 +26,7 @@ #include "arrow/array/util.h" #include "arrow/chunked_array.h" -#ifdef ARROW_WITH_JSON +#ifdef ARROW_JSON #include "arrow/extension/fixed_shape_tensor.h" #endif #include "arrow/status.h" @@ -143,7 +143,7 @@ namespace internal { static void CreateGlobalRegistry() { g_registry = std::make_shared(); -#ifdef ARROW_WITH_JSON +#ifdef ARROW_JSON // Register canonical extension types auto ext_type = checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); From 05926f7cef94d16f90e4f170bcd78cf99f0949b8 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 31 Mar 2023 14:04:03 +0200 Subject: [PATCH 34/39] ARROW_WITH_JSON -> ARROW_JSON --- cpp/src/arrow/extension/CMakeLists.txt | 2 +- cpp/src/arrow/extension_type.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index 732e8f98cae..d3c62e89436 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -add_definitions(-DARROW_JSON) +add_definitions(-DARROW_WITH_JSON) add_arrow_test(test SOURCES fixed_shape_tensor_test.cc diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 91543dfb19e..542de7a09ee 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -26,7 +26,7 @@ #include "arrow/array/util.h" #include "arrow/chunked_array.h" -#ifdef ARROW_JSON +#ifdef ARROW_WITH_JSON #include "arrow/extension/fixed_shape_tensor.h" #endif #include "arrow/status.h" @@ -143,7 +143,7 @@ namespace internal { static void CreateGlobalRegistry() { g_registry = std::make_shared(); -#ifdef ARROW_JSON +#ifdef ARROW_WITH_JSON // Register canonical extension types auto ext_type = checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); From f5ce071bb5514ce66f3ad26ffdfb0d35ffd2ab34 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 31 Mar 2023 15:50:09 +0200 Subject: [PATCH 35/39] ARROW_WITH_JSON->ARROW_JSON --- cpp/src/arrow/extension/CMakeLists.txt | 1 - cpp/src/arrow/extension/fixed_shape_tensor_test.cc | 4 ++-- cpp/src/arrow/extension_type.cc | 5 +++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index d3c62e89436..c15c42874d4 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. -add_definitions(-DARROW_WITH_JSON) add_arrow_test(test SOURCES fixed_shape_tensor_test.cc diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 12a697723bb..231b9395441 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -199,8 +199,8 @@ TEST_F(TestExtensionType, RoudtripBatch) { // Pass extension array, expect getting back extension array std::shared_ptr read_batch; - auto ext_field = field(/*name=*/"f0", /*type=*/cell_type_); - auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {fsla_arr}); + auto ext_field = field(/*name=*/"f0", /*type=*/ext_type_); + auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); RoundtripBatch(batch, &read_batch); CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 542de7a09ee..1199336763d 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -26,7 +26,8 @@ #include "arrow/array/util.h" #include "arrow/chunked_array.h" -#ifdef ARROW_WITH_JSON +#include "arrow/config.h" +#ifdef ARROW_JSON #include "arrow/extension/fixed_shape_tensor.h" #endif #include "arrow/status.h" @@ -143,7 +144,7 @@ namespace internal { static void CreateGlobalRegistry() { g_registry = std::make_shared(); -#ifdef ARROW_WITH_JSON +#ifdef ARROW_JSON // Register canonical extension types auto ext_type = checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); From 01bc09e8eb8e0fed4cd827bb4d93e0b8cfd45259 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 31 Mar 2023 20:49:39 +0200 Subject: [PATCH 36/39] Changes to RoundtripBatch test --- cpp/src/arrow/extension/fixed_shape_tensor_test.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 231b9395441..b29f41ed8c5 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -205,14 +205,15 @@ TEST_F(TestExtensionType, RoudtripBatch) { CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); // Pass extension metadata and storage array, expect getting back extension array + std::shared_ptr read_batch2; auto ext_metadata = - key_value_metadata({{"ARROW:extension:name", "ARROW:extension:metadata"}, - {exact_ext_type->extension_name(), serialized_}}); + key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, + {"ARROW:extension:metadata", serialized_}}); ext_field = field(/*name=*/"f0", /*type=*/cell_type_, /*nullable=*/true, /*metadata=*/ext_metadata); - batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {fsla_arr}); - RoundtripBatch(batch, &read_batch); - CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); + auto batch2 = RecordBatch::Make(schema({ext_field}), fsla_arr->length(), {fsla_arr}); + RoundtripBatch(batch2, &read_batch2); + CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true); } } // namespace arrow From 0b081b2d1fc0841d58f5d7e0344af154ad76fa06 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 4 Apr 2023 09:20:06 +0200 Subject: [PATCH 37/39] Add value_type method --- cpp/src/arrow/extension/fixed_shape_tensor.h | 3 +++ cpp/src/arrow/extension/fixed_shape_tensor_test.cc | 1 + 2 files changed, 4 insertions(+) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index e3d78f782a1..0a409e6bed9 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -48,6 +48,9 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { /// Shape of tensor elements const std::vector shape() const { return shape_; } + /// Value type of tensor elements + const std::shared_ptr value_type() const { return value_type_; } + /// Permutation mapping from logical to physical memory layout of tensor elements const std::vector& permutation() const { return permutation_; } diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index b29f41ed8c5..54c8742cd22 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -104,6 +104,7 @@ TEST_F(TestExtensionType, CreateExtensionType) { ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); ASSERT_EQ(exact_ext_type->ndim(), cell_shape_.size()); ASSERT_EQ(exact_ext_type->shape(), cell_shape_); + ASSERT_EQ(exact_ext_type->value_type(), value_type_); ASSERT_EQ(exact_ext_type->dim_names(), dim_names_); EXPECT_RAISES_WITH_MESSAGE_THAT( From 100dd7fc4d33c23bc3503676fdddb52702bc9c6f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 4 Apr 2023 12:51:49 +0200 Subject: [PATCH 38/39] Update cpp/src/arrow/extension/fixed_shape_tensor.h Co-authored-by: Joris Van den Bossche --- cpp/src/arrow/extension/fixed_shape_tensor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 0a409e6bed9..4ee2b894ee8 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -26,7 +26,7 @@ class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { }; /// \brief Concrete type class for constant-size Tensor data. -/// This is a canonical arrow extension extension type. +/// This is a canonical arrow extension type. /// See: https://arrow.apache.org/docs/format/CanonicalExtensions.html class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { public: From e48f068ec0fe4797246e53da2e72c3fe8c6f6231 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 4 Apr 2023 12:53:46 +0200 Subject: [PATCH 39/39] Removing values_partial_ --- cpp/src/arrow/extension/fixed_shape_tensor_test.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 54c8742cd22..16ba9d2014e 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -47,22 +47,17 @@ class TestExtensionType : public ::testing::Test { fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_)); values_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; - values_partial_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; - shape_partial_ = {2, 3, 4}; serialized_ = R"({"shape":[3,4],"dim_names":["x","y"]})"; } protected: std::vector shape_; - std::vector shape_partial_; std::vector cell_shape_; std::shared_ptr value_type_; std::shared_ptr cell_type_; std::vector dim_names_; std::shared_ptr ext_type_; std::vector values_; - std::vector values_partial_; std::string serialized_; };