From df439e810bc8a5d0fa8ea7b1a26750eb4923b118 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 15 Aug 2023 03:23:25 +0200 Subject: [PATCH 01/62] Initial commit --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/extension/CMakeLists.txt | 2 +- ...test.cc => tensor_extension_array_test.cc} | 408 +++++++++--------- .../arrow/extension/variable_shape_tensor.cc | 166 +++++++ .../arrow/extension/variable_shape_tensor.h | 91 ++++ cpp/src/arrow/extension_type.cc | 6 +- 6 files changed, 459 insertions(+), 215 deletions(-) rename cpp/src/arrow/extension/{fixed_shape_tensor_test.cc => tensor_extension_array_test.cc} (62%) create mode 100644 cpp/src/arrow/extension/variable_shape_tensor.cc create mode 100644 cpp/src/arrow/extension/variable_shape_tensor.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 2e5c67e07b6..d40022b4a5b 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -984,6 +984,7 @@ if(ARROW_JSON) arrow_add_object_library(ARROW_JSON extension/fixed_shape_tensor.cc extension/opaque.cc + extension/variable_shape_tensor.cc json/options.cc json/chunked_builder.cc json/chunker.cc diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index 4ab6a35b52e..ae52bc32a99 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -18,7 +18,7 @@ set(CANONICAL_EXTENSION_TESTS bool8_test.cc json_test.cc uuid_test.cc) if(ARROW_JSON) - list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc opaque_test.cc) + list(APPEND CANONICAL_EXTENSION_TESTS tensor_extension_array_test.cc opaque_test.cc) endif() add_arrow_test(test diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc similarity index 62% rename from cpp/src/arrow/extension/fixed_shape_tensor_test.cc rename to cpp/src/arrow/extension/tensor_extension_array_test.cc index 6d4d2de3265..047aac8a651 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -16,6 +16,7 @@ // under the License. #include "arrow/extension/fixed_shape_tensor.h" +#include "arrow/extension/variable_shape_tensor.h" #include "arrow/testing/matchers.h" @@ -28,7 +29,7 @@ #include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/key_value_metadata.h" -#include "arrow/util/sort_internal.h" +#include "arrow/util/sort.h" namespace arrow { @@ -37,38 +38,42 @@ using arrow::ipc::test::RoundtripBatch; using extension::fixed_shape_tensor; using extension::FixedShapeTensorArray; +using VariableShapeTensorType = extension::VariableShapeTensorType; +using extension::variable_shape_tensor; +using extension::VariableShapeTensorArray; + class TestExtensionType : public ::testing::Test { public: void SetUp() override { shape_ = {3, 3, 4}; - element_shape_ = {3, 4}; + cell_shape_ = {3, 4}; value_type_ = int64(); - element_type_ = fixed_size_list(value_type_, 12); + cell_type_ = fixed_size_list(value_type_, 12); dim_names_ = {"x", "y"}; ext_type_ = internal::checked_pointer_cast( - fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_)); + fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_)); values_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; values_partial_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; shape_partial_ = {2, 3, 4}; tensor_strides_ = {96, 32, 8}; - element_strides_ = {32, 8}; + cell_strides_ = {32, 8}; serialized_ = R"({"shape":[3,4],"dim_names":["x","y"]})"; } protected: std::vector shape_; std::vector shape_partial_; - std::vector element_shape_; + std::vector cell_shape_; std::shared_ptr value_type_; - std::shared_ptr element_type_; + std::shared_ptr cell_type_; std::vector dim_names_; std::shared_ptr ext_type_; std::vector values_; std::vector values_partial_; std::vector tensor_strides_; - std::vector element_strides_; + std::vector cell_strides_; std::string serialized_; }; @@ -84,8 +89,8 @@ TEST_F(TestExtensionType, CreateExtensionType) { // Test ExtensionType methods ASSERT_EQ(ext_type_->extension_name(), "arrow.fixed_shape_tensor"); ASSERT_TRUE(ext_type_->Equals(*exact_ext_type)); - ASSERT_FALSE(ext_type_->Equals(*element_type_)); - ASSERT_TRUE(ext_type_->storage_type()->Equals(*element_type_)); + ASSERT_FALSE(ext_type_->Equals(*cell_type_)); + ASSERT_TRUE(ext_type_->storage_type()->Equals(*cell_type_)); ASSERT_EQ(ext_type_->Serialize(), serialized_); ASSERT_OK_AND_ASSIGN(auto ds, ext_type_->Deserialize(ext_type_->storage_type(), serialized_)); @@ -94,28 +99,18 @@ TEST_F(TestExtensionType, CreateExtensionType) { // Test FixedShapeTensorType methods ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); - ASSERT_EQ(exact_ext_type->ndim(), element_shape_.size()); - ASSERT_EQ(exact_ext_type->shape(), element_shape_); + ASSERT_EQ(exact_ext_type->ndim(), cell_shape_.size()); + ASSERT_EQ(exact_ext_type->shape(), cell_shape_); ASSERT_EQ(exact_ext_type->value_type(), value_type_); - ASSERT_EQ(exact_ext_type->strides(), element_strides_); + ASSERT_EQ(exact_ext_type->strides(), cell_strides_); ASSERT_EQ(exact_ext_type->dim_names(), dim_names_); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr("Invalid: permutation size must match shape size."), - FixedShapeTensorType::Make(value_type_, element_shape_, {0})); + FixedShapeTensorType::Make(value_type_, cell_shape_, {0})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr("Invalid: dim_names size must match shape size."), - FixedShapeTensorType::Make(value_type_, element_shape_, {}, {"x"})); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, - testing::HasSubstr("Invalid: Permutation indices for 2 dimensional tensors must be " - "unique and within [0, 1] range. Got: [3,0]"), - FixedShapeTensorType::Make(value_type_, {5, 6}, {3, 0})); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, - testing::HasSubstr("Invalid: Permutation indices for 3 dimensional tensors must be " - "unique and within [0, 2] range. Got: [0,1,1]"), - FixedShapeTensorType::Make(value_type_, {1, 2, 3}, {0, 1, 1})); + FixedShapeTensorType::Make(value_type_, cell_shape_, {}, {"x"})); } TEST_F(TestExtensionType, EqualsCases) { @@ -146,7 +141,7 @@ TEST_F(TestExtensionType, CreateFromArray) { std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; auto arr_data = std::make_shared(value_type_, values_.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); - ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type_)); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); auto ext_arr = ExtensionType::WrapArray(ext_type_, fsla_arr); ASSERT_EQ(ext_arr->length(), shape_[0]); ASSERT_EQ(ext_arr->null_count(), 0); @@ -174,43 +169,47 @@ TEST_F(TestExtensionType, MakeArrayCanGetCorrectScalarType) { ASSERT_TRUE(tensor->Equals(*tensor_from_array)); } +template void CheckSerializationRoundtrip(const std::shared_ptr& ext_type) { - auto fst_type = internal::checked_pointer_cast(ext_type); - auto serialized = fst_type->Serialize(); + auto type = internal::checked_pointer_cast(ext_type); + auto serialized = type->Serialize(); ASSERT_OK_AND_ASSIGN(auto deserialized, - fst_type->Deserialize(fst_type->storage_type(), serialized)); - ASSERT_TRUE(fst_type->Equals(*deserialized)); + type->Deserialize(type->storage_type(), serialized)); + ASSERT_TRUE(type->Equals(*deserialized)); } -void CheckDeserializationRaises(const std::shared_ptr& storage_type, +void CheckDeserializationRaises(const std::shared_ptr& extension_type, + const std::shared_ptr& storage_type, const std::string& serialized, const std::string& expected_message) { - auto fst_type = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), {3, 4})); + auto ext_type = internal::checked_pointer_cast(extension_type); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr(expected_message), - fst_type->Deserialize(storage_type, serialized)); + ext_type->Deserialize(storage_type, serialized)); } TEST_F(TestExtensionType, MetadataSerializationRoundtrip) { - CheckSerializationRoundtrip(ext_type_); - CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {}, {}, {})); - CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {0}, {}, {})); - CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {1}, {0}, {"x"})); - CheckSerializationRoundtrip( + using T = FixedShapeTensorType; + CheckSerializationRoundtrip(ext_type_); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {}, {}, {})); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {0}, {}, {})); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {1}, {0}, {"x"})); + CheckSerializationRoundtrip( fixed_shape_tensor(value_type_, {256, 256, 3}, {0, 1, 2}, {"H", "W", "C"})); - CheckSerializationRoundtrip( + CheckSerializationRoundtrip( fixed_shape_tensor(value_type_, {256, 256, 3}, {2, 0, 1}, {"C", "H", "W"})); auto storage_type = fixed_size_list(int64(), 12); - CheckDeserializationRaises(boolean(), R"({"shape":[3,4]})", + CheckDeserializationRaises(ext_type_, boolean(), R"({"shape":[3,4]})", "Expected FixedSizeList storage type, got bool"); - CheckDeserializationRaises(storage_type, R"({"dim_names":["x","y"]})", + CheckDeserializationRaises(ext_type_, storage_type, R"({"dim_names":["x","y"]})", "Invalid serialized JSON data"); - CheckDeserializationRaises(storage_type, R"({"shape":(3,4)})", + CheckDeserializationRaises(ext_type_, storage_type, R"({"shape":(3,4)})", "Invalid serialized JSON data"); - CheckDeserializationRaises(storage_type, R"({"shape":[3,4],"permutation":[1,0,2]})", + CheckDeserializationRaises(ext_type_, storage_type, + R"({"shape":[3,4],"permutation":[1,0,2]})", "Invalid permutation"); - CheckDeserializationRaises(storage_type, R"({"shape":[3],"dim_names":["x","y"]})", + CheckDeserializationRaises(ext_type_, storage_type, + R"({"shape":[3],"dim_names":["x","y"]})", "Invalid dim_names"); } @@ -220,7 +219,7 @@ TEST_F(TestExtensionType, RoundtripBatch) { std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; auto arr_data = std::make_shared(value_type_, values_.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); - ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type_)); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); auto ext_arr = ExtensionType::WrapArray(ext_type_, fsla_arr); // Pass extension array, expect getting back extension array @@ -235,7 +234,7 @@ TEST_F(TestExtensionType, RoundtripBatch) { auto ext_metadata = key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, {"ARROW:extension:metadata", serialized_}}); - ext_field = field(/*name=*/"f0", /*type=*/element_type_, /*nullable=*/true, + ext_field = field(/*name=*/"f0", /*type=*/cell_type_, /*nullable=*/true, /*metadata=*/ext_metadata); auto batch2 = RecordBatch::Make(schema({ext_field}), fsla_arr->length(), {fsla_arr}); ASSERT_OK(RoundtripBatch(batch2, &read_batch2)); @@ -290,7 +289,7 @@ TEST_F(TestExtensionType, CreateFromTensor) { auto ext_arr_5 = std::static_pointer_cast( ExtensionType::WrapArray(ext_type_5, fsla_arr)); EXPECT_RAISES_WITH_MESSAGE_THAT( - TypeError, testing::HasSubstr("binary is not valid data type for a tensor"), + Invalid, testing::HasSubstr("binary is not valid data type for a tensor"), ext_arr_5->ToTensor()); auto ext_type_6 = internal::checked_pointer_cast( @@ -298,10 +297,6 @@ TEST_F(TestExtensionType, CreateFromTensor) { auto arr_with_null = ArrayFromJSON(int64(), "[1, 0, null, null, 1, 2]"); ASSERT_OK_AND_ASSIGN(auto fsla_arr_6, FixedSizeListArray::FromArrays( arr_with_null, fixed_size_list(int64(), 2))); - - auto ext_type_7 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), {3, 4}, {})); - ASSERT_OK_AND_ASSIGN(auto ext_arr_7, FixedShapeTensorArray::FromTensor(tensor)); } void CheckFromTensorType(const std::shared_ptr& tensor, @@ -332,7 +327,7 @@ TEST_F(TestExtensionType, TestFromTensorType) { auto dim_names = std::vector>{ {"y", "z"}, {"z", "y"}, {"y", "z"}, {"z", "y"}, {"y", "z"}, {"y", "z"}, {"y", "z"}, {"y", "z"}}; - auto element_shapes = std::vector>{{3, 4}, {4, 3}, {4, 3}, {3, 4}}; + auto cell_shapes = std::vector>{{3, 4}, {4, 3}, {4, 3}, {3, 4}}; auto permutations = std::vector>{{0, 1}, {1, 0}, {0, 1}, {1, 0}}; for (size_t i = 0; i < shapes.size(); i++) { @@ -340,82 +335,11 @@ TEST_F(TestExtensionType, TestFromTensorType) { strides[i], tensor_dim_names[i])); ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); auto ext_type = - fixed_shape_tensor(value_type_, element_shapes[i], permutations[i], dim_names[i]); + fixed_shape_tensor(value_type_, cell_shapes[i], permutations[i], dim_names[i]); CheckFromTensorType(tensor, ext_type); } } -template -void CheckToTensor(const std::vector& values, const std::shared_ptr typ, - const int32_t& element_size, const std::vector& element_shape, - const std::vector& element_permutation, - const std::vector& element_dim_names, - const std::vector& tensor_shape, - const std::vector& tensor_dim_names, - const std::vector& tensor_strides) { - auto buffer = Buffer::Wrap(values); - const std::shared_ptr element_type = fixed_size_list(typ, element_size); - std::vector> buffers = {nullptr, buffer}; - auto arr_data = std::make_shared(typ, values.size(), buffers); - auto arr = std::make_shared(arr_data); - ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type)); - - ASSERT_OK_AND_ASSIGN( - auto expected_tensor, - Tensor::Make(typ, buffer, tensor_shape, tensor_strides, tensor_dim_names)); - const auto ext_type = - fixed_shape_tensor(typ, element_shape, element_permutation, element_dim_names); - - auto ext_arr = ExtensionType::WrapArray(ext_type, fsla_arr); - const auto tensor_array = std::static_pointer_cast(ext_arr); - ASSERT_OK_AND_ASSIGN(const auto actual_tensor, tensor_array->ToTensor()); - ASSERT_OK(actual_tensor->Validate()); - - ASSERT_EQ(actual_tensor->type(), expected_tensor->type()); - ASSERT_EQ(actual_tensor->shape(), expected_tensor->shape()); - ASSERT_EQ(actual_tensor->strides(), expected_tensor->strides()); - ASSERT_EQ(actual_tensor->dim_names(), expected_tensor->dim_names()); - ASSERT_TRUE(actual_tensor->data()->Equals(*expected_tensor->data())); - ASSERT_TRUE(actual_tensor->Equals(*expected_tensor)); -} - -TEST_F(TestExtensionType, ToTensor) { - std::vector float_values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; - - auto element_sizes = std::vector{6, 6, 18, 18, 18, 18}; - - auto element_shapes = std::vector>{{2, 3}, {3, 2}, {3, 6}, - {6, 3}, {3, 2, 3}, {3, 2, 3}}; - auto tensor_shapes = std::vector>{ - {6, 2, 3}, {6, 2, 3}, {2, 3, 6}, {2, 3, 6}, {2, 3, 2, 3}, {2, 3, 2, 3}}; - - auto element_permutations = std::vector>{ - {0, 1}, {1, 0}, {0, 1}, {1, 0}, {0, 1, 2}, {2, 1, 0}}; - auto tensor_strides_32 = - std::vector>{{24, 12, 4}, {24, 4, 8}, {72, 24, 4}, - {72, 4, 12}, {72, 24, 12, 4}, {72, 4, 12, 24}}; - auto tensor_strides_64 = - std::vector>{{48, 24, 8}, {48, 8, 16}, {144, 48, 8}, - {144, 8, 24}, {144, 48, 24, 8}, {144, 8, 24, 48}}; - - auto element_dim_names = std::vector>{ - {"y", "z"}, {"z", "y"}, {"y", "z"}, {"z", "y"}, {"H", "W", "C"}, {"H", "W", "C"}}; - auto tensor_dim_names = std::vector>{ - {"", "y", "z"}, {"", "y", "z"}, {"", "y", "z"}, - {"", "y", "z"}, {"", "H", "W", "C"}, {"", "C", "W", "H"}}; - - for (size_t i = 0; i < element_shapes.size(); i++) { - CheckToTensor(float_values, float32(), element_sizes[i], element_shapes[i], - element_permutations[i], element_dim_names[i], tensor_shapes[i], - tensor_dim_names[i], tensor_strides_32[i]); - CheckToTensor(values_, int64(), element_sizes[i], element_shapes[i], - element_permutations[i], element_dim_names[i], tensor_shapes[i], - tensor_dim_names[i], tensor_strides_64[i]); - } -} - void CheckTensorRoundtrip(const std::shared_ptr& tensor) { ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); ASSERT_OK_AND_ASSIGN(auto tensor_from_array, ext_arr->ToTensor()); @@ -459,7 +383,7 @@ TEST_F(TestExtensionType, SliceTensor) { Tensor::Make(value_type_, Buffer::Wrap(values_partial_), shape_partial_)); ASSERT_EQ(tensor->strides(), tensor_strides_); ASSERT_EQ(tensor_partial->strides(), tensor_strides_); - auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_); + auto ext_type = fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_); auto exact_ext_type = internal::checked_pointer_cast(ext_type_); ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); @@ -499,11 +423,11 @@ TEST_F(TestExtensionType, ComputeStrides) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); auto ext_type_1 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), element_shape_, {}, dim_names_)); + fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_)); auto ext_type_2 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), element_shape_, {}, dim_names_)); + fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_)); auto ext_type_3 = internal::checked_pointer_cast( - fixed_shape_tensor(int32(), element_shape_, {}, dim_names_)); + fixed_shape_tensor(int32(), cell_shape_, {}, dim_names_)); ASSERT_TRUE(ext_type_1->Equals(*ext_type_2)); ASSERT_FALSE(ext_type_1->Equals(*ext_type_3)); @@ -557,96 +481,156 @@ TEST_F(TestExtensionType, ToString) { ASSERT_EQ(expected_3, result_3); } -TEST_F(TestExtensionType, GetTensor) { - auto arr = ArrayFromJSON(element_type_, - "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]," - "[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]"); - auto element_values = - std::vector>{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, - {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}}; - - auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_); - auto permuted_ext_type = fixed_shape_tensor(value_type_, {3, 4}, {1, 0}, {"x", "y"}); - auto exact_ext_type = internal::checked_pointer_cast(ext_type); - auto exact_permuted_ext_type = - internal::checked_pointer_cast(permuted_ext_type); - - auto array = std::static_pointer_cast( - ExtensionType::WrapArray(ext_type, arr)); - auto permuted_array = std::static_pointer_cast( - ExtensionType::WrapArray(permuted_ext_type, arr)); - - for (size_t i = 0; i < element_values.size(); i++) { - // Get tensor from extension array with trivial permutation - ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(i)); - auto actual_ext_scalar = internal::checked_pointer_cast(scalar); - ASSERT_OK_AND_ASSIGN(auto actual_tensor, - exact_ext_type->MakeTensor(actual_ext_scalar)); - ASSERT_OK(actual_tensor->Validate()); - ASSERT_OK_AND_ASSIGN(auto expected_tensor, - Tensor::Make(value_type_, Buffer::Wrap(element_values[i]), - {3, 4}, {}, {"x", "y"})); - ASSERT_EQ(expected_tensor->shape(), actual_tensor->shape()); - ASSERT_EQ(expected_tensor->dim_names(), actual_tensor->dim_names()); - ASSERT_EQ(expected_tensor->strides(), actual_tensor->strides()); - ASSERT_EQ(actual_tensor->strides(), std::vector({32, 8})); - ASSERT_EQ(expected_tensor->type(), actual_tensor->type()); - ASSERT_TRUE(expected_tensor->Equals(*actual_tensor)); - - // Get tensor from extension array with non-trivial permutation - ASSERT_OK_AND_ASSIGN(auto expected_permuted_tensor, - Tensor::Make(value_type_, Buffer::Wrap(element_values[i]), - {4, 3}, {8, 24}, {"y", "x"})); - ASSERT_OK_AND_ASSIGN(scalar, permuted_array->GetScalar(i)); - ASSERT_OK_AND_ASSIGN(auto actual_permuted_tensor, - exact_permuted_ext_type->MakeTensor( - internal::checked_pointer_cast(scalar))); - ASSERT_OK(actual_permuted_tensor->Validate()); - ASSERT_EQ(expected_permuted_tensor->strides(), actual_permuted_tensor->strides()); - ASSERT_EQ(expected_permuted_tensor->shape(), actual_permuted_tensor->shape()); - ASSERT_EQ(expected_permuted_tensor->dim_names(), actual_permuted_tensor->dim_names()); - ASSERT_EQ(expected_permuted_tensor->type(), actual_permuted_tensor->type()); - ASSERT_EQ(expected_permuted_tensor->is_contiguous(), - actual_permuted_tensor->is_contiguous()); - ASSERT_EQ(expected_permuted_tensor->is_column_major(), - actual_permuted_tensor->is_column_major()); - ASSERT_TRUE(expected_permuted_tensor->Equals(*actual_permuted_tensor)); +class TestVariableShapeTensorType : public ::testing::Test { + public: + void SetUp() override { + ndim_ = 3; + value_type_ = int64(); + permutation_ = {0, 1, 2}; + dim_names_ = {"x", "y", "z"}; + ext_type_ = internal::checked_pointer_cast( + variable_shape_tensor(value_type_, ndim_, permutation_, dim_names_)); + shapes_ = + ArrayFromJSON(fixed_size_list(uint32(), ndim_), "[[2,3,1],[1,2,2],[3,1,3]]"); + data_ = ArrayFromJSON(list(value_type_), + "[[0,1,2,3,4,5],[6,7,8,9],[10,11,12,13,14,15,16,17,18]]"); + serialized_ = R"({"permutation":[0,1,2],"dim_names":["x","y","z"]})"; + storage_arr_ = ArrayFromJSON( + ext_type_->storage_type(), + R"([[[2,3,1],[0,1,2,3,4,5]],[[1,2,2],[6,7,8,9]],[[3,1,3],[10,11,12,13,14,15,16,17,18]]])"); + ext_arr_ = internal::checked_pointer_cast( + ExtensionType::WrapArray(ext_type_, storage_arr_)); } - // Test null values fail - auto element_type = fixed_size_list(int64(), 1); - auto fsla_arr = ArrayFromJSON(element_type, "[[1], [null], null]"); - ext_type = fixed_shape_tensor(int64(), {1}); - exact_ext_type = internal::checked_pointer_cast(ext_type); - auto ext_arr = ExtensionType::WrapArray(ext_type, fsla_arr); - auto tensor_array = internal::checked_pointer_cast(ext_arr); + protected: + uint32_t ndim_; + std::shared_ptr value_type_; + std::vector permutation_; + std::vector dim_names_; + std::shared_ptr ext_type_; + std::shared_ptr shapes_; + std::shared_ptr data_; + std::string serialized_; + std::shared_ptr storage_arr_; + std::shared_ptr ext_arr_; +}; - ASSERT_OK_AND_ASSIGN(auto scalar, tensor_array->GetScalar(0)); - ASSERT_OK_AND_ASSIGN(auto tensor, - exact_ext_type->MakeTensor( - internal::checked_pointer_cast(scalar))); +TEST_F(TestVariableShapeTensorType, CheckDummyRegistration) { + // We need a registered dummy type at runtime to allow for IPC deserialization + auto registered_type = GetExtensionType("arrow.variable_shape_tensor"); + ASSERT_TRUE(registered_type->type_id == Type::EXTENSION); +} - ASSERT_OK_AND_ASSIGN(scalar, tensor_array->GetScalar(1)); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("Invalid: Cannot convert data with nulls to Tensor."), - exact_ext_type->MakeTensor( - internal::checked_pointer_cast(scalar))); +TEST_F(TestVariableShapeTensorType, CreateExtensionType) { + auto exact_ext_type = + internal::checked_pointer_cast(ext_type_); + + // Test ExtensionType methods + ASSERT_EQ(ext_type_->extension_name(), "arrow.variable_shape_tensor"); + ASSERT_TRUE(ext_type_->Equals(*exact_ext_type)); + auto expected_type = struct_({ + ::arrow::field("shape", fixed_size_list(uint32(), ndim_)), + ::arrow::field("data", list(value_type_)), + }); + + ASSERT_TRUE(ext_type_->storage_type()->Equals(*expected_type)); + ASSERT_EQ(ext_type_->Serialize(), serialized_); + ASSERT_OK_AND_ASSIGN(auto ds, + ext_type_->Deserialize(ext_type_->storage_type(), serialized_)); + auto deserialized = internal::checked_pointer_cast(ds); + ASSERT_TRUE(deserialized->Equals(*exact_ext_type)); + ASSERT_TRUE(deserialized->Equals(*ext_type_)); + + // Test FixedShapeTensorType methods + ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); + ASSERT_EQ(exact_ext_type->ndim(), ndim_); + ASSERT_EQ(exact_ext_type->value_type(), value_type_); + ASSERT_EQ(exact_ext_type->permutation(), permutation_); + ASSERT_EQ(exact_ext_type->dim_names(), dim_names_); - ASSERT_OK_AND_ASSIGN(scalar, tensor_array->GetScalar(2)); EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("Invalid: Cannot convert data with nulls to Tensor."), - exact_ext_type->MakeTensor( - internal::checked_pointer_cast(scalar))); - - element_type = list(utf8()); - ext_type = fixed_shape_tensor(utf8(), {1}); - exact_ext_type = internal::checked_pointer_cast(ext_type); - scalar = std::make_shared(ArrayFromJSON(element_type, R"([["a", "b"]])")); - auto ext_scalar = std::make_shared(scalar, ext_type); + Invalid, + testing::HasSubstr("Invalid: permutation size must match ndim. Expected: 3 Got: 1"), + VariableShapeTensorType::Make(value_type_, ndim_, {0})); EXPECT_RAISES_WITH_MESSAGE_THAT( - TypeError, - testing::HasSubstr("Type error: Cannot convert non-fixed-width values to Tensor."), - exact_ext_type->MakeTensor(ext_scalar)); + Invalid, testing::HasSubstr("Invalid: dim_names size must match ndim."), + VariableShapeTensorType::Make(value_type_, ndim_, {}, {"x"})); +} + +TEST_F(TestVariableShapeTensorType, EqualsCases) { + auto ext_type_permutation_1 = variable_shape_tensor(int64(), 2, {0, 1}, {"x", "y"}); + auto ext_type_permutation_2 = variable_shape_tensor(int64(), 2, {1, 0}, {"x", "y"}); + auto ext_type_no_permutation = variable_shape_tensor(int64(), 2, {}, {"x", "y"}); + + ASSERT_TRUE(ext_type_permutation_1->Equals(ext_type_permutation_1)); + + ASSERT_FALSE( + variable_shape_tensor(int32(), 2, {}, {"x", "y"})->Equals(ext_type_no_permutation)); + ASSERT_FALSE(variable_shape_tensor(int64(), 2, {}, {}) + ->Equals(variable_shape_tensor(int64(), 3, {}, {}))); + ASSERT_FALSE( + variable_shape_tensor(int64(), 2, {}, {"H", "W"})->Equals(ext_type_no_permutation)); + + ASSERT_TRUE(ext_type_no_permutation->Equals(ext_type_permutation_1)); + ASSERT_TRUE(ext_type_permutation_1->Equals(ext_type_no_permutation)); + ASSERT_FALSE(ext_type_no_permutation->Equals(ext_type_permutation_2)); + ASSERT_FALSE(ext_type_permutation_2->Equals(ext_type_no_permutation)); + ASSERT_FALSE(ext_type_permutation_1->Equals(ext_type_permutation_2)); + ASSERT_FALSE(ext_type_permutation_2->Equals(ext_type_permutation_1)); +} + +TEST_F(TestVariableShapeTensorType, CreateFromArray) { + std::vector field_names = {"shapes", "data"}; + ASSERT_OK_AND_ASSIGN(auto storage_arr, + StructArray::Make({shapes_, data_}, field_names)); + auto arr = ExtensionType::WrapArray(ext_type_, storage_arr); + ASSERT_TRUE(ext_arr_->Equals(*arr)); +} + +TEST_F(TestVariableShapeTensorType, MetadataSerializationRoundtrip) { + using T = VariableShapeTensorType; + + CheckSerializationRoundtrip(ext_type_); + CheckSerializationRoundtrip(variable_shape_tensor(value_type_, {}, {}, {})); + CheckSerializationRoundtrip(variable_shape_tensor(value_type_, {0}, {}, {})); + CheckSerializationRoundtrip(variable_shape_tensor(value_type_, {1}, {0}, {"x"})); + CheckSerializationRoundtrip( + variable_shape_tensor(value_type_, 3, {0, 1, 2}, {"H", "W", "C"})); + CheckSerializationRoundtrip( + variable_shape_tensor(value_type_, 3, {2, 0, 1}, {"C", "H", "W"})); + + auto storage_type = ext_type_->storage_type(); + CheckDeserializationRaises(ext_type_, boolean(), R"({"shape":[3,4]})", + "Expected Struct storage type, got bool"); + CheckDeserializationRaises(ext_type_, storage_type, R"({"shape":(3,4)})", + "Invalid serialized JSON data"); + CheckDeserializationRaises(ext_type_, storage_type, R"({"permutation":[1,0]})", + "Invalid permutation"); + CheckDeserializationRaises(ext_type_, storage_type, R"({"dim_names":["x","y"]})", + "Invalid dim_names"); +} + +TEST_F(TestVariableShapeTensorType, RoudtripBatch) { + auto exact_ext_type = + internal::checked_pointer_cast(ext_type_); + + // Pass extension array, expect getting back extension array + std::shared_ptr read_batch; + auto ext_field = field(/*name=*/"f0", /*type=*/ext_type_); + auto batch = RecordBatch::Make(schema({ext_field}), ext_arr_->length(), {ext_arr_}); + RoundtripBatch(batch, &read_batch); + CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); + + // Pass extension metadata and storage array, expect getting back extension array + std::shared_ptr read_batch2; + auto ext_metadata = + key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, + {"ARROW:extension:metadata", serialized_}}); + ext_field = field(/*name=*/"f0", /*type=*/ext_type_->storage_type(), /*nullable=*/true, + /*metadata=*/ext_metadata); + auto batch2 = RecordBatch::Make(schema({ext_field}), ext_arr_->length(), {ext_arr_}); + RoundtripBatch(batch2, &read_batch2); + CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true); } } // namespace arrow diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc new file mode 100644 index 00000000000..65062132e5c --- /dev/null +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/extension/variable_shape_tensor.h" + +#include "arrow/array/array_nested.h" +#include "arrow/array/array_primitive.h" +#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep +#include "arrow/tensor.h" +#include "arrow/util/int_util_overflow.h" +#include "arrow/util/logging.h" +#include "arrow/util/sort.h" + +#include +#include + +namespace rj = arrow::rapidjson; + +namespace arrow { + +namespace extension { + +namespace {} // namespace + +bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const { + if (extension_name() != other.extension_name()) { + return false; + } + const auto& other_ext = static_cast(other); + if (this->ndim() != other_ext.ndim()) { + return false; + } + + auto is_permutation_trivial = [](const std::vector& permutation) { + for (size_t i = 1; i < permutation.size(); ++i) { + if (permutation[i - 1] + 1 != permutation[i]) { + return false; + } + } + return true; + }; + const bool permutation_equivalent = + ((permutation_ == other_ext.permutation()) || + (permutation_.empty() && is_permutation_trivial(other_ext.permutation())) || + (is_permutation_trivial(permutation_) && other_ext.permutation().empty())); + + return (storage_type()->Equals(other_ext.storage_type())) && + (dim_names_ == other_ext.dim_names()) && permutation_equivalent; +} + +std::string VariableShapeTensorType::Serialize() const { + rj::Document document; + document.SetObject(); + rj::Document::AllocatorType& allocator = document.GetAllocator(); + + if (!permutation_.empty()) { + rj::Value permutation(rj::kArrayType); + for (auto v : permutation_) { + permutation.PushBack(v, allocator); + } + document.AddMember(rj::Value("permutation", allocator), permutation, allocator); + } + + if (!dim_names_.empty()) { + rj::Value dim_names(rj::kArrayType); + for (std::string v : dim_names_) { + dim_names.PushBack(rj::Value{}.SetString(v.c_str(), allocator), allocator); + } + document.AddMember(rj::Value("dim_names", allocator), dim_names, allocator); + } + + rj::StringBuffer buffer; + rj::Writer writer(buffer); + document.Accept(writer); + return buffer.GetString(); +} + +Result> VariableShapeTensorType::Deserialize( + std::shared_ptr storage_type, const std::string& serialized_data) const { + if (storage_type->id() != Type::STRUCT) { + return Status::Invalid("Expected Struct storage type, got ", + storage_type->ToString()); + } + auto value_type = storage_type->field(1)->type()->field(0)->type(); + const size_t ndim = + std::static_pointer_cast(storage_type->field(0)->type()) + ->list_size(); + + rj::Document document; + if (document.Parse(serialized_data.data(), serialized_data.length()).HasParseError()) { + return Status::Invalid("Invalid serialized JSON data: ", serialized_data); + } + + std::vector permutation; + if (document.HasMember("permutation")) { + for (auto& x : document["permutation"].GetArray()) { + permutation.emplace_back(x.GetInt64()); + } + if (permutation.size() != ndim) { + return Status::Invalid("Invalid permutation"); + } + } + std::vector dim_names; + if (document.HasMember("dim_names")) { + for (auto& x : document["dim_names"].GetArray()) { + dim_names.emplace_back(x.GetString()); + } + if (dim_names.size() != ndim) { + return Status::Invalid("Invalid dim_names"); + } + } + + return variable_shape_tensor(value_type, static_cast(ndim), permutation, + dim_names); +} + +std::shared_ptr VariableShapeTensorType::MakeArray( + std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("arrow.variable_shape_tensor", + static_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +Result> VariableShapeTensorType::Make( + const std::shared_ptr& value_type, const uint32_t& ndim, + const std::vector& permutation, const std::vector& dim_names) { + if (!permutation.empty() && permutation.size() != ndim) { + return Status::Invalid("permutation size must match ndim. Expected: ", ndim, + " Got: ", permutation.size()); + } + if (!dim_names.empty() && dim_names.size() != ndim) { + return Status::Invalid("dim_names size must match ndim. Expected: ", ndim, + " Got: ", dim_names.size()); + } + return std::make_shared(value_type, ndim, permutation, + dim_names); +} + +std::shared_ptr variable_shape_tensor( + const std::shared_ptr& value_type, const uint32_t& ndim, + const std::vector& permutation, const std::vector& dim_names) { + auto maybe_type = + VariableShapeTensorType::Make(value_type, ndim, permutation, dim_names); + ARROW_DCHECK_OK(maybe_type.status()); + return maybe_type.MoveValueUnsafe(); +} + +} // namespace extension +} // namespace arrow diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h new file mode 100644 index 00000000000..2d981222ea8 --- /dev/null +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension_type.h" + +namespace arrow { +namespace extension { + +class ARROW_EXPORT VariableShapeTensorArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +/// \brief Concrete type class for variable-shape Tensor data. +/// This is a canonical arrow extension type. +/// See: https://arrow.apache.org/docs/format/CanonicalExtensions.html +class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { + public: + VariableShapeTensorType(const std::shared_ptr& value_type, + const uint32_t& ndim, + const std::vector& permutation = {}, + const std::vector& dim_names = {}) + : ExtensionType(struct_({::arrow::field("shape", fixed_size_list(uint32(), ndim)), + ::arrow::field("data", list(value_type))})), + value_type_(value_type), + permutation_(permutation), + dim_names_(dim_names) {} + + std::string extension_name() const override { return "arrow.variable_shape_tensor"; } + + /// Number of dimensions of tensor elements + uint32_t ndim() const { + std::shared_ptr storage_type = this->storage_type()->field(0)->type(); + return std::static_pointer_cast(storage_type)->list_size(); + } + + /// Value type of tensor elements + const std::shared_ptr value_type() const { return value_type_; } + + /// Permutation mapping from logical to physical memory layout of tensor elements + const std::vector& permutation() const { return permutation_; } + + /// Dimension names of tensor elements. Dimensions are ordered physically. + const std::vector& dim_names() const { return dim_names_; } + + bool ExtensionEquals(const ExtensionType& other) const override; + + std::string Serialize() const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + + /// Create a VariableShapeTensorArray from ArrayData + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + /// \brief Create a VariableShapeTensorType instance + static Result> Make( + const std::shared_ptr& value_type, const uint32_t& ndim, + const std::vector& permutation = {}, + const std::vector& dim_names = {}); + + private: + std::shared_ptr storage_type_; + std::shared_ptr value_type_; + std::vector permutation_; + std::vector dim_names_; +}; + +/// \brief Return a VariableShapeTensorType instance. +ARROW_EXPORT std::shared_ptr variable_shape_tensor( + const std::shared_ptr& value_type, const uint32_t& ndim, + const std::vector& permutation = {}, + const std::vector& dim_names = {}); + +} // namespace extension +} // namespace arrow diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 555ffe0156a..31749601ca7 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -29,8 +29,9 @@ #include "arrow/config.h" #include "arrow/extension/bool8.h" #ifdef ARROW_JSON -# include "arrow/extension/fixed_shape_tensor.h" -# include "arrow/extension/opaque.h" +#include "arrow/extension/fixed_shape_tensor.h" +#include "arrow/extension/opaque.h" +#include "arrow/extension/variable_shape_tensor.h" #endif #include "arrow/extension/json.h" #include "arrow/extension/uuid.h" @@ -155,6 +156,7 @@ static void CreateGlobalRegistry() { #ifdef ARROW_JSON ext_types.push_back(extension::fixed_shape_tensor(int64(), {})); ext_types.push_back(extension::opaque(null(), "", "")); + ext_types.push_back(extension::variable_shape_tensor(int64(), 0)); #endif for (const auto& ext_type : ext_types) { From 765bbf7e5e04829f31a93f0ec5d1bcfa51ef6ead Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sun, 3 Sep 2023 22:27:03 +0200 Subject: [PATCH 02/62] Add VariableShapeTensorArray::ToTensor(i) --- cpp/src/arrow/extension/fixed_shape_tensor.h | 9 +++ .../extension/tensor_extension_array_test.cc | 55 +++++++++++++++++++ .../arrow/extension/variable_shape_tensor.cc | 34 +++++++++++- .../arrow/extension/variable_shape_tensor.h | 10 ++++ 4 files changed, 106 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 80a602021c6..f9a7140c6e5 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -20,6 +20,15 @@ #include "arrow/extension_type.h" namespace arrow { +namespace internal { + +ARROW_EXPORT +Status ComputeStrides(const FixedWidthType& type, const std::vector& shape, + const std::vector& permutation, + std::vector* strides); + +} // namespace internal + namespace extension { class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 047aac8a651..da185737a73 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -30,6 +30,8 @@ #include "arrow/testing/gtest_util.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/sort.h" +#include "arrow/util/logging.h" + namespace arrow { @@ -486,6 +488,8 @@ class TestVariableShapeTensorType : public ::testing::Test { void SetUp() override { ndim_ = 3; value_type_ = int64(); + data_type_ = list(value_type_); + shape_type_ = fixed_size_list(uint32(), ndim_); permutation_ = {0, 1, 2}; dim_names_ = {"x", "y", "z"}; ext_type_ = internal::checked_pointer_cast( @@ -505,6 +509,8 @@ class TestVariableShapeTensorType : public ::testing::Test { protected: uint32_t ndim_; std::shared_ptr value_type_; + std::shared_ptr data_type_; + std::shared_ptr shape_type_; std::vector permutation_; std::vector dim_names_; std::shared_ptr ext_type_; @@ -633,4 +639,53 @@ TEST_F(TestVariableShapeTensorType, RoudtripBatch) { CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true); } +TEST_F(TestVariableShapeTensorType, ComputeStrides) { + auto shapes = ArrayFromJSON(shape_type_, "[[2,3,1],[2,1,2],[3,1,3]]"); + auto data = + ArrayFromJSON(data_type_, "[[1,1,2,3,4,5],[2,7,8,9],[10,11,12,13,14,15,16,17,18]]"); + std::vector> fields = {field("shapes", shape_type_), + field("data", data_type_)}; + ASSERT_OK_AND_ASSIGN(auto storage_arr, StructArray::Make({shapes, data}, fields)); + auto ext_arr = ExtensionType::WrapArray(ext_type_, storage_arr); + auto ext_array = std::static_pointer_cast(ext_arr); + + std::shared_ptr t, tensor; + + ASSERT_OK_AND_ASSIGN(t, ext_array->GetTensor(0)); + ASSERT_EQ(t->shape(), (std::vector{2, 3, 1})); + ASSERT_EQ(t->strides(), (std::vector{24, 8, 8})); + + std::vector shape = {2, 3, 1}; + std::vector strides = {sizeof(int64_t) * 3, sizeof(int64_t) * 1, + sizeof(int64_t) * 1}; + std::vector values = {1, 1, 2, 3, 4, 5}; + auto data_buffer = Buffer::Wrap(values); + ASSERT_OK_AND_ASSIGN(tensor, + Tensor::Make(int64(), data_buffer, shape, strides, dim_names_)); + ASSERT_TRUE(tensor->Equals(*t)); + + ASSERT_OK_AND_ASSIGN(t, ext_array->GetTensor(1)); + ASSERT_EQ(t->shape(), (std::vector{2, 1, 2})); + ASSERT_EQ(t->strides(), (std::vector{16, 16, 8})); + + ASSERT_OK_AND_ASSIGN(t, ext_array->GetTensor(2)); + ASSERT_EQ(t->shape(), (std::vector{3, 1, 3})); + ASSERT_EQ(t->strides(), (std::vector{24, 24, 8})); + + shape = {3, 1, 3}; + strides = {sizeof(int64_t) * 3, sizeof(int64_t) * 3, sizeof(int64_t) * 1}; + values = {10, 11, 12, 13, 14, 15, 16, 17, 18}; + data_buffer = Buffer::Wrap(values); + ASSERT_OK_AND_ASSIGN(tensor, + Tensor::Make(int64(), data_buffer, shape, strides, dim_names_)); + + ASSERT_EQ(tensor->strides(), t->strides()); + ASSERT_EQ(tensor->shape(), t->shape()); + ASSERT_EQ(tensor->dim_names(), t->dim_names()); + ASSERT_EQ(tensor->type(), t->type()); + ASSERT_EQ(tensor->is_contiguous(), t->is_contiguous()); + ASSERT_EQ(tensor->is_column_major(), t->is_column_major()); + ASSERT_TRUE(tensor->Equals(*t)); +} + } // namespace arrow diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 65062132e5c..9803dce0427 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -17,11 +17,13 @@ #include +#include "arrow/extension/fixed_shape_tensor.h" #include "arrow/extension/variable_shape_tensor.h" #include "arrow/array/array_nested.h" #include "arrow/array/array_primitive.h" #include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep +#include "arrow/scalar.h" #include "arrow/tensor.h" #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" @@ -33,10 +35,38 @@ namespace rj = arrow::rapidjson; namespace arrow { - namespace extension { -namespace {} // namespace +const Result> VariableShapeTensorArray::GetTensor( + const int64_t i) const { + auto ext_arr = internal::checked_pointer_cast(this->storage()); + auto ext_type = internal::checked_pointer_cast(this->type()); + auto value_type = + internal::checked_pointer_cast(ext_type->value_type()); + auto ndim = ext_type->ndim(); + auto dim_names = ext_type->dim_names(); + auto shapes = + std::static_pointer_cast(ext_arr->field(0))->value_slice(i); + + std::vector shape; + for (int64_t j = 0; j < ndim; ++j) { + ARROW_ASSIGN_OR_RAISE(auto size, shapes->GetScalar(j)); + shape.push_back( + static_cast(std::static_pointer_cast(size)->value)); + } + + std::vector strides; + ARROW_CHECK_OK(internal::ComputeStrides(*value_type.get(), shape, + ext_type->permutation(), &strides)); + + auto list_arr = + std::static_pointer_cast(ext_arr->field(1))->value_slice(i)->data(); + auto bw = value_type->byte_width(); + auto buffer = + SliceBuffer(list_arr->buffers[1], list_arr->offset * bw, list_arr->length * bw); + + return Tensor::Make(ext_type->value_type(), buffer, shape, strides, dim_names); +} bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const { if (extension_name() != other.extension_name()) { diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index 2d981222ea8..accd6cc46a2 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#pragma once + #include "arrow/extension_type.h" namespace arrow { @@ -23,6 +25,14 @@ namespace extension { class ARROW_EXPORT VariableShapeTensorArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; + + /// \brief Get a Tensor of VariableShapeTensorArray at i + /// + /// This method will return a Tensor from VariableShapeTensorArray with strides + /// derived from shape and permutation of VariableShapeTensorType. Shape and + /// dim_names will be permuted according to permutation stored in the + /// VariableShapeTensorType metadata. + const Result> GetTensor(const int64_t i) const; }; /// \brief Concrete type class for variable-shape Tensor data. From 46691b64a1ede19bc7470b4215b592935bc064bf Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 13 Sep 2023 01:32:11 +0200 Subject: [PATCH 03/62] :Add ragged_dimensions --- .../extension/tensor_extension_array_test.cc | 11 ++++-- .../arrow/extension/variable_shape_tensor.cc | 36 +++++++++++++++---- .../arrow/extension/variable_shape_tensor.h | 16 ++++++--- docs/source/format/CanonicalExtensions.rst | 11 ++++++ 4 files changed, 61 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index da185737a73..5d0f29ee5f8 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -492,13 +492,15 @@ class TestVariableShapeTensorType : public ::testing::Test { shape_type_ = fixed_size_list(uint32(), ndim_); permutation_ = {0, 1, 2}; dim_names_ = {"x", "y", "z"}; - ext_type_ = internal::checked_pointer_cast( - variable_shape_tensor(value_type_, ndim_, permutation_, dim_names_)); + ragged_dimensions_ = {1}; + ext_type_ = internal::checked_pointer_cast(variable_shape_tensor( + value_type_, ndim_, permutation_, dim_names_, ragged_dimensions_)); shapes_ = ArrayFromJSON(fixed_size_list(uint32(), ndim_), "[[2,3,1],[1,2,2],[3,1,3]]"); data_ = ArrayFromJSON(list(value_type_), "[[0,1,2,3,4,5],[6,7,8,9],[10,11,12,13,14,15,16,17,18]]"); - serialized_ = R"({"permutation":[0,1,2],"dim_names":["x","y","z"]})"; + serialized_ = + R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"ragged_dimensions":[1]})"; storage_arr_ = ArrayFromJSON( ext_type_->storage_type(), R"([[[2,3,1],[0,1,2,3,4,5]],[[1,2,2],[6,7,8,9]],[[3,1,3],[10,11,12,13,14,15,16,17,18]]])"); @@ -512,6 +514,7 @@ class TestVariableShapeTensorType : public ::testing::Test { std::shared_ptr data_type_; std::shared_ptr shape_type_; std::vector permutation_; + std::vector ragged_dimensions_; std::vector dim_names_; std::shared_ptr ext_type_; std::shared_ptr shapes_; @@ -604,6 +607,8 @@ TEST_F(TestVariableShapeTensorType, MetadataSerializationRoundtrip) { variable_shape_tensor(value_type_, 3, {0, 1, 2}, {"H", "W", "C"})); CheckSerializationRoundtrip( variable_shape_tensor(value_type_, 3, {2, 0, 1}, {"C", "H", "W"})); + CheckSerializationRoundtrip( + variable_shape_tensor(value_type_, 3, {2, 0, 1}, {"C", "H", "W"}, {0, 1, 2})); auto storage_type = ext_type_->storage_type(); CheckDeserializationRaises(ext_type_, boolean(), R"({"shape":[3,4]})", diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 9803dce0427..3547fa0338f 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -56,6 +56,7 @@ const Result> VariableShapeTensorArray::GetTensor( } std::vector strides; + // TODO: optimize ComputeStrides for ragged tensors ARROW_CHECK_OK(internal::ComputeStrides(*value_type.get(), shape, ext_type->permutation(), &strides)); @@ -115,6 +116,15 @@ std::string VariableShapeTensorType::Serialize() const { document.AddMember(rj::Value("dim_names", allocator), dim_names, allocator); } + if (!ragged_dimensions_.empty()) { + rj::Value ragged_dimensions(rj::kArrayType); + for (auto v : ragged_dimensions_) { + ragged_dimensions.PushBack(v, allocator); + } + document.AddMember(rj::Value("ragged_dimensions", allocator), ragged_dimensions, + allocator); + } + rj::StringBuffer buffer; rj::Writer writer(buffer); document.Accept(writer); @@ -156,8 +166,17 @@ Result> VariableShapeTensorType::Deserialize( } } + std::vector ragged_dimensions; + if (document.HasMember("ragged_dimensions")) { + for (auto& x : document["ragged_dimensions"].GetArray()) { + ragged_dimensions.emplace_back(x.GetInt64()); + } + if (ragged_dimensions.size() > ndim) { + return Status::Invalid("Invalid ragged_dimensions"); + } + } return variable_shape_tensor(value_type, static_cast(ndim), permutation, - dim_names); + dim_names, ragged_dimensions); } std::shared_ptr VariableShapeTensorType::MakeArray( @@ -170,7 +189,8 @@ std::shared_ptr VariableShapeTensorType::MakeArray( Result> VariableShapeTensorType::Make( const std::shared_ptr& value_type, const uint32_t& ndim, - const std::vector& permutation, const std::vector& dim_names) { + const std::vector& permutation, const std::vector& dim_names, + const std::vector& ragged_dimensions) { if (!permutation.empty() && permutation.size() != ndim) { return Status::Invalid("permutation size must match ndim. Expected: ", ndim, " Got: ", permutation.size()); @@ -179,15 +199,19 @@ Result> VariableShapeTensorType::Make( return Status::Invalid("dim_names size must match ndim. Expected: ", ndim, " Got: ", dim_names.size()); } + if (ragged_dimensions.size() > ndim) { + return Status::Invalid("ragged_dimensions size must be less or equal ndim."); + } return std::make_shared(value_type, ndim, permutation, - dim_names); + dim_names, ragged_dimensions); } std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, - const std::vector& permutation, const std::vector& dim_names) { - auto maybe_type = - VariableShapeTensorType::Make(value_type, ndim, permutation, dim_names); + const std::vector& permutation, const std::vector& dim_names, + const std::vector& ragged_dimensions) { + auto maybe_type = VariableShapeTensorType::Make(value_type, ndim, permutation, + dim_names, ragged_dimensions); ARROW_DCHECK_OK(maybe_type.status()); return maybe_type.MoveValueUnsafe(); } diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index accd6cc46a2..41baf902586 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -43,12 +43,14 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { VariableShapeTensorType(const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, - const std::vector& dim_names = {}) + const std::vector& dim_names = {}, + const std::vector& ragged_dimensions = {}) : ExtensionType(struct_({::arrow::field("shape", fixed_size_list(uint32(), ndim)), ::arrow::field("data", list(value_type))})), value_type_(value_type), permutation_(permutation), - dim_names_(dim_names) {} + dim_names_(dim_names), + ragged_dimensions_(ragged_dimensions) {} std::string extension_name() const override { return "arrow.variable_shape_tensor"; } @@ -67,6 +69,9 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { /// Dimension names of tensor elements. Dimensions are ordered physically. const std::vector& dim_names() const { return dim_names_; } + /// Indexes of ragged dimensions. + const std::vector& ragged_dimensions() const { return ragged_dimensions_; } + bool ExtensionEquals(const ExtensionType& other) const override; std::string Serialize() const override; @@ -82,20 +87,23 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { static Result> Make( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, - const std::vector& dim_names = {}); + const std::vector& dim_names = {}, + const std::vector& ragged_dimensions = {}); private: std::shared_ptr storage_type_; std::shared_ptr value_type_; std::vector permutation_; std::vector dim_names_; + std::vector ragged_dimensions_; }; /// \brief Return a VariableShapeTensorType instance. ARROW_EXPORT std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, - const std::vector& dim_names = {}); + const std::vector& dim_names = {}, + const std::vector& ragged_dimensions = {}); } // namespace extension } // namespace arrow diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 8608a6388e0..e3f9929f0d3 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -206,6 +206,12 @@ Variable shape tensor This allows for interpreting the tensor correctly without accounting for uniform dimensions while still permitting optional optimizations that take advantage of the uniformity. + * **ragged_dimensions** = indices of ragged dimensions whose sizes may + differ. Dimensions where all elements have the same size are called + uniform dimensions. Indices are a subset of all possible dimension + indices ([0, 1, .., N-1]). + Ragged dimensions list can be left out. In that case all dimensions + are assumed ragged. * Description of the serialization: @@ -227,6 +233,11 @@ Variable shape tensor ``{ "dim_names": ["H", "W", "C"], "uniform_shape": [400, null, 3] }`` + - Example with ``ragged_dimensions`` metadata for a set of color images + with variable width: + + ``{ "dim_names": ["H", "W", "C"], "ragged_dimensions": [1] }`` + - Example of permuted 3-dimensional tensor: ``{ "permutation": [2, 0, 1] }`` From 249cf9fbef3bc3a73b253054567ce8f95ed74cce Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 15 Sep 2023 20:54:27 +0200 Subject: [PATCH 04/62] Replace ragged_dimensions with uniform_dimensions --- .../extension/tensor_extension_array_test.cc | 8 ++--- .../arrow/extension/variable_shape_tensor.cc | 36 +++++++++---------- .../arrow/extension/variable_shape_tensor.h | 12 +++---- docs/source/format/CanonicalExtensions.rst | 19 +++++----- 4 files changed, 39 insertions(+), 36 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 5d0f29ee5f8..1c542c69ac5 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -492,15 +492,15 @@ class TestVariableShapeTensorType : public ::testing::Test { shape_type_ = fixed_size_list(uint32(), ndim_); permutation_ = {0, 1, 2}; dim_names_ = {"x", "y", "z"}; - ragged_dimensions_ = {1}; + uniform_dimensions_ = {1}; ext_type_ = internal::checked_pointer_cast(variable_shape_tensor( - value_type_, ndim_, permutation_, dim_names_, ragged_dimensions_)); + value_type_, ndim_, permutation_, dim_names_, uniform_dimensions_)); shapes_ = ArrayFromJSON(fixed_size_list(uint32(), ndim_), "[[2,3,1],[1,2,2],[3,1,3]]"); data_ = ArrayFromJSON(list(value_type_), "[[0,1,2,3,4,5],[6,7,8,9],[10,11,12,13,14,15,16,17,18]]"); serialized_ = - R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"ragged_dimensions":[1]})"; + R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"uniform_dimensions":[1]})"; storage_arr_ = ArrayFromJSON( ext_type_->storage_type(), R"([[[2,3,1],[0,1,2,3,4,5]],[[1,2,2],[6,7,8,9]],[[3,1,3],[10,11,12,13,14,15,16,17,18]]])"); @@ -514,7 +514,7 @@ class TestVariableShapeTensorType : public ::testing::Test { std::shared_ptr data_type_; std::shared_ptr shape_type_; std::vector permutation_; - std::vector ragged_dimensions_; + std::vector uniform_dimensions_; std::vector dim_names_; std::shared_ptr ext_type_; std::shared_ptr shapes_; diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 3547fa0338f..ec8d292c012 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -116,12 +116,12 @@ std::string VariableShapeTensorType::Serialize() const { document.AddMember(rj::Value("dim_names", allocator), dim_names, allocator); } - if (!ragged_dimensions_.empty()) { - rj::Value ragged_dimensions(rj::kArrayType); - for (auto v : ragged_dimensions_) { - ragged_dimensions.PushBack(v, allocator); + if (!uniform_dimensions_.empty()) { + rj::Value uniform_dimensions(rj::kArrayType); + for (auto v : uniform_dimensions_) { + uniform_dimensions.PushBack(v, allocator); } - document.AddMember(rj::Value("ragged_dimensions", allocator), ragged_dimensions, + document.AddMember(rj::Value("uniform_dimensions", allocator), uniform_dimensions, allocator); } @@ -166,17 +166,17 @@ Result> VariableShapeTensorType::Deserialize( } } - std::vector ragged_dimensions; - if (document.HasMember("ragged_dimensions")) { - for (auto& x : document["ragged_dimensions"].GetArray()) { - ragged_dimensions.emplace_back(x.GetInt64()); + std::vector uniform_dimensions; + if (document.HasMember("uniform_dimensions")) { + for (auto& x : document["uniform_dimensions"].GetArray()) { + uniform_dimensions.emplace_back(x.GetInt64()); } - if (ragged_dimensions.size() > ndim) { - return Status::Invalid("Invalid ragged_dimensions"); + if (uniform_dimensions.size() > ndim) { + return Status::Invalid("Invalid uniform_dimensions"); } } return variable_shape_tensor(value_type, static_cast(ndim), permutation, - dim_names, ragged_dimensions); + dim_names, uniform_dimensions); } std::shared_ptr VariableShapeTensorType::MakeArray( @@ -190,7 +190,7 @@ std::shared_ptr VariableShapeTensorType::MakeArray( Result> VariableShapeTensorType::Make( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation, const std::vector& dim_names, - const std::vector& ragged_dimensions) { + const std::vector& uniform_dimensions) { if (!permutation.empty() && permutation.size() != ndim) { return Status::Invalid("permutation size must match ndim. Expected: ", ndim, " Got: ", permutation.size()); @@ -199,19 +199,19 @@ Result> VariableShapeTensorType::Make( return Status::Invalid("dim_names size must match ndim. Expected: ", ndim, " Got: ", dim_names.size()); } - if (ragged_dimensions.size() > ndim) { - return Status::Invalid("ragged_dimensions size must be less or equal ndim."); + if (uniform_dimensions.size() > ndim) { + return Status::Invalid("uniform_dimensions size must be less or equal ndim."); } return std::make_shared(value_type, ndim, permutation, - dim_names, ragged_dimensions); + dim_names, uniform_dimensions); } std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation, const std::vector& dim_names, - const std::vector& ragged_dimensions) { + const std::vector& uniform_dimensions) { auto maybe_type = VariableShapeTensorType::Make(value_type, ndim, permutation, - dim_names, ragged_dimensions); + dim_names, uniform_dimensions); ARROW_DCHECK_OK(maybe_type.status()); return maybe_type.MoveValueUnsafe(); } diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index 41baf902586..2a7e41f9b3a 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -44,13 +44,13 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& ragged_dimensions = {}) + const std::vector& uniform_dimensions = {}) : ExtensionType(struct_({::arrow::field("shape", fixed_size_list(uint32(), ndim)), ::arrow::field("data", list(value_type))})), value_type_(value_type), permutation_(permutation), dim_names_(dim_names), - ragged_dimensions_(ragged_dimensions) {} + uniform_dimensions_(uniform_dimensions) {} std::string extension_name() const override { return "arrow.variable_shape_tensor"; } @@ -70,7 +70,7 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const std::vector& dim_names() const { return dim_names_; } /// Indexes of ragged dimensions. - const std::vector& ragged_dimensions() const { return ragged_dimensions_; } + const std::vector& uniform_dimensions() const { return uniform_dimensions_; } bool ExtensionEquals(const ExtensionType& other) const override; @@ -88,14 +88,14 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& ragged_dimensions = {}); + const std::vector& uniform_dimensions = {}); private: std::shared_ptr storage_type_; std::shared_ptr value_type_; std::vector permutation_; std::vector dim_names_; - std::vector ragged_dimensions_; + std::vector uniform_dimensions_; }; /// \brief Return a VariableShapeTensorType instance. @@ -103,7 +103,7 @@ ARROW_EXPORT std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& ragged_dimensions = {}); + const std::vector& uniform_dimensions = {}); } // namespace extension } // namespace arrow diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index e3f9929f0d3..79346488c2c 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -206,12 +206,15 @@ Variable shape tensor This allows for interpreting the tensor correctly without accounting for uniform dimensions while still permitting optional optimizations that take advantage of the uniformity. - * **ragged_dimensions** = indices of ragged dimensions whose sizes may - differ. Dimensions where all elements have the same size are called - uniform dimensions. Indices are a subset of all possible dimension - indices ([0, 1, .., N-1]). - Ragged dimensions list can be left out. In that case all dimensions - are assumed ragged. + * **uniform_dimensions** = indices of dimensions whose sizes are + guaranteed to remain constant. Indices are a subset of all possible + dimension indices ([0, 1, .., N-1]). + The uniform dimensions must still be represented in the `shape` field, + and must always be the same value for all tensors in the array -- this + allows code to interpret the tensor correctly without accounting for + uniform dimensions while still permitting optional optimizations that + take advantage of the uniformity. uniform_dimensions can be left out, + in which case it is assumed that all dimensions might be variable. * Description of the serialization: @@ -233,10 +236,10 @@ Variable shape tensor ``{ "dim_names": ["H", "W", "C"], "uniform_shape": [400, null, 3] }`` - - Example with ``ragged_dimensions`` metadata for a set of color images + - Example with ``uniform_dimensions`` metadata for a set of color images with variable width: - ``{ "dim_names": ["H", "W", "C"], "ragged_dimensions": [1] }`` + ``{ "dim_names": ["H", "W", "C"], "uniform_dimensions": [1] }`` - Example of permuted 3-dimensional tensor: From b031c262d802361f96e1e1b0e9369ac4f00e1471 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 15 Sep 2023 21:13:25 +0200 Subject: [PATCH 05/62] Add example for explanation --- docs/source/format/CanonicalExtensions.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 79346488c2c..0812a25a11c 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -262,8 +262,8 @@ Variable shape tensor This means the logical tensor has names [z, x, y] and shape [30, 10, 20]. .. note:: - Values inside each **data** tensor element are stored in row-major/C-contiguous - order according to the corresponding **shape**. + Elements in a variable shape tensor extension array are stored + in row-major/C-contiguous order. .. _json_extension: From aeba04ac3ede1faa63c244d891ea412d6750bd22 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 25 Sep 2023 00:00:53 +0200 Subject: [PATCH 06/62] Add uniform_shape parameter --- .../extension/tensor_extension_array_test.cc | 19 ++++----- .../arrow/extension/variable_shape_tensor.cc | 42 +++++++++++++++---- .../arrow/extension/variable_shape_tensor.h | 16 +++++-- docs/source/format/CanonicalExtensions.rst | 6 +++ 4 files changed, 59 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 1c542c69ac5..cdefe7aa295 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -493,14 +493,16 @@ class TestVariableShapeTensorType : public ::testing::Test { permutation_ = {0, 1, 2}; dim_names_ = {"x", "y", "z"}; uniform_dimensions_ = {1}; - ext_type_ = internal::checked_pointer_cast(variable_shape_tensor( - value_type_, ndim_, permutation_, dim_names_, uniform_dimensions_)); + uniform_shape_ = {0, 1, 0}; + ext_type_ = internal::checked_pointer_cast( + variable_shape_tensor(value_type_, ndim_, permutation_, dim_names_, + uniform_dimensions_, uniform_shape_)); shapes_ = - ArrayFromJSON(fixed_size_list(uint32(), ndim_), "[[2,3,1],[1,2,2],[3,1,3]]"); + ArrayFromJSON(fixed_size_list(uint32(), ndim_), "[[2,1,3],[2,1,2],[3,1,3]]"); data_ = ArrayFromJSON(list(value_type_), "[[0,1,2,3,4,5],[6,7,8,9],[10,11,12,13,14,15,16,17,18]]"); serialized_ = - R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"uniform_dimensions":[1]})"; + R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"uniform_dimensions":[1],"uniform_shape":[0,1,0]})"; storage_arr_ = ArrayFromJSON( ext_type_->storage_type(), R"([[[2,3,1],[0,1,2,3,4,5]],[[1,2,2],[6,7,8,9]],[[3,1,3],[10,11,12,13,14,15,16,17,18]]])"); @@ -515,6 +517,7 @@ class TestVariableShapeTensorType : public ::testing::Test { std::shared_ptr shape_type_; std::vector permutation_; std::vector uniform_dimensions_; + std::vector uniform_shape_; std::vector dim_names_; std::shared_ptr ext_type_; std::shared_ptr shapes_; @@ -588,14 +591,6 @@ TEST_F(TestVariableShapeTensorType, EqualsCases) { ASSERT_FALSE(ext_type_permutation_2->Equals(ext_type_permutation_1)); } -TEST_F(TestVariableShapeTensorType, CreateFromArray) { - std::vector field_names = {"shapes", "data"}; - ASSERT_OK_AND_ASSIGN(auto storage_arr, - StructArray::Make({shapes_, data_}, field_names)); - auto arr = ExtensionType::WrapArray(ext_type_, storage_arr); - ASSERT_TRUE(ext_arr_->Equals(*arr)); -} - TEST_F(TestVariableShapeTensorType, MetadataSerializationRoundtrip) { using T = VariableShapeTensorType; diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index ec8d292c012..89bbf4011f9 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -92,7 +92,9 @@ bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const (is_permutation_trivial(permutation_) && other_ext.permutation().empty())); return (storage_type()->Equals(other_ext.storage_type())) && - (dim_names_ == other_ext.dim_names()) && permutation_equivalent; + (dim_names_ == other_ext.dim_names()) && + (uniform_dimensions_ == other_ext.uniform_dimensions_) && + (uniform_shape_ == other_ext.uniform_shape()) && permutation_equivalent; } std::string VariableShapeTensorType::Serialize() const { @@ -125,6 +127,14 @@ std::string VariableShapeTensorType::Serialize() const { allocator); } + if (!uniform_shape_.empty()) { + rj::Value uniform_shape(rj::kArrayType); + for (auto v : uniform_shape_) { + uniform_shape.PushBack(v, allocator); + } + document.AddMember(rj::Value("uniform_shape", allocator), uniform_shape, allocator); + } + rj::StringBuffer buffer; rj::Writer writer(buffer); document.Accept(writer); @@ -175,8 +185,19 @@ Result> VariableShapeTensorType::Deserialize( return Status::Invalid("Invalid uniform_dimensions"); } } + + std::vector uniform_shape; + if (document.HasMember("uniform_shape")) { + for (auto& x : document["uniform_shape"].GetArray()) { + uniform_shape.emplace_back(x.GetInt64()); + } + if (uniform_shape.size() > ndim) { + return Status::Invalid("Invalid uniform_shape"); + } + } + return variable_shape_tensor(value_type, static_cast(ndim), permutation, - dim_names, uniform_dimensions); + dim_names, uniform_dimensions, uniform_shape); } std::shared_ptr VariableShapeTensorType::MakeArray( @@ -190,7 +211,8 @@ std::shared_ptr VariableShapeTensorType::MakeArray( Result> VariableShapeTensorType::Make( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation, const std::vector& dim_names, - const std::vector& uniform_dimensions) { + const std::vector& uniform_dimensions, + const std::vector& uniform_shape) { if (!permutation.empty() && permutation.size() != ndim) { return Status::Invalid("permutation size must match ndim. Expected: ", ndim, " Got: ", permutation.size()); @@ -202,16 +224,20 @@ Result> VariableShapeTensorType::Make( if (uniform_dimensions.size() > ndim) { return Status::Invalid("uniform_dimensions size must be less or equal ndim."); } - return std::make_shared(value_type, ndim, permutation, - dim_names, uniform_dimensions); + if (uniform_shape.size() > ndim) { + return Status::Invalid("uniform_shape size must be less or equal ndim."); + } + return std::make_shared( + value_type, ndim, permutation, dim_names, uniform_dimensions, uniform_shape); } std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation, const std::vector& dim_names, - const std::vector& uniform_dimensions) { - auto maybe_type = VariableShapeTensorType::Make(value_type, ndim, permutation, - dim_names, uniform_dimensions); + const std::vector& uniform_dimensions, + const std::vector& uniform_shape) { + auto maybe_type = VariableShapeTensorType::Make( + value_type, ndim, permutation, dim_names, uniform_dimensions, uniform_shape); ARROW_DCHECK_OK(maybe_type.status()); return maybe_type.MoveValueUnsafe(); } diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index 2a7e41f9b3a..c2c40b364f8 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -44,13 +44,15 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& uniform_dimensions = {}) + const std::vector& uniform_dimensions = {}, + const std::vector& uniform_shape = {}) : ExtensionType(struct_({::arrow::field("shape", fixed_size_list(uint32(), ndim)), ::arrow::field("data", list(value_type))})), value_type_(value_type), permutation_(permutation), dim_names_(dim_names), - uniform_dimensions_(uniform_dimensions) {} + uniform_dimensions_(uniform_dimensions), + uniform_shape_(uniform_shape) {} std::string extension_name() const override { return "arrow.variable_shape_tensor"; } @@ -72,6 +74,9 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { /// Indexes of ragged dimensions. const std::vector& uniform_dimensions() const { return uniform_dimensions_; } + /// Shape of uniform dimensions. + const std::vector& uniform_shape() const { return uniform_shape_; } + bool ExtensionEquals(const ExtensionType& other) const override; std::string Serialize() const override; @@ -88,7 +93,8 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& uniform_dimensions = {}); + const std::vector& uniform_dimensions = {}, + const std::vector& uniform_shape = {}); private: std::shared_ptr storage_type_; @@ -96,6 +102,7 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { std::vector permutation_; std::vector dim_names_; std::vector uniform_dimensions_; + std::vector uniform_shape_; }; /// \brief Return a VariableShapeTensorType instance. @@ -103,7 +110,8 @@ ARROW_EXPORT std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& uniform_dimensions = {}); + const std::vector& uniform_dimensions = {}, + const std::vector& uniform_shape = {}); } // namespace extension } // namespace arrow diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 0812a25a11c..16d05647ba0 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -216,6 +216,12 @@ Variable shape tensor take advantage of the uniformity. uniform_dimensions can be left out, in which case it is assumed that all dimensions might be variable. + * **uniform_shape** = shape over dimensions that are guaranteed to stay + constant over of all tensors in the array if all their ragged dimension + sizes were replaced by 0. + An array containing tensor with shape (2, 3, 4) and uniform dimensions + (0, 2) would have uniform shape (2, 0, 4). + * Description of the serialization: The metadata must be a valid JSON object that optionally includes From 173656ffe7870536316d92de260528830e982597 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 25 Sep 2023 15:25:35 +0200 Subject: [PATCH 07/62] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- docs/source/format/CanonicalExtensions.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 16d05647ba0..44d2697390c 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -216,9 +216,9 @@ Variable shape tensor take advantage of the uniformity. uniform_dimensions can be left out, in which case it is assumed that all dimensions might be variable. - * **uniform_shape** = shape over dimensions that are guaranteed to stay - constant over of all tensors in the array if all their ragged dimension - sizes were replaced by 0. + * **uniform_shape** = shape of the dimensions that are guaranteed to stay + constant over all tensors in the array, with the shape of the ragged dimensions + set to 0. An array containing tensor with shape (2, 3, 4) and uniform dimensions (0, 2) would have uniform shape (2, 0, 4). From 7cd9eadba742af93541c36280392a21254af8e5e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 11 Oct 2023 17:13:21 +0200 Subject: [PATCH 08/62] Post rebase --- docs/source/format/CanonicalExtensions.rst | 20 -------------------- python/pyarrow/tests/test_extension_type.py | 2 +- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 44d2697390c..5a4131d8eb8 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -206,21 +206,6 @@ Variable shape tensor This allows for interpreting the tensor correctly without accounting for uniform dimensions while still permitting optional optimizations that take advantage of the uniformity. - * **uniform_dimensions** = indices of dimensions whose sizes are - guaranteed to remain constant. Indices are a subset of all possible - dimension indices ([0, 1, .., N-1]). - The uniform dimensions must still be represented in the `shape` field, - and must always be the same value for all tensors in the array -- this - allows code to interpret the tensor correctly without accounting for - uniform dimensions while still permitting optional optimizations that - take advantage of the uniformity. uniform_dimensions can be left out, - in which case it is assumed that all dimensions might be variable. - - * **uniform_shape** = shape of the dimensions that are guaranteed to stay - constant over all tensors in the array, with the shape of the ragged dimensions - set to 0. - An array containing tensor with shape (2, 3, 4) and uniform dimensions - (0, 2) would have uniform shape (2, 0, 4). * Description of the serialization: @@ -242,11 +227,6 @@ Variable shape tensor ``{ "dim_names": ["H", "W", "C"], "uniform_shape": [400, null, 3] }`` - - Example with ``uniform_dimensions`` metadata for a set of color images - with variable width: - - ``{ "dim_names": ["H", "W", "C"], "uniform_dimensions": [1] }`` - - Example of permuted 3-dimensional tensor: ``{ "permutation": [2, 0, 1] }`` diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index ebac37e862b..a44a52f3998 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1712,7 +1712,7 @@ def test_tensor_type_is_picklable(pickle_module): 'fixed_shape_tensor[value_type=int64, shape=[2,2,3], dim_names=[C,H,W]]' ) ]) -def test_tensor_type_str(tensor_type, text): +def test_tensor_type_str(tensor_type, text, pickle_module): tensor_type_str = tensor_type.__str__() assert text in tensor_type_str From 3bbdd46725a1589ce6856b606c0a3aacedd627a2 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 12 Oct 2023 04:56:17 +0200 Subject: [PATCH 09/62] Remove uniform_dimensions, fix python test --- .../extension/tensor_extension_array_test.cc | 6 ++-- .../arrow/extension/variable_shape_tensor.cc | 31 ++----------------- .../arrow/extension/variable_shape_tensor.h | 8 ----- 3 files changed, 5 insertions(+), 40 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index cdefe7aa295..d2d06e24ea2 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -492,17 +492,16 @@ class TestVariableShapeTensorType : public ::testing::Test { shape_type_ = fixed_size_list(uint32(), ndim_); permutation_ = {0, 1, 2}; dim_names_ = {"x", "y", "z"}; - uniform_dimensions_ = {1}; uniform_shape_ = {0, 1, 0}; ext_type_ = internal::checked_pointer_cast( variable_shape_tensor(value_type_, ndim_, permutation_, dim_names_, - uniform_dimensions_, uniform_shape_)); + uniform_shape_)); shapes_ = ArrayFromJSON(fixed_size_list(uint32(), ndim_), "[[2,1,3],[2,1,2],[3,1,3]]"); data_ = ArrayFromJSON(list(value_type_), "[[0,1,2,3,4,5],[6,7,8,9],[10,11,12,13,14,15,16,17,18]]"); serialized_ = - R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"uniform_dimensions":[1],"uniform_shape":[0,1,0]})"; + R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"uniform_shape":[0,1,0]})"; storage_arr_ = ArrayFromJSON( ext_type_->storage_type(), R"([[[2,3,1],[0,1,2,3,4,5]],[[1,2,2],[6,7,8,9]],[[3,1,3],[10,11,12,13,14,15,16,17,18]]])"); @@ -516,7 +515,6 @@ class TestVariableShapeTensorType : public ::testing::Test { std::shared_ptr data_type_; std::shared_ptr shape_type_; std::vector permutation_; - std::vector uniform_dimensions_; std::vector uniform_shape_; std::vector dim_names_; std::shared_ptr ext_type_; diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 89bbf4011f9..0982fd838c9 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -93,7 +93,6 @@ bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const return (storage_type()->Equals(other_ext.storage_type())) && (dim_names_ == other_ext.dim_names()) && - (uniform_dimensions_ == other_ext.uniform_dimensions_) && (uniform_shape_ == other_ext.uniform_shape()) && permutation_equivalent; } @@ -118,15 +117,6 @@ std::string VariableShapeTensorType::Serialize() const { document.AddMember(rj::Value("dim_names", allocator), dim_names, allocator); } - if (!uniform_dimensions_.empty()) { - rj::Value uniform_dimensions(rj::kArrayType); - for (auto v : uniform_dimensions_) { - uniform_dimensions.PushBack(v, allocator); - } - document.AddMember(rj::Value("uniform_dimensions", allocator), uniform_dimensions, - allocator); - } - if (!uniform_shape_.empty()) { rj::Value uniform_shape(rj::kArrayType); for (auto v : uniform_shape_) { @@ -176,16 +166,6 @@ Result> VariableShapeTensorType::Deserialize( } } - std::vector uniform_dimensions; - if (document.HasMember("uniform_dimensions")) { - for (auto& x : document["uniform_dimensions"].GetArray()) { - uniform_dimensions.emplace_back(x.GetInt64()); - } - if (uniform_dimensions.size() > ndim) { - return Status::Invalid("Invalid uniform_dimensions"); - } - } - std::vector uniform_shape; if (document.HasMember("uniform_shape")) { for (auto& x : document["uniform_shape"].GetArray()) { @@ -197,7 +177,7 @@ Result> VariableShapeTensorType::Deserialize( } return variable_shape_tensor(value_type, static_cast(ndim), permutation, - dim_names, uniform_dimensions, uniform_shape); + dim_names, uniform_shape); } std::shared_ptr VariableShapeTensorType::MakeArray( @@ -211,7 +191,6 @@ std::shared_ptr VariableShapeTensorType::MakeArray( Result> VariableShapeTensorType::Make( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation, const std::vector& dim_names, - const std::vector& uniform_dimensions, const std::vector& uniform_shape) { if (!permutation.empty() && permutation.size() != ndim) { return Status::Invalid("permutation size must match ndim. Expected: ", ndim, @@ -221,23 +200,19 @@ Result> VariableShapeTensorType::Make( return Status::Invalid("dim_names size must match ndim. Expected: ", ndim, " Got: ", dim_names.size()); } - if (uniform_dimensions.size() > ndim) { - return Status::Invalid("uniform_dimensions size must be less or equal ndim."); - } if (uniform_shape.size() > ndim) { return Status::Invalid("uniform_shape size must be less or equal ndim."); } return std::make_shared( - value_type, ndim, permutation, dim_names, uniform_dimensions, uniform_shape); + value_type, ndim, permutation, dim_names, uniform_shape); } std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation, const std::vector& dim_names, - const std::vector& uniform_dimensions, const std::vector& uniform_shape) { auto maybe_type = VariableShapeTensorType::Make( - value_type, ndim, permutation, dim_names, uniform_dimensions, uniform_shape); + value_type, ndim, permutation, dim_names, uniform_shape); ARROW_DCHECK_OK(maybe_type.status()); return maybe_type.MoveValueUnsafe(); } diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index c2c40b364f8..469a3f87396 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -44,14 +44,12 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& uniform_dimensions = {}, const std::vector& uniform_shape = {}) : ExtensionType(struct_({::arrow::field("shape", fixed_size_list(uint32(), ndim)), ::arrow::field("data", list(value_type))})), value_type_(value_type), permutation_(permutation), dim_names_(dim_names), - uniform_dimensions_(uniform_dimensions), uniform_shape_(uniform_shape) {} std::string extension_name() const override { return "arrow.variable_shape_tensor"; } @@ -71,9 +69,6 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { /// Dimension names of tensor elements. Dimensions are ordered physically. const std::vector& dim_names() const { return dim_names_; } - /// Indexes of ragged dimensions. - const std::vector& uniform_dimensions() const { return uniform_dimensions_; } - /// Shape of uniform dimensions. const std::vector& uniform_shape() const { return uniform_shape_; } @@ -93,7 +88,6 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& uniform_dimensions = {}, const std::vector& uniform_shape = {}); private: @@ -101,7 +95,6 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { std::shared_ptr value_type_; std::vector permutation_; std::vector dim_names_; - std::vector uniform_dimensions_; std::vector uniform_shape_; }; @@ -110,7 +103,6 @@ ARROW_EXPORT std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& uniform_dimensions = {}, const std::vector& uniform_shape = {}); } // namespace extension From 46921660158bf476130bb145548482974f191c88 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 12 Oct 2023 05:30:04 +0200 Subject: [PATCH 10/62] lint --- cpp/src/arrow/extension/tensor_extension_array_test.cc | 5 ++--- cpp/src/arrow/extension/variable_shape_tensor.cc | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index d2d06e24ea2..7e3c4a810c5 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -493,9 +493,8 @@ class TestVariableShapeTensorType : public ::testing::Test { permutation_ = {0, 1, 2}; dim_names_ = {"x", "y", "z"}; uniform_shape_ = {0, 1, 0}; - ext_type_ = internal::checked_pointer_cast( - variable_shape_tensor(value_type_, ndim_, permutation_, dim_names_, - uniform_shape_)); + ext_type_ = internal::checked_pointer_cast(variable_shape_tensor( + value_type_, ndim_, permutation_, dim_names_, uniform_shape_)); shapes_ = ArrayFromJSON(fixed_size_list(uint32(), ndim_), "[[2,1,3],[2,1,2],[3,1,3]]"); data_ = ArrayFromJSON(list(value_type_), diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 0982fd838c9..4fa71f66b24 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -203,16 +203,16 @@ Result> VariableShapeTensorType::Make( if (uniform_shape.size() > ndim) { return Status::Invalid("uniform_shape size must be less or equal ndim."); } - return std::make_shared( - value_type, ndim, permutation, dim_names, uniform_shape); + return std::make_shared(value_type, ndim, permutation, + dim_names, uniform_shape); } std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation, const std::vector& dim_names, const std::vector& uniform_shape) { - auto maybe_type = VariableShapeTensorType::Make( - value_type, ndim, permutation, dim_names, uniform_shape); + auto maybe_type = VariableShapeTensorType::Make(value_type, ndim, permutation, + dim_names, uniform_shape); ARROW_DCHECK_OK(maybe_type.status()); return maybe_type.MoveValueUnsafe(); } From 835aceab34f39676a53f169b6ad8928e0ff61334 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 12 Oct 2023 14:44:14 +0200 Subject: [PATCH 11/62] uniform_shape values are optional --- .../extension/tensor_extension_array_test.cc | 6 +++--- .../arrow/extension/variable_shape_tensor.cc | 18 +++++++++++++----- .../arrow/extension/variable_shape_tensor.h | 12 +++++++----- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 7e3c4a810c5..74d4882e503 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -492,7 +492,7 @@ class TestVariableShapeTensorType : public ::testing::Test { shape_type_ = fixed_size_list(uint32(), ndim_); permutation_ = {0, 1, 2}; dim_names_ = {"x", "y", "z"}; - uniform_shape_ = {0, 1, 0}; + uniform_shape_ = {std::nullopt, std::optional(1), std::nullopt}; ext_type_ = internal::checked_pointer_cast(variable_shape_tensor( value_type_, ndim_, permutation_, dim_names_, uniform_shape_)); shapes_ = @@ -500,7 +500,7 @@ class TestVariableShapeTensorType : public ::testing::Test { data_ = ArrayFromJSON(list(value_type_), "[[0,1,2,3,4,5],[6,7,8,9],[10,11,12,13,14,15,16,17,18]]"); serialized_ = - R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"uniform_shape":[0,1,0]})"; + R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"uniform_shape":[null,1,null]})"; storage_arr_ = ArrayFromJSON( ext_type_->storage_type(), R"([[[2,3,1],[0,1,2,3,4,5]],[[1,2,2],[6,7,8,9]],[[3,1,3],[10,11,12,13,14,15,16,17,18]]])"); @@ -514,7 +514,7 @@ class TestVariableShapeTensorType : public ::testing::Test { std::shared_ptr data_type_; std::shared_ptr shape_type_; std::vector permutation_; - std::vector uniform_shape_; + std::vector> uniform_shape_; std::vector dim_names_; std::shared_ptr ext_type_; std::shared_ptr shapes_; diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 4fa71f66b24..0c6810671fe 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -120,7 +120,11 @@ std::string VariableShapeTensorType::Serialize() const { if (!uniform_shape_.empty()) { rj::Value uniform_shape(rj::kArrayType); for (auto v : uniform_shape_) { - uniform_shape.PushBack(v, allocator); + if (v.has_value()) { + uniform_shape.PushBack(v.value(), allocator); + } else { + uniform_shape.PushBack(rj::Value{}.SetNull(), allocator); + } } document.AddMember(rj::Value("uniform_shape", allocator), uniform_shape, allocator); } @@ -166,10 +170,14 @@ Result> VariableShapeTensorType::Deserialize( } } - std::vector uniform_shape; + std::vector> uniform_shape; if (document.HasMember("uniform_shape")) { for (auto& x : document["uniform_shape"].GetArray()) { - uniform_shape.emplace_back(x.GetInt64()); + if (x.IsNull()) { + uniform_shape.emplace_back(std::nullopt); + } else { + uniform_shape.emplace_back(x.GetInt64()); + } } if (uniform_shape.size() > ndim) { return Status::Invalid("Invalid uniform_shape"); @@ -191,7 +199,7 @@ std::shared_ptr VariableShapeTensorType::MakeArray( Result> VariableShapeTensorType::Make( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation, const std::vector& dim_names, - const std::vector& uniform_shape) { + const std::vector>& uniform_shape) { if (!permutation.empty() && permutation.size() != ndim) { return Status::Invalid("permutation size must match ndim. Expected: ", ndim, " Got: ", permutation.size()); @@ -210,7 +218,7 @@ Result> VariableShapeTensorType::Make( std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation, const std::vector& dim_names, - const std::vector& uniform_shape) { + const std::vector>& uniform_shape) { auto maybe_type = VariableShapeTensorType::Make(value_type, ndim, permutation, dim_names, uniform_shape); ARROW_DCHECK_OK(maybe_type.status()); diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index 469a3f87396..da14b4d8c1b 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -44,7 +44,7 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& uniform_shape = {}) + const std::vector>& uniform_shape = {}) : ExtensionType(struct_({::arrow::field("shape", fixed_size_list(uint32(), ndim)), ::arrow::field("data", list(value_type))})), value_type_(value_type), @@ -70,7 +70,9 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const std::vector& dim_names() const { return dim_names_; } /// Shape of uniform dimensions. - const std::vector& uniform_shape() const { return uniform_shape_; } + const std::vector>& uniform_shape() const { + return uniform_shape_; + } bool ExtensionEquals(const ExtensionType& other) const override; @@ -88,14 +90,14 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& uniform_shape = {}); + const std::vector>& uniform_shape = {}); private: std::shared_ptr storage_type_; std::shared_ptr value_type_; std::vector permutation_; std::vector dim_names_; - std::vector uniform_shape_; + std::vector> uniform_shape_; }; /// \brief Return a VariableShapeTensorType instance. @@ -103,7 +105,7 @@ ARROW_EXPORT std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& uniform_shape = {}); + const std::vector>& uniform_shape = {}); } // namespace extension } // namespace arrow From a3b82a639a02e483d51312bb68dba52fafe11f10 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sun, 29 Oct 2023 03:26:46 +0100 Subject: [PATCH 12/62] Add scalar test --- cpp/src/arrow/extension/variable_shape_tensor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 0c6810671fe..9c303bc1033 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -56,7 +56,7 @@ const Result> VariableShapeTensorArray::GetTensor( } std::vector strides; - // TODO: optimize ComputeStrides for ragged tensors + // TODO: optimize ComputeStrides for non-uniform tensors ARROW_CHECK_OK(internal::ComputeStrides(*value_type.get(), shape, ext_type->permutation(), &strides)); From 91547fd2a18a946668b41e6709a7cb8e7680a131 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 30 Oct 2023 05:47:04 +0100 Subject: [PATCH 13/62] Create Tensor from scalar --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 9 +++++---- cpp/src/arrow/extension/fixed_shape_tensor.h | 3 ++- .../extension/tensor_extension_array_test.cc | 5 +++++ cpp/src/arrow/extension/variable_shape_tensor.cc | 15 +++++++-------- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index bb7082e6976..45c211b0cee 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -43,14 +43,15 @@ namespace extension { namespace { -Status ComputeStrides(const FixedWidthType& type, const std::vector& shape, +Status ComputeStrides(const std::shared_ptr& value_type, + const std::vector& shape, const std::vector& permutation, std::vector* strides) { + auto fixed_width_type = internal::checked_pointer_cast(value_type); if (permutation.empty()) { - return internal::ComputeRowMajorStrides(type, shape, strides); + return internal::ComputeRowMajorStrides(*fixed_width_type.get(), shape, strides); } - - const int byte_width = type.byte_width(); + const int byte_width = value_type->byte_width(); int64_t remaining = 0; if (!shape.empty() && shape.front() > 0) { diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index f9a7140c6e5..3eaeab236d5 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -23,7 +23,8 @@ namespace arrow { namespace internal { ARROW_EXPORT -Status ComputeStrides(const FixedWidthType& type, const std::vector& shape, +Status ComputeStrides(const std::shared_ptr& value_type, + const std::vector& shape, const std::vector& permutation, std::vector* strides); diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 74d4882e503..0b86841acf6 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -652,6 +652,11 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { ASSERT_EQ(t->shape(), (std::vector{2, 3, 1})); ASSERT_EQ(t->strides(), (std::vector{24, 8, 8})); + ASSERT_OK_AND_ASSIGN(auto sc, ext_array->GetScalar(0)); + + auto vt = internal::checked_pointer_cast(sc->type); + auto it = vt->value_type(); + std::vector shape = {2, 3, 1}; std::vector strides = {sizeof(int64_t) * 3, sizeof(int64_t) * 1, sizeof(int64_t) * 1}; diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 9c303bc1033..c87dd4036ef 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -41,8 +41,7 @@ const Result> VariableShapeTensorArray::GetTensor( const int64_t i) const { auto ext_arr = internal::checked_pointer_cast(this->storage()); auto ext_type = internal::checked_pointer_cast(this->type()); - auto value_type = - internal::checked_pointer_cast(ext_type->value_type()); + auto value_type = ext_type->value_type(); auto ndim = ext_type->ndim(); auto dim_names = ext_type->dim_names(); auto shapes = @@ -57,16 +56,16 @@ const Result> VariableShapeTensorArray::GetTensor( std::vector strides; // TODO: optimize ComputeStrides for non-uniform tensors - ARROW_CHECK_OK(internal::ComputeStrides(*value_type.get(), shape, - ext_type->permutation(), &strides)); + ARROW_CHECK_OK( + internal::ComputeStrides(value_type, shape, ext_type->permutation(), &strides)); auto list_arr = std::static_pointer_cast(ext_arr->field(1))->value_slice(i)->data(); - auto bw = value_type->byte_width(); - auto buffer = - SliceBuffer(list_arr->buffers[1], list_arr->offset * bw, list_arr->length * bw); + auto byte_width = value_type->byte_width(); + auto buffer = SliceBuffer(list_arr->buffers[1], list_arr->offset * byte_width, + list_arr->length * byte_width); - return Tensor::Make(ext_type->value_type(), buffer, shape, strides, dim_names); + return Tensor::Make(value_type, buffer, shape, strides, dim_names); } bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const { From 5bd975dd22f5acd52dfbef2b7fa9cf177111de3d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 28 Nov 2023 04:22:41 +0100 Subject: [PATCH 14/62] Move get_tensor logic to cpp --- .../extension/tensor_extension_array_test.cc | 22 +++++++++++---- .../arrow/extension/variable_shape_tensor.cc | 28 +++++++++++++++++++ .../arrow/extension/variable_shape_tensor.h | 4 +++ 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 0b86841acf6..104d2b86275 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -652,11 +652,6 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { ASSERT_EQ(t->shape(), (std::vector{2, 3, 1})); ASSERT_EQ(t->strides(), (std::vector{24, 8, 8})); - ASSERT_OK_AND_ASSIGN(auto sc, ext_array->GetScalar(0)); - - auto vt = internal::checked_pointer_cast(sc->type); - auto it = vt->value_type(); - std::vector shape = {2, 3, 1}; std::vector strides = {sizeof(int64_t) * 3, sizeof(int64_t) * 1, sizeof(int64_t) * 1}; @@ -688,6 +683,23 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { ASSERT_EQ(tensor->is_contiguous(), t->is_contiguous()); ASSERT_EQ(tensor->is_column_major(), t->is_column_major()); ASSERT_TRUE(tensor->Equals(*t)); + + auto exact_ext_type = + internal::checked_pointer_cast(ext_type_); + + ASSERT_OK_AND_ASSIGN(auto sc, ext_arr->GetScalar(2)); + auto s = internal::checked_pointer_cast(sc); + ASSERT_OK_AND_ASSIGN(t, exact_ext_type->GetTensor(s)); + ASSERT_EQ(tensor->strides(), t->strides()); + ASSERT_EQ(tensor->shape(), t->shape()); + ASSERT_EQ(tensor->dim_names(), t->dim_names()); + ASSERT_EQ(tensor->type(), t->type()); + ASSERT_EQ(tensor->is_contiguous(), t->is_contiguous()); + ASSERT_EQ(tensor->is_column_major(), t->is_column_major()); + + // tensor's data == {10, 11, 12, 13, 14, 15, 16, 17, 18} + // t's data == {1, 1, 2, 3, 4, 5, 6, 7, 8} + ASSERT_TRUE(tensor->Equals(*t)); } } // namespace arrow diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index c87dd4036ef..0a03597e183 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -195,6 +195,34 @@ std::shared_ptr VariableShapeTensorType::MakeArray( return std::make_shared(data); } +Result> VariableShapeTensorType::GetTensor( + const std::shared_ptr& scalar) const { + const auto tensor_scalar = internal::checked_pointer_cast(scalar->value); + const auto fields = this->storage_type()->fields(); + + ARROW_ASSIGN_OR_RAISE(const auto shape_scalar, tensor_scalar->field(0)); + ARROW_ASSIGN_OR_RAISE(const auto data, tensor_scalar->field(1)); + const auto shape_array = + std::static_pointer_cast(shape_scalar)->value; + + std::vector shape; + for (uint32_t j = 0; j < this->ndim(); ++j) { + ARROW_ASSIGN_OR_RAISE(auto size, shape_array->GetScalar(j)); + shape.push_back( + static_cast(std::static_pointer_cast(size)->value)); + } + + // TODO: optimize ComputeStrides for non-uniform tensors + std::vector strides; + ARROW_CHECK_OK( + internal::ComputeStrides(this->value_type(), shape, this->permutation(), &strides)); + + const auto buffer = + std::static_pointer_cast(data)->value->data()->buffers[1]; + + return Tensor::Make(this->value_type(), buffer, shape, strides, this->dim_names()); +} + Result> VariableShapeTensorType::Make( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation, const std::vector& dim_names, diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index da14b4d8c1b..f44dfd8e168 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -85,6 +85,10 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { /// Create a VariableShapeTensorArray from ArrayData std::shared_ptr MakeArray(std::shared_ptr data) const override; + /// Convert an ExtensionScalar to a Tensor + Result> GetTensor( + const std::shared_ptr&) const; + /// \brief Create a VariableShapeTensorType instance static Result> Make( const std::shared_ptr& value_type, const uint32_t& ndim, From 53426429311c726a66b722f5caf242f3169a9b56 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 28 Nov 2023 13:54:09 +0100 Subject: [PATCH 15/62] slice buffer with array offset --- .../arrow/extension/tensor_extension_array_test.cc | 3 --- cpp/src/arrow/extension/variable_shape_tensor.cc | 11 +++++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 104d2b86275..bb13b4b6182 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -696,9 +696,6 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { ASSERT_EQ(tensor->type(), t->type()); ASSERT_EQ(tensor->is_contiguous(), t->is_contiguous()); ASSERT_EQ(tensor->is_column_major(), t->is_column_major()); - - // tensor's data == {10, 11, 12, 13, 14, 15, 16, 17, 18} - // t's data == {1, 1, 2, 3, 4, 5, 6, 7, 8} ASSERT_TRUE(tensor->Equals(*t)); } diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 0a03597e183..1343e3538f6 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -217,8 +217,15 @@ Result> VariableShapeTensorType::GetTensor( ARROW_CHECK_OK( internal::ComputeStrides(this->value_type(), shape, this->permutation(), &strides)); - const auto buffer = - std::static_pointer_cast(data)->value->data()->buffers[1]; + const auto array = std::static_pointer_cast(data)->value; + const auto byte_width = this->value_type()->byte_width(); + const auto start_position = array->offset() * byte_width; + const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), + std::multiplies<>()); + + // Create a slice of the buffer + std::shared_ptr buffer = + arrow::SliceBuffer(array->data()->buffers[1], start_position, size * byte_width); return Tensor::Make(this->value_type(), buffer, shape, strides, this->dim_names()); } From 659e17cf3fc795322cfbf8d8a8c50d80bbb100fc Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 29 Nov 2023 00:11:04 +0100 Subject: [PATCH 16/62] Update cpp/src/arrow/extension/variable_shape_tensor.h Co-authored-by: Antoine Pitrou --- cpp/src/arrow/extension/variable_shape_tensor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index f44dfd8e168..8bb1e197a87 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -61,7 +61,7 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { } /// Value type of tensor elements - const std::shared_ptr value_type() const { return value_type_; } + const std::shared_ptr& value_type() const { return value_type_; } /// Permutation mapping from logical to physical memory layout of tensor elements const std::vector& permutation() const { return permutation_; } From 294479f1ac67a7b4ac2fac9b3927fd5ce2e20454 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 29 Nov 2023 00:15:12 +0100 Subject: [PATCH 17/62] Update cpp/src/arrow/extension/variable_shape_tensor.cc Co-authored-by: Antoine Pitrou --- cpp/src/arrow/extension/variable_shape_tensor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 1343e3538f6..a01ccbfb98b 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -197,8 +197,8 @@ std::shared_ptr VariableShapeTensorType::MakeArray( Result> VariableShapeTensorType::GetTensor( const std::shared_ptr& scalar) const { - const auto tensor_scalar = internal::checked_pointer_cast(scalar->value); - const auto fields = this->storage_type()->fields(); + const auto& tensor_scalar = internal::checked_cast(*scalar->value); + const auto& fields = this->storage_type()->fields(); ARROW_ASSIGN_OR_RAISE(const auto shape_scalar, tensor_scalar->field(0)); ARROW_ASSIGN_OR_RAISE(const auto data, tensor_scalar->field(1)); From 8198d23d762f777096f11d6a37acc8defec9ed9e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 29 Nov 2023 00:16:20 +0100 Subject: [PATCH 18/62] Update cpp/src/arrow/extension/variable_shape_tensor.cc Co-authored-by: Antoine Pitrou --- cpp/src/arrow/extension/variable_shape_tensor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index a01ccbfb98b..2c9ff463f2f 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -217,7 +217,7 @@ Result> VariableShapeTensorType::GetTensor( ARROW_CHECK_OK( internal::ComputeStrides(this->value_type(), shape, this->permutation(), &strides)); - const auto array = std::static_pointer_cast(data)->value; + const auto& array = checked_cast(*data)->value; const auto byte_width = this->value_type()->byte_width(); const auto start_position = array->offset() * byte_width; const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), From ee7b50d9641a0180c28a9d19d48051ffd57ed864 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 29 Nov 2023 00:17:29 +0100 Subject: [PATCH 19/62] Update cpp/src/arrow/extension/variable_shape_tensor.cc Co-authored-by: Antoine Pitrou --- cpp/src/arrow/extension/variable_shape_tensor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 2c9ff463f2f..24a656ba8fd 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -227,7 +227,7 @@ Result> VariableShapeTensorType::GetTensor( std::shared_ptr buffer = arrow::SliceBuffer(array->data()->buffers[1], start_position, size * byte_width); - return Tensor::Make(this->value_type(), buffer, shape, strides, this->dim_names()); + return Tensor::Make(this->value_type(), std::move(buffer), std::move(shape), std::move(strides), this->dim_names()); } Result> VariableShapeTensorType::Make( From bb42481242b8eeaa5c7e91f49a2c45530611a20c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 29 Nov 2023 00:19:39 +0100 Subject: [PATCH 20/62] Update cpp/src/arrow/extension/variable_shape_tensor.cc Co-authored-by: Antoine Pitrou --- cpp/src/arrow/extension/variable_shape_tensor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 24a656ba8fd..925f0fe7ce1 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -255,7 +255,7 @@ std::shared_ptr variable_shape_tensor( const std::vector>& uniform_shape) { auto maybe_type = VariableShapeTensorType::Make(value_type, ndim, permutation, dim_names, uniform_shape); - ARROW_DCHECK_OK(maybe_type.status()); + ARROW_CHECK_OK(maybe_type.status()); return maybe_type.MoveValueUnsafe(); } From d30cc8028e2b5a86bd1b4fe3cc4ff386f12adde9 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 29 Nov 2023 00:20:51 +0100 Subject: [PATCH 21/62] Review feedback --- .../extension/tensor_extension_array_test.cc | 2 +- cpp/src/arrow/extension/tensor_internal.h | 47 ++++++++++++++++++- .../arrow/extension/variable_shape_tensor.cc | 14 +++--- .../arrow/extension/variable_shape_tensor.h | 20 ++++---- 4 files changed, 62 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index bb13b4b6182..104d429b4a0 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -538,7 +538,7 @@ TEST_F(TestVariableShapeTensorType, CreateExtensionType) { ASSERT_EQ(ext_type_->extension_name(), "arrow.variable_shape_tensor"); ASSERT_TRUE(ext_type_->Equals(*exact_ext_type)); auto expected_type = struct_({ - ::arrow::field("shape", fixed_size_list(uint32(), ndim_)), + ::arrow::field("shape", fixed_size_list(int32(), ndim_)), ::arrow::field("data", list(value_type_)), }); diff --git a/cpp/src/arrow/extension/tensor_internal.h b/cpp/src/arrow/extension/tensor_internal.h index 62b1dba6144..a9583991db8 100644 --- a/cpp/src/arrow/extension/tensor_internal.h +++ b/cpp/src/arrow/extension/tensor_internal.h @@ -16,9 +16,12 @@ // under the License. #pragma once +#include "arrow/extension/tensor_internal.h" -#include -#include +#include "arrow/tensor.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/int_util_overflow.h" +#include "arrow/util/sort_internal.h" #include "arrow/status.h" #include "arrow/util/print_internal.h" @@ -41,4 +44,44 @@ inline Status IsPermutationValid(const std::vector& permutation) { return Status::OK(); } +inline Status ComputeStrides(const std::shared_ptr& value_type, + const std::vector& shape, + const std::vector& permutation, + std::vector* strides) { + auto fixed_width_type = internal::checked_pointer_cast(value_type); + if (permutation.empty()) { + return internal::ComputeRowMajorStrides(*fixed_width_type.get(), shape, strides); + } + const int byte_width = value_type->byte_width(); + + int64_t remaining = 0; + if (!shape.empty() && shape.front() > 0) { + remaining = byte_width; + for (auto i : permutation) { + if (i > 0) { + if (internal::MultiplyWithOverflow(remaining, shape[i], &remaining)) { + return Status::Invalid( + "Strides computed from shape would not fit in 64-bit integer"); + } + } + } + } + + if (remaining == 0) { + strides->assign(shape.size(), byte_width); + return Status::OK(); + } + + strides->push_back(remaining); + for (auto i : permutation) { + if (i > 0) { + remaining /= shape[i]; + strides->push_back(remaining); + } + } + internal::Permute(permutation, strides); + + return Status::OK(); +} + } // namespace arrow::internal diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 925f0fe7ce1..a0ee6958d46 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -17,7 +17,7 @@ #include -#include "arrow/extension/fixed_shape_tensor.h" +#include "arrow/extension/tensor_internal.h" #include "arrow/extension/variable_shape_tensor.h" #include "arrow/array/array_nested.h" @@ -191,17 +191,16 @@ std::shared_ptr VariableShapeTensorType::MakeArray( std::shared_ptr data) const { DCHECK_EQ(data->type->id(), Type::EXTENSION); DCHECK_EQ("arrow.variable_shape_tensor", - static_cast(*data->type).extension_name()); + internal::checked_cast(*data->type).extension_name()); return std::make_shared(data); } Result> VariableShapeTensorType::GetTensor( const std::shared_ptr& scalar) const { const auto& tensor_scalar = internal::checked_cast(*scalar->value); - const auto& fields = this->storage_type()->fields(); - ARROW_ASSIGN_OR_RAISE(const auto shape_scalar, tensor_scalar->field(0)); - ARROW_ASSIGN_OR_RAISE(const auto data, tensor_scalar->field(1)); + ARROW_ASSIGN_OR_RAISE(const auto shape_scalar, tensor_scalar.field(0)); + ARROW_ASSIGN_OR_RAISE(const auto data, tensor_scalar.field(1)); const auto shape_array = std::static_pointer_cast(shape_scalar)->value; @@ -217,7 +216,7 @@ Result> VariableShapeTensorType::GetTensor( ARROW_CHECK_OK( internal::ComputeStrides(this->value_type(), shape, this->permutation(), &strides)); - const auto& array = checked_cast(*data)->value; + const auto& array = internal::checked_cast(*data).value; const auto byte_width = this->value_type()->byte_width(); const auto start_position = array->offset() * byte_width; const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), @@ -227,7 +226,8 @@ Result> VariableShapeTensorType::GetTensor( std::shared_ptr buffer = arrow::SliceBuffer(array->data()->buffers[1], start_position, size * byte_width); - return Tensor::Make(this->value_type(), std::move(buffer), std::move(shape), std::move(strides), this->dim_names()); + return Tensor::Make(this->value_type(), std::move(buffer), std::move(shape), + std::move(strides), this->dim_names()); } Result> VariableShapeTensorType::Make( diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index 8bb1e197a87..477a8bfbbf1 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -40,14 +40,14 @@ class ARROW_EXPORT VariableShapeTensorArray : public ExtensionArray { /// See: https://arrow.apache.org/docs/format/CanonicalExtensions.html class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { public: - VariableShapeTensorType(const std::shared_ptr& value_type, - const uint32_t& ndim, - const std::vector& permutation = {}, - const std::vector& dim_names = {}, - const std::vector>& uniform_shape = {}) - : ExtensionType(struct_({::arrow::field("shape", fixed_size_list(uint32(), ndim)), + VariableShapeTensorType(const std::shared_ptr& value_type, const int32_t ndim, + const std::vector permutation = {}, + const std::vector dim_names = {}, + const std::vector> uniform_shape = {}) + : ExtensionType(struct_({::arrow::field("shape", fixed_size_list(int32(), ndim)), ::arrow::field("data", list(value_type))})), - value_type_(value_type), + value_type_(std::move(value_type)), + ndim_(ndim), permutation_(permutation), dim_names_(dim_names), uniform_shape_(uniform_shape) {} @@ -55,10 +55,7 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { std::string extension_name() const override { return "arrow.variable_shape_tensor"; } /// Number of dimensions of tensor elements - uint32_t ndim() const { - std::shared_ptr storage_type = this->storage_type()->field(0)->type(); - return std::static_pointer_cast(storage_type)->list_size(); - } + uint32_t ndim() const { return ndim_; } /// Value type of tensor elements const std::shared_ptr& value_type() const { return value_type_; } @@ -99,6 +96,7 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { private: std::shared_ptr storage_type_; std::shared_ptr value_type_; + int32_t ndim_; std::vector permutation_; std::vector dim_names_; std::vector> uniform_shape_; From 1264a3268f4b68c0e3cc7a159f32010aa1ba2a2c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 29 Nov 2023 14:28:27 +0100 Subject: [PATCH 22/62] Update cpp/src/arrow/extension/variable_shape_tensor.cc Co-authored-by: Antoine Pitrou --- cpp/src/arrow/extension/variable_shape_tensor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index a0ee6958d46..e3f0d682b4a 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -201,8 +201,8 @@ Result> VariableShapeTensorType::GetTensor( ARROW_ASSIGN_OR_RAISE(const auto shape_scalar, tensor_scalar.field(0)); ARROW_ASSIGN_OR_RAISE(const auto data, tensor_scalar.field(1)); - const auto shape_array = - std::static_pointer_cast(shape_scalar)->value; + const auto& shape_array = + checked_cast(*checked_cast(*shape_scalar)->value); std::vector shape; for (uint32_t j = 0; j < this->ndim(); ++j) { From 0c66c312c8c33ec185278389d23a978ad8ee3fe1 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 29 Nov 2023 14:30:41 +0100 Subject: [PATCH 23/62] Review feedback --- .../arrow/extension/variable_shape_tensor.cc | 76 ++++++++++++------- .../arrow/extension/variable_shape_tensor.h | 28 +++---- 2 files changed, 62 insertions(+), 42 deletions(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index e3f0d682b4a..2e46e980c2a 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -50,8 +50,7 @@ const Result> VariableShapeTensorArray::GetTensor( std::vector shape; for (int64_t j = 0; j < ndim; ++j) { ARROW_ASSIGN_OR_RAISE(auto size, shapes->GetScalar(j)); - shape.push_back( - static_cast(std::static_pointer_cast(size)->value)); + shape.push_back(std::static_pointer_cast(size)->value); } std::vector strides; @@ -59,11 +58,11 @@ const Result> VariableShapeTensorArray::GetTensor( ARROW_CHECK_OK( internal::ComputeStrides(value_type, shape, ext_type->permutation(), &strides)); - auto list_arr = + const auto list_arr = std::static_pointer_cast(ext_arr->field(1))->value_slice(i)->data(); - auto byte_width = value_type->byte_width(); - auto buffer = SliceBuffer(list_arr->buffers[1], list_arr->offset * byte_width, - list_arr->length * byte_width); + const auto byte_width = value_type->byte_width(); + const auto buffer = SliceBuffer(list_arr->buffers[1], list_arr->offset * byte_width, + list_arr->length * byte_width); return Tensor::Make(value_type, buffer, shape, strides, dim_names); } @@ -140,7 +139,16 @@ Result> VariableShapeTensorType::Deserialize( return Status::Invalid("Expected Struct storage type, got ", storage_type->ToString()); } - auto value_type = storage_type->field(1)->type()->field(0)->type(); + + ARROW_DCHECK_EQ(storage_type->num_fields(), 2); + ARROW_DCHECK_EQ(storage_type->field(0)->type()->id(), Type::FIXED_SIZE_LIST); + ARROW_DCHECK_EQ( + std::static_pointer_cast(storage_type->field(0)->type()) + ->value_type(), + int32()); + ARROW_DCHECK_EQ(storage_type->field(1)->type()->id(), Type::LIST); + + const auto value_type = storage_type->field(1)->type()->field(0)->type(); const size_t ndim = std::static_pointer_cast(storage_type->field(0)->type()) ->list_size(); @@ -178,12 +186,13 @@ Result> VariableShapeTensorType::Deserialize( uniform_shape.emplace_back(x.GetInt64()); } } - if (uniform_shape.size() > ndim) { - return Status::Invalid("Invalid uniform_shape"); + if (uniform_shape.size() != ndim) { + return Status::Invalid("uniform_shape size must match ndim. Expected: ", ndim, + " Got: ", uniform_shape.size()); } } - return variable_shape_tensor(value_type, static_cast(ndim), permutation, + return variable_shape_tensor(value_type, static_cast(ndim), permutation, dim_names, uniform_shape); } @@ -205,10 +214,11 @@ Result> VariableShapeTensorType::GetTensor( checked_cast(*checked_cast(*shape_scalar)->value); std::vector shape; - for (uint32_t j = 0; j < this->ndim(); ++j) { - ARROW_ASSIGN_OR_RAISE(auto size, shape_array->GetScalar(j)); - shape.push_back( - static_cast(std::static_pointer_cast(size)->value)); + for (int32_t j = 0; j < this->ndim(); ++j) { + ARROW_ASSIGN_OR_RAISE(const auto size, shape_array->GetScalar(j)); + const auto size_value = internal::checked_pointer_cast(size)->value; + ARROW_DCHECK_GE(size_value, 0); + shape.push_back(size_value); } // TODO: optimize ComputeStrides for non-uniform tensors @@ -231,30 +241,40 @@ Result> VariableShapeTensorType::GetTensor( } Result> VariableShapeTensorType::Make( - const std::shared_ptr& value_type, const uint32_t& ndim, - const std::vector& permutation, const std::vector& dim_names, - const std::vector>& uniform_shape) { - if (!permutation.empty() && permutation.size() != ndim) { + const std::shared_ptr& value_type, const int32_t ndim, + const std::vector permutation, const std::vector dim_names, + const std::vector> uniform_shape) { + if (!permutation.empty() && permutation.size() != static_cast(ndim)) { return Status::Invalid("permutation size must match ndim. Expected: ", ndim, " Got: ", permutation.size()); } - if (!dim_names.empty() && dim_names.size() != ndim) { + if (!dim_names.empty() && dim_names.size() != static_cast(ndim)) { return Status::Invalid("dim_names size must match ndim. Expected: ", ndim, " Got: ", dim_names.size()); } - if (uniform_shape.size() > ndim) { - return Status::Invalid("uniform_shape size must be less or equal ndim."); + if (!uniform_shape.empty() && uniform_shape.size() != static_cast(ndim)) { + return Status::Invalid("uniform_shape size must match ndim. Expected: ", ndim, + " Got: ", uniform_shape.size()); + } + if (!uniform_shape.empty()) { + for (const auto& v : uniform_shape) { + if (v.has_value() && v.value() < 0) { + return Status::Invalid("uniform_shape must have non-negative values"); + } + } } - return std::make_shared(value_type, ndim, permutation, - dim_names, uniform_shape); + return std::make_shared( + value_type, std::move(ndim), std::move(permutation), std::move(dim_names), + std::move(uniform_shape)); } std::shared_ptr variable_shape_tensor( - const std::shared_ptr& value_type, const uint32_t& ndim, - const std::vector& permutation, const std::vector& dim_names, - const std::vector>& uniform_shape) { - auto maybe_type = VariableShapeTensorType::Make(value_type, ndim, permutation, - dim_names, uniform_shape); + const std::shared_ptr& value_type, const int32_t ndim, + const std::vector permutation, const std::vector dim_names, + const std::vector> uniform_shape) { + auto maybe_type = + VariableShapeTensorType::Make(value_type, std::move(ndim), std::move(permutation), + std::move(dim_names), std::move(uniform_shape)); ARROW_CHECK_OK(maybe_type.status()); return maybe_type.MoveValueUnsafe(); } diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index 477a8bfbbf1..9c34c8a0a9c 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -46,16 +46,16 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const std::vector> uniform_shape = {}) : ExtensionType(struct_({::arrow::field("shape", fixed_size_list(int32(), ndim)), ::arrow::field("data", list(value_type))})), - value_type_(std::move(value_type)), - ndim_(ndim), - permutation_(permutation), - dim_names_(dim_names), - uniform_shape_(uniform_shape) {} + value_type_(value_type), + ndim_(std::move(ndim)), + permutation_(std::move(permutation)), + dim_names_(std::move(dim_names)), + uniform_shape_(std::move(uniform_shape)) {} std::string extension_name() const override { return "arrow.variable_shape_tensor"; } /// Number of dimensions of tensor elements - uint32_t ndim() const { return ndim_; } + int32_t ndim() const { return ndim_; } /// Value type of tensor elements const std::shared_ptr& value_type() const { return value_type_; } @@ -88,10 +88,10 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { /// \brief Create a VariableShapeTensorType instance static Result> Make( - const std::shared_ptr& value_type, const uint32_t& ndim, - const std::vector& permutation = {}, - const std::vector& dim_names = {}, - const std::vector>& uniform_shape = {}); + const std::shared_ptr& value_type, const int32_t ndim, + const std::vector permutation = {}, + const std::vector dim_names = {}, + const std::vector> uniform_shape = {}); private: std::shared_ptr storage_type_; @@ -104,10 +104,10 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { /// \brief Return a VariableShapeTensorType instance. ARROW_EXPORT std::shared_ptr variable_shape_tensor( - const std::shared_ptr& value_type, const uint32_t& ndim, - const std::vector& permutation = {}, - const std::vector& dim_names = {}, - const std::vector>& uniform_shape = {}); + const std::shared_ptr& value_type, const int32_t ndim, + const std::vector permutation = {}, + const std::vector dim_names = {}, + const std::vector> uniform_shape = {}); } // namespace extension } // namespace arrow From d6dde83434805624186fe40ceb5d3b94a5d50138 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 29 Nov 2023 14:48:58 +0100 Subject: [PATCH 24/62] import and uint32->int32 --- cpp/src/arrow/extension/tensor_extension_array_test.cc | 7 +++---- cpp/src/arrow/extension/variable_shape_tensor.cc | 10 +++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 104d429b4a0..912b7e77910 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -489,14 +489,13 @@ class TestVariableShapeTensorType : public ::testing::Test { ndim_ = 3; value_type_ = int64(); data_type_ = list(value_type_); - shape_type_ = fixed_size_list(uint32(), ndim_); + shape_type_ = fixed_size_list(int32(), ndim_); permutation_ = {0, 1, 2}; dim_names_ = {"x", "y", "z"}; uniform_shape_ = {std::nullopt, std::optional(1), std::nullopt}; ext_type_ = internal::checked_pointer_cast(variable_shape_tensor( value_type_, ndim_, permutation_, dim_names_, uniform_shape_)); - shapes_ = - ArrayFromJSON(fixed_size_list(uint32(), ndim_), "[[2,1,3],[2,1,2],[3,1,3]]"); + shapes_ = ArrayFromJSON(fixed_size_list(int32(), ndim_), "[[2,1,3],[2,1,2],[3,1,3]]"); data_ = ArrayFromJSON(list(value_type_), "[[0,1,2,3,4,5],[6,7,8,9],[10,11,12,13,14,15,16,17,18]]"); serialized_ = @@ -509,7 +508,7 @@ class TestVariableShapeTensorType : public ::testing::Test { } protected: - uint32_t ndim_; + int32_t ndim_; std::shared_ptr value_type_; std::shared_ptr data_type_; std::shared_ptr shape_type_; diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 2e46e980c2a..3bc73c469ad 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -50,7 +50,7 @@ const Result> VariableShapeTensorArray::GetTensor( std::vector shape; for (int64_t j = 0; j < ndim; ++j) { ARROW_ASSIGN_OR_RAISE(auto size, shapes->GetScalar(j)); - shape.push_back(std::static_pointer_cast(size)->value); + shape.push_back(std::static_pointer_cast(size)->value); } std::vector strides; @@ -210,13 +210,13 @@ Result> VariableShapeTensorType::GetTensor( ARROW_ASSIGN_OR_RAISE(const auto shape_scalar, tensor_scalar.field(0)); ARROW_ASSIGN_OR_RAISE(const auto data, tensor_scalar.field(1)); - const auto& shape_array = - checked_cast(*checked_cast(*shape_scalar)->value); + const auto& shape_array = internal::checked_cast( + *internal::checked_cast(*shape_scalar).value); std::vector shape; for (int32_t j = 0; j < this->ndim(); ++j) { - ARROW_ASSIGN_OR_RAISE(const auto size, shape_array->GetScalar(j)); - const auto size_value = internal::checked_pointer_cast(size)->value; + ARROW_ASSIGN_OR_RAISE(const auto size, shape_array.GetScalar(j)); + const auto size_value = internal::checked_pointer_cast(size)->value; ARROW_DCHECK_GE(size_value, 0); shape.push_back(size_value); } From 214ab1f20488dba7d8fb1f4668d4397b0509f9ea Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 29 Nov 2023 15:10:03 +0100 Subject: [PATCH 25/62] permutation check --- .../arrow/extension/variable_shape_tensor.cc | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 3bc73c469ad..e8b619fc92c 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -16,6 +16,7 @@ // under the License. #include +#include #include "arrow/extension/tensor_internal.h" #include "arrow/extension/variable_shape_tensor.h" @@ -244,10 +245,22 @@ Result> VariableShapeTensorType::Make( const std::shared_ptr& value_type, const int32_t ndim, const std::vector permutation, const std::vector dim_names, const std::vector> uniform_shape) { - if (!permutation.empty() && permutation.size() != static_cast(ndim)) { - return Status::Invalid("permutation size must match ndim. Expected: ", ndim, - " Got: ", permutation.size()); + if (!permutation.empty()) { + if (permutation.size() != static_cast(ndim)) { + return Status::Invalid("permutation size must match ndim. Expected: ", ndim, + " Got: ", permutation.size()); + } + const std::set permutation_set(permutation.begin(), permutation.end()); + if (permutation_set.size() != permutation.size()) { + return Status::Invalid("permutation must be a valid permutation vector"); + } + for (auto p : permutation) { + if (p < 0 || ndim <= p) { + return Status::Invalid("permutation must be a valid permutation vector"); + } + } } + if (!dim_names.empty() && dim_names.size() != static_cast(ndim)) { return Status::Invalid("dim_names size must match ndim. Expected: ", ndim, " Got: ", dim_names.size()); From 6d4ef16934f0eaffa2fcf55a6199c4bf78dbadff Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 29 Nov 2023 15:28:10 +0100 Subject: [PATCH 26/62] Remove serialization from cython, lint --- cpp/src/arrow/extension/variable_shape_tensor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index e8b619fc92c..1d79f0e48dd 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -#include #include +#include #include "arrow/extension/tensor_internal.h" #include "arrow/extension/variable_shape_tensor.h" From d3560ea1955d49b38a07307445800288017fcccf Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 30 Nov 2023 01:44:10 +0100 Subject: [PATCH 27/62] Review feedback --- .../arrow/extension/variable_shape_tensor.cc | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 1d79f0e48dd..819ae01edeb 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -141,13 +141,23 @@ Result> VariableShapeTensorType::Deserialize( storage_type->ToString()); } - ARROW_DCHECK_EQ(storage_type->num_fields(), 2); - ARROW_DCHECK_EQ(storage_type->field(0)->type()->id(), Type::FIXED_SIZE_LIST); - ARROW_DCHECK_EQ( - std::static_pointer_cast(storage_type->field(0)->type()) - ->value_type(), - int32()); - ARROW_DCHECK_EQ(storage_type->field(1)->type()->id(), Type::LIST); + if (storage_type->num_fields() != 2) { + return Status::Invalid("Expected Struct storage type with 2 fields, got ", + storage_type->num_fields()); + } + if (storage_type->field(0)->type()->id() != Type::FIXED_SIZE_LIST) { + return Status::Invalid("Expected FixedSizeList storage type, got ", + storage_type->field(0)->type()->ToString()); + } + if (storage_type->field(1)->type()->id() != Type::LIST) { + return Status::Invalid("Expected List storage type, got ", + storage_type->field(1)->type()->ToString()); + } + if (std::static_pointer_cast(storage_type->field(0)->type()) + ->value_type() != int32()) { + return Status::Invalid("Expected FixedSizeList value type int32, got ", + storage_type->field(0)->type()->ToString()); + } const auto value_type = storage_type->field(1)->type()->field(0)->type(); const size_t ndim = @@ -218,7 +228,9 @@ Result> VariableShapeTensorType::GetTensor( for (int32_t j = 0; j < this->ndim(); ++j) { ARROW_ASSIGN_OR_RAISE(const auto size, shape_array.GetScalar(j)); const auto size_value = internal::checked_pointer_cast(size)->value; - ARROW_DCHECK_GE(size_value, 0); + if (size_value < 0) { + return Status::Invalid("shape must have non-negative values"); + } shape.push_back(size_value); } From 2ab1f1739f816970ee56ce49415d0b6eecb883ef Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 30 Nov 2023 01:58:37 +0100 Subject: [PATCH 28/62] ndim initializer --- cpp/src/arrow/extension/tensor_extension_array_test.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 912b7e77910..90da0658f98 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -591,9 +591,10 @@ TEST_F(TestVariableShapeTensorType, MetadataSerializationRoundtrip) { using T = VariableShapeTensorType; CheckSerializationRoundtrip(ext_type_); - CheckSerializationRoundtrip(variable_shape_tensor(value_type_, {}, {}, {})); - CheckSerializationRoundtrip(variable_shape_tensor(value_type_, {0}, {}, {})); - CheckSerializationRoundtrip(variable_shape_tensor(value_type_, {1}, {0}, {"x"})); + CheckSerializationRoundtrip( + variable_shape_tensor(value_type_, 3, {1, 2, 0}, {"x", "y", "z"})); + CheckSerializationRoundtrip(variable_shape_tensor(value_type_, 0, {}, {})); + CheckSerializationRoundtrip(variable_shape_tensor(value_type_, 1, {0}, {"x"})); CheckSerializationRoundtrip( variable_shape_tensor(value_type_, 3, {0, 1, 2}, {"H", "W", "C"})); CheckSerializationRoundtrip( From 43b08c21b9e2a847132ae0dbb081defcf74d22ee Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 30 Nov 2023 05:03:34 +0100 Subject: [PATCH 29/62] Test null values --- .../extension/tensor_extension_array_test.cc | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 90da0658f98..8d8e5c1e646 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -637,9 +637,9 @@ TEST_F(TestVariableShapeTensorType, RoudtripBatch) { } TEST_F(TestVariableShapeTensorType, ComputeStrides) { - auto shapes = ArrayFromJSON(shape_type_, "[[2,3,1],[2,1,2],[3,1,3]]"); - auto data = - ArrayFromJSON(data_type_, "[[1,1,2,3,4,5],[2,7,8,9],[10,11,12,13,14,15,16,17,18]]"); + auto shapes = ArrayFromJSON(shape_type_, "[[2,3,1],[2,1,2],[3,1,3],null]"); + auto data = ArrayFromJSON( + data_type_, "[[1,1,2,3,4,5],[2,7,8,9],[10,11,12,13,14,15,16,17,18],null]"); std::vector> fields = {field("shapes", shape_type_), field("data", data_type_)}; ASSERT_OK_AND_ASSIGN(auto storage_arr, StructArray::Make({shapes, data}, fields)); @@ -697,6 +697,25 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { ASSERT_EQ(tensor->is_contiguous(), t->is_contiguous()); ASSERT_EQ(tensor->is_column_major(), t->is_column_major()); ASSERT_TRUE(tensor->Equals(*t)); + + // Null value in VariableShapeTensorArray produces a tensor with shape {0, 0, 0} + shape = {0, 0, 0}; + strides = {sizeof(int64_t), sizeof(int64_t), sizeof(int64_t)}; + values = {}; + data_buffer = Buffer::Wrap(values); + ASSERT_OK_AND_ASSIGN(tensor, + Tensor::Make(int64(), data_buffer, shape, strides, dim_names_)); + + ASSERT_OK_AND_ASSIGN(sc, ext_arr->GetScalar(3)); + ASSERT_OK_AND_ASSIGN( + t, exact_ext_type->GetTensor(internal::checked_pointer_cast(sc))); + ASSERT_EQ(tensor->strides(), t->strides()); + ASSERT_EQ(tensor->shape(), t->shape()); + ASSERT_EQ(tensor->dim_names(), t->dim_names()); + ASSERT_EQ(tensor->type(), t->type()); + ASSERT_EQ(tensor->is_contiguous(), t->is_contiguous()); + ASSERT_EQ(tensor->is_column_major(), t->is_column_major()); + ASSERT_TRUE(tensor->Equals(*t)); } } // namespace arrow From 58d5828184ab4133bf14242035044f891ae982b5 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 2 Dec 2023 06:06:38 +0100 Subject: [PATCH 30/62] Remove one GetTensor code paths, permutation handling --- .../arrow/extension/variable_shape_tensor.cc | 56 +++++++++---------- 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 819ae01edeb..76149630d09 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -41,31 +41,11 @@ namespace extension { const Result> VariableShapeTensorArray::GetTensor( const int64_t i) const { auto ext_arr = internal::checked_pointer_cast(this->storage()); - auto ext_type = internal::checked_pointer_cast(this->type()); - auto value_type = ext_type->value_type(); - auto ndim = ext_type->ndim(); - auto dim_names = ext_type->dim_names(); - auto shapes = - std::static_pointer_cast(ext_arr->field(0))->value_slice(i); - - std::vector shape; - for (int64_t j = 0; j < ndim; ++j) { - ARROW_ASSIGN_OR_RAISE(auto size, shapes->GetScalar(j)); - shape.push_back(std::static_pointer_cast(size)->value); - } - - std::vector strides; - // TODO: optimize ComputeStrides for non-uniform tensors - ARROW_CHECK_OK( - internal::ComputeStrides(value_type, shape, ext_type->permutation(), &strides)); - - const auto list_arr = - std::static_pointer_cast(ext_arr->field(1))->value_slice(i)->data(); - const auto byte_width = value_type->byte_width(); - const auto buffer = SliceBuffer(list_arr->buffers[1], list_arr->offset * byte_width, - list_arr->length * byte_width); - - return Tensor::Make(value_type, buffer, shape, strides, dim_names); + const auto ext_type = + internal::checked_pointer_cast(this->type()); + ARROW_ASSIGN_OR_RAISE(const auto tensor_scalar, this->GetScalar(i)); + return ext_type->GetTensor( + internal::checked_pointer_cast(tensor_scalar)); } bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const { @@ -224,8 +204,15 @@ Result> VariableShapeTensorType::GetTensor( const auto& shape_array = internal::checked_cast( *internal::checked_cast(*shape_scalar).value); + auto permutation = this->permutation(); + if (permutation.empty()) { + for (int64_t j = 0; j < static_cast(this->ndim()); ++j) { + permutation.emplace_back(j); + } + } + std::vector shape; - for (int32_t j = 0; j < this->ndim(); ++j) { + for (int64_t j : permutation) { ARROW_ASSIGN_OR_RAISE(const auto size, shape_array.GetScalar(j)); const auto size_value = internal::checked_pointer_cast(size)->value; if (size_value < 0) { @@ -234,7 +221,15 @@ Result> VariableShapeTensorType::GetTensor( shape.push_back(size_value); } - // TODO: optimize ComputeStrides for non-uniform tensors + std::vector dim_names; + if (!this->dim_names().empty()) { + for (auto j : permutation) { + dim_names.emplace_back(this->dim_names()[j]); + } + } else { + dim_names = {}; + } + std::vector strides; ARROW_CHECK_OK( internal::ComputeStrides(this->value_type(), shape, this->permutation(), &strides)); @@ -246,11 +241,10 @@ Result> VariableShapeTensorType::GetTensor( std::multiplies<>()); // Create a slice of the buffer - std::shared_ptr buffer = - arrow::SliceBuffer(array->data()->buffers[1], start_position, size * byte_width); + const std::shared_ptr buffer = + SliceBuffer(array->data()->buffers[1], start_position, size * byte_width); - return Tensor::Make(this->value_type(), std::move(buffer), std::move(shape), - std::move(strides), this->dim_names()); + return Tensor::Make(this->value_type(), buffer, shape, strides, this->dim_names()); } Result> VariableShapeTensorType::Make( From b5dd4f0c6639877071b7dcfce5b336994438ec34 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sun, 3 Dec 2023 04:30:26 +0100 Subject: [PATCH 31/62] Allow arbitrary memory layout --- cpp/src/arrow/extension/variable_shape_tensor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 76149630d09..9c3880539cc 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -232,7 +232,7 @@ Result> VariableShapeTensorType::GetTensor( std::vector strides; ARROW_CHECK_OK( - internal::ComputeStrides(this->value_type(), shape, this->permutation(), &strides)); + internal::ComputeStrides(this->value_type(), shape, permutation, &strides)); const auto& array = internal::checked_cast(*data).value; const auto byte_width = this->value_type()->byte_width(); From 3137e2dc645f39776f3a6347d88eb94a005d8e6f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sun, 3 Dec 2023 04:50:31 +0100 Subject: [PATCH 32/62] fix permutation check --- cpp/src/arrow/extension/variable_shape_tensor.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 9c3880539cc..95f397ca6a6 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -212,7 +212,8 @@ Result> VariableShapeTensorType::GetTensor( } std::vector shape; - for (int64_t j : permutation) { + for (int64_t j = 0; j < static_cast(this->ndim()); ++j) { + // for (int64_t j : permutation) { ARROW_ASSIGN_OR_RAISE(const auto size, shape_array.GetScalar(j)); const auto size_value = internal::checked_pointer_cast(size)->value; if (size_value < 0) { From b6b00fade0a510eec25e2fffa42c2aad63351f57 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sun, 3 Dec 2023 18:17:39 +0100 Subject: [PATCH 33/62] lint --- cpp/src/arrow/extension/variable_shape_tensor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 95f397ca6a6..f6f7dde26ac 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -257,7 +257,7 @@ Result> VariableShapeTensorType::Make( return Status::Invalid("permutation size must match ndim. Expected: ", ndim, " Got: ", permutation.size()); } - const std::set permutation_set(permutation.begin(), permutation.end()); + const std::set permutation_set(permutation.begin(), permutation.end()); if (permutation_set.size() != permutation.size()) { return Status::Invalid("permutation must be a valid permutation vector"); } From b46217f9dd6c1e8b51bfbc19f41e528204085966 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sun, 3 Dec 2023 23:15:23 +0100 Subject: [PATCH 34/62] lint --- cpp/src/arrow/extension/variable_shape_tensor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index f6f7dde26ac..852f4eb6706 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -213,7 +213,7 @@ Result> VariableShapeTensorType::GetTensor( std::vector shape; for (int64_t j = 0; j < static_cast(this->ndim()); ++j) { - // for (int64_t j : permutation) { + // for (int64_t j : permutation) { ARROW_ASSIGN_OR_RAISE(const auto size, shape_array.GetScalar(j)); const auto size_value = internal::checked_pointer_cast(size)->value; if (size_value < 0) { From b7030e26ee6efa6e186c2054ed5928c1c52b5610 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 4 Dec 2023 18:04:27 +0100 Subject: [PATCH 35/62] roundtrip strided --- cpp/src/arrow/extension/variable_shape_tensor.cc | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 852f4eb6706..aa407ccd349 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -212,14 +212,19 @@ Result> VariableShapeTensorType::GetTensor( } std::vector shape; + std::vector permuted_shape; for (int64_t j = 0; j < static_cast(this->ndim()); ++j) { - // for (int64_t j : permutation) { ARROW_ASSIGN_OR_RAISE(const auto size, shape_array.GetScalar(j)); const auto size_value = internal::checked_pointer_cast(size)->value; if (size_value < 0) { return Status::Invalid("shape must have non-negative values"); } + ARROW_ASSIGN_OR_RAISE(const auto permuted_size, + shape_array.GetScalar(permutation[j])); + const auto permuted_size_value = + internal::checked_pointer_cast(permuted_size)->value; shape.push_back(size_value); + permuted_shape.push_back(permuted_size_value); } std::vector dim_names; @@ -232,8 +237,8 @@ Result> VariableShapeTensorType::GetTensor( } std::vector strides; - ARROW_CHECK_OK( - internal::ComputeStrides(this->value_type(), shape, permutation, &strides)); + ARROW_CHECK_OK(internal::ComputeStrides(this->value_type(), permuted_shape, permutation, + &strides)); const auto& array = internal::checked_cast(*data).value; const auto byte_width = this->value_type()->byte_width(); From b2a1c34b2bd8dfe0b7a79a6522e9debf39f80ba8 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 13 Dec 2023 22:23:21 +0100 Subject: [PATCH 36/62] Apply suggestions from code review Co-authored-by: Antoine Pitrou Co-authored-by: Joris Van den Bossche --- cpp/src/arrow/extension/variable_shape_tensor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index aa407ccd349..5a2845470ae 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -250,7 +250,7 @@ Result> VariableShapeTensorType::GetTensor( const std::shared_ptr buffer = SliceBuffer(array->data()->buffers[1], start_position, size * byte_width); - return Tensor::Make(this->value_type(), buffer, shape, strides, this->dim_names()); + return Tensor::Make(this->value_type(), std::move(buffer), std::move(shape), std::move(strides), this->dim_names()); } Result> VariableShapeTensorType::Make( From 05c63bc3944997baf86d41575521890d9c96b885 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 13 Dec 2023 23:30:27 +0100 Subject: [PATCH 37/62] remove array.gettensor, simlify --- cpp/src/arrow/acero/CMakeLists.txt | 3 +- cpp/src/arrow/compute/kernels/CMakeLists.txt | 2 + .../extension/tensor_extension_array_test.cc | 17 ++++--- .../arrow/extension/variable_shape_tensor.cc | 51 ++++++++----------- .../arrow/extension/variable_shape_tensor.h | 14 ++--- 5 files changed, 41 insertions(+), 46 deletions(-) diff --git a/cpp/src/arrow/acero/CMakeLists.txt b/cpp/src/arrow/acero/CMakeLists.txt index dc18afa9797..db5ec595e32 100644 --- a/cpp/src/arrow/acero/CMakeLists.txt +++ b/cpp/src/arrow/acero/CMakeLists.txt @@ -121,7 +121,8 @@ endforeach() if(ARROW_TESTING) # test_nodes.cc isn't used by all tests but link to it for simple # CMakeLists.txt. - add_library(arrow_acero_testing OBJECT test_nodes.cc test_util_internal.cc) + add_library(arrow_acero_testing OBJECT test_nodes.cc test_util_internal.cc + ../extension/tensor_extension_array_test.cc) # Even though this is still just an object library we still need to "link" our # dependencies so that include paths are configured correctly target_link_libraries(arrow_acero_testing PRIVATE ${ARROW_ACERO_TEST_LINK_LIBS}) diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 15955b5ef88..92e9a240808 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -23,6 +23,8 @@ arrow_install_all_headers("arrow/compute/kernels") # Define arrow_compute_kernels_testing object library for common test files if(ARROW_TESTING) add_library(arrow_compute_kernels_testing OBJECT test_util_internal.cc) + add_library(arrow_compute_kernels_testing OBJECT test_util.cc + ../../extension/tensor_extension_array_test.cc) # Even though this is still just an object library we still need to "link" our # dependencies so that include paths are configured correctly target_link_libraries(arrow_compute_kernels_testing PUBLIC ${ARROW_GTEST_GMOCK}) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 8d8e5c1e646..a0d688b578c 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -644,11 +644,15 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { field("data", data_type_)}; ASSERT_OK_AND_ASSIGN(auto storage_arr, StructArray::Make({shapes, data}, fields)); auto ext_arr = ExtensionType::WrapArray(ext_type_, storage_arr); + auto exact_ext_type = + internal::checked_pointer_cast(ext_type_); auto ext_array = std::static_pointer_cast(ext_arr); std::shared_ptr t, tensor; - ASSERT_OK_AND_ASSIGN(t, ext_array->GetTensor(0)); + ASSERT_OK_AND_ASSIGN(auto scalar, ext_array->GetScalar(0)); + auto ext_scalar = internal::checked_pointer_cast(scalar); + ASSERT_OK_AND_ASSIGN(t, exact_ext_type->GetTensor(ext_scalar)); ASSERT_EQ(t->shape(), (std::vector{2, 3, 1})); ASSERT_EQ(t->strides(), (std::vector{24, 8, 8})); @@ -661,11 +665,15 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { Tensor::Make(int64(), data_buffer, shape, strides, dim_names_)); ASSERT_TRUE(tensor->Equals(*t)); - ASSERT_OK_AND_ASSIGN(t, ext_array->GetTensor(1)); + ASSERT_OK_AND_ASSIGN(scalar, ext_array->GetScalar(1)); + ext_scalar = internal::checked_pointer_cast(scalar); + ASSERT_OK_AND_ASSIGN(t, exact_ext_type->GetTensor(ext_scalar)); ASSERT_EQ(t->shape(), (std::vector{2, 1, 2})); ASSERT_EQ(t->strides(), (std::vector{16, 16, 8})); - ASSERT_OK_AND_ASSIGN(t, ext_array->GetTensor(2)); + ASSERT_OK_AND_ASSIGN(scalar, ext_array->GetScalar(2)); + ext_scalar = internal::checked_pointer_cast(scalar); + ASSERT_OK_AND_ASSIGN(t, exact_ext_type->GetTensor(ext_scalar)); ASSERT_EQ(t->shape(), (std::vector{3, 1, 3})); ASSERT_EQ(t->strides(), (std::vector{24, 24, 8})); @@ -684,9 +692,6 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { ASSERT_EQ(tensor->is_column_major(), t->is_column_major()); ASSERT_TRUE(tensor->Equals(*t)); - auto exact_ext_type = - internal::checked_pointer_cast(ext_type_); - ASSERT_OK_AND_ASSIGN(auto sc, ext_arr->GetScalar(2)); auto s = internal::checked_pointer_cast(sc); ASSERT_OK_AND_ASSIGN(t, exact_ext_type->GetTensor(s)); diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 5a2845470ae..0584a4fb5a9 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -21,7 +21,6 @@ #include "arrow/extension/tensor_internal.h" #include "arrow/extension/variable_shape_tensor.h" -#include "arrow/array/array_nested.h" #include "arrow/array/array_primitive.h" #include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep #include "arrow/scalar.h" @@ -38,16 +37,6 @@ namespace rj = arrow::rapidjson; namespace arrow { namespace extension { -const Result> VariableShapeTensorArray::GetTensor( - const int64_t i) const { - auto ext_arr = internal::checked_pointer_cast(this->storage()); - const auto ext_type = - internal::checked_pointer_cast(this->type()); - ARROW_ASSIGN_OR_RAISE(const auto tensor_scalar, this->GetScalar(i)); - return ext_type->GetTensor( - internal::checked_pointer_cast(tensor_scalar)); -} - bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const { if (extension_name() != other.extension_name()) { return false; @@ -203,6 +192,17 @@ Result> VariableShapeTensorType::GetTensor( ARROW_ASSIGN_OR_RAISE(const auto data, tensor_scalar.field(1)); const auto& shape_array = internal::checked_cast( *internal::checked_cast(*shape_scalar).value); + const auto& array = internal::checked_cast(*data).value; + + const auto value_type = + internal::checked_pointer_cast(this->value_type()); + + if (!is_fixed_width(*value_type)) { + return Status::Invalid("Cannot convert non-fixed-width values to Tensor."); + } + if (array->null_count() > 0) { + return Status::Invalid("Cannot convert data with nulls values to Tensor."); + } auto permutation = this->permutation(); if (permutation.empty()) { @@ -212,45 +212,36 @@ Result> VariableShapeTensorType::GetTensor( } std::vector shape; - std::vector permuted_shape; for (int64_t j = 0; j < static_cast(this->ndim()); ++j) { ARROW_ASSIGN_OR_RAISE(const auto size, shape_array.GetScalar(j)); const auto size_value = internal::checked_pointer_cast(size)->value; if (size_value < 0) { return Status::Invalid("shape must have non-negative values"); } - ARROW_ASSIGN_OR_RAISE(const auto permuted_size, - shape_array.GetScalar(permutation[j])); - const auto permuted_size_value = - internal::checked_pointer_cast(permuted_size)->value; shape.push_back(size_value); - permuted_shape.push_back(permuted_size_value); } + internal::Permute(permutation, &shape); - std::vector dim_names; - if (!this->dim_names().empty()) { - for (auto j : permutation) { - dim_names.emplace_back(this->dim_names()[j]); - } - } else { - dim_names = {}; + std::vector dim_names = this->dim_names(); + if (!dim_names.empty()) { + internal::Permute(permutation, &dim_names); } std::vector strides; - ARROW_CHECK_OK(internal::ComputeStrides(this->value_type(), permuted_shape, permutation, - &strides)); + // ARROW_CHECK_OK(ComputeStrides(*value_type.get(), shape, permutation, &strides)); + ARROW_CHECK_OK(internal::ComputeStrides(value_type, shape, permutation, &strides)); - const auto& array = internal::checked_cast(*data).value; - const auto byte_width = this->value_type()->byte_width(); + const auto byte_width = value_type->byte_width(); const auto start_position = array->offset() * byte_width; const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), std::multiplies<>()); // Create a slice of the buffer - const std::shared_ptr buffer = + const auto buffer = SliceBuffer(array->data()->buffers[1], start_position, size * byte_width); - return Tensor::Make(this->value_type(), std::move(buffer), std::move(shape), std::move(strides), this->dim_names()); + return Tensor::Make(value_type, std::move(buffer), std::move(shape), std::move(strides), + this->dim_names()); } Result> VariableShapeTensorType::Make( diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index 9c34c8a0a9c..203424ee873 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -25,14 +25,6 @@ namespace extension { class ARROW_EXPORT VariableShapeTensorArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; - - /// \brief Get a Tensor of VariableShapeTensorArray at i - /// - /// This method will return a Tensor from VariableShapeTensorArray with strides - /// derived from shape and permutation of VariableShapeTensorType. Shape and - /// dim_names will be permuted according to permutation stored in the - /// VariableShapeTensorType metadata. - const Result> GetTensor(const int64_t i) const; }; /// \brief Concrete type class for variable-shape Tensor data. @@ -82,7 +74,11 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { /// Create a VariableShapeTensorArray from ArrayData std::shared_ptr MakeArray(std::shared_ptr data) const override; - /// Convert an ExtensionScalar to a Tensor + /// \brief Convert an ExtensionScalar to a Tensor + /// + /// This method will return a Tensor from ExtensionScalar with strides derived + /// from shape and permutation stored. Shape and dim_names will be permuted + /// according to permutation stored in the VariableShapeTensorType. Result> GetTensor( const std::shared_ptr&) const; From 845a041db6f758960ecf744c57266670c8122c54 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 14 Dec 2023 02:30:57 +0100 Subject: [PATCH 38/62] work --- .../arrow/extension/variable_shape_tensor.cc | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 0584a4fb5a9..66aa8ae82e5 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -186,13 +186,14 @@ std::shared_ptr VariableShapeTensorType::MakeArray( Result> VariableShapeTensorType::GetTensor( const std::shared_ptr& scalar) const { - const auto& tensor_scalar = internal::checked_cast(*scalar->value); + const auto tensor_scalar = internal::checked_pointer_cast(scalar->value); - ARROW_ASSIGN_OR_RAISE(const auto shape_scalar, tensor_scalar.field(0)); - ARROW_ASSIGN_OR_RAISE(const auto data, tensor_scalar.field(1)); - const auto& shape_array = internal::checked_cast( - *internal::checked_cast(*shape_scalar).value); - const auto& array = internal::checked_cast(*data).value; + ARROW_ASSIGN_OR_RAISE(const auto shape_scalar, tensor_scalar->field(0)); + ARROW_ASSIGN_OR_RAISE(const auto data_scalar, tensor_scalar->field(1)); + const auto shape_array = internal::checked_pointer_cast( + internal::checked_pointer_cast(shape_scalar)->value); + const auto data_array = + internal::checked_pointer_cast(data_scalar)->value; const auto value_type = internal::checked_pointer_cast(this->value_type()); @@ -200,7 +201,7 @@ Result> VariableShapeTensorType::GetTensor( if (!is_fixed_width(*value_type)) { return Status::Invalid("Cannot convert non-fixed-width values to Tensor."); } - if (array->null_count() > 0) { + if (data_array->null_count() > 0) { return Status::Invalid("Cannot convert data with nulls values to Tensor."); } @@ -213,12 +214,12 @@ Result> VariableShapeTensorType::GetTensor( std::vector shape; for (int64_t j = 0; j < static_cast(this->ndim()); ++j) { - ARROW_ASSIGN_OR_RAISE(const auto size, shape_array.GetScalar(j)); - const auto size_value = internal::checked_pointer_cast(size)->value; + ARROW_ASSIGN_OR_RAISE(const auto size, shape_array->GetScalar(j)); + auto size_value = internal::checked_pointer_cast(size)->value; if (size_value < 0) { return Status::Invalid("shape must have non-negative values"); } - shape.push_back(size_value); + shape.push_back(std::move(size_value)); } internal::Permute(permutation, &shape); @@ -228,17 +229,16 @@ Result> VariableShapeTensorType::GetTensor( } std::vector strides; - // ARROW_CHECK_OK(ComputeStrides(*value_type.get(), shape, permutation, &strides)); ARROW_CHECK_OK(internal::ComputeStrides(value_type, shape, permutation, &strides)); const auto byte_width = value_type->byte_width(); - const auto start_position = array->offset() * byte_width; + const auto start_position = data_array->offset() * byte_width; const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), std::multiplies<>()); - // Create a slice of the buffer - const auto buffer = - SliceBuffer(array->data()->buffers[1], start_position, size * byte_width); + ARROW_ASSIGN_OR_RAISE( + const auto buffer, + SliceBufferSafe(data_array->data()->buffers[1], start_position, size * byte_width)); return Tensor::Make(value_type, std::move(buffer), std::move(shape), std::move(strides), this->dim_names()); From 668108569568f834b3a77adaac5520f75e078c17 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 14 Dec 2023 16:11:23 +0100 Subject: [PATCH 39/62] Add repr --- .../extension/tensor_extension_array_test.cc | 81 +++++++++++++++---- .../arrow/extension/variable_shape_tensor.cc | 28 +++++++ .../arrow/extension/variable_shape_tensor.h | 1 + 3 files changed, 96 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index a0d688b578c..2624bbd286f 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -44,7 +44,7 @@ using VariableShapeTensorType = extension::VariableShapeTensorType; using extension::variable_shape_tensor; using extension::VariableShapeTensorArray; -class TestExtensionType : public ::testing::Test { +class TestFixedShapeTensorType : public ::testing::Test { public: void SetUp() override { shape_ = {3, 3, 4}; @@ -79,13 +79,27 @@ class TestExtensionType : public ::testing::Test { std::string serialized_; }; -TEST_F(TestExtensionType, CheckDummyRegistration) { +auto RoundtripBatch = [](const std::shared_ptr& batch, + std::shared_ptr* out) { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + ASSERT_OK(batch_reader->ReadNext(out)); +}; + +TEST_F(TestFixedShapeTensorType, CheckDummyRegistration) { // We need a registered dummy type at runtime to allow for IPC deserialization auto registered_type = GetExtensionType("arrow.fixed_shape_tensor"); ASSERT_TRUE(registered_type->type_id == Type::EXTENSION); } -TEST_F(TestExtensionType, CreateExtensionType) { +TEST_F(TestFixedShapeTensorType, CreateExtensionType) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); // Test ExtensionType methods @@ -115,7 +129,7 @@ TEST_F(TestExtensionType, CreateExtensionType) { FixedShapeTensorType::Make(value_type_, cell_shape_, {}, {"x"})); } -TEST_F(TestExtensionType, EqualsCases) { +TEST_F(TestFixedShapeTensorType, EqualsCases) { auto ext_type_permutation_1 = fixed_shape_tensor(int64(), {3, 4}, {0, 1}, {"x", "y"}); auto ext_type_permutation_2 = fixed_shape_tensor(int64(), {3, 4}, {1, 0}, {"x", "y"}); auto ext_type_no_permutation = fixed_shape_tensor(int64(), {3, 4}, {}, {"x", "y"}); @@ -137,7 +151,7 @@ TEST_F(TestExtensionType, EqualsCases) { ASSERT_FALSE(ext_type_permutation_2->Equals(ext_type_permutation_1)); } -TEST_F(TestExtensionType, CreateFromArray) { +TEST_F(TestFixedShapeTensorType, CreateFromArray) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; @@ -189,7 +203,7 @@ void CheckDeserializationRaises(const std::shared_ptr& extension_type, ext_type->Deserialize(storage_type, serialized)); } -TEST_F(TestExtensionType, MetadataSerializationRoundtrip) { +TEST_F(TestFixedShapeTensorType, MetadataSerializationRoundtrip) { using T = FixedShapeTensorType; CheckSerializationRoundtrip(ext_type_); CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {}, {}, {})); @@ -215,7 +229,7 @@ TEST_F(TestExtensionType, MetadataSerializationRoundtrip) { "Invalid dim_names"); } -TEST_F(TestExtensionType, RoundtripBatch) { +TEST_F(TestFixedShapeTensorType, RoundtripBatch) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; @@ -243,7 +257,7 @@ TEST_F(TestExtensionType, RoundtripBatch) { CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true); } -TEST_F(TestExtensionType, CreateFromTensor) { +TEST_F(TestFixedShapeTensorType, CreateFromTensor) { std::vector column_major_strides = {8, 24, 72}; std::vector neither_major_strides = {96, 8, 32}; @@ -317,7 +331,7 @@ void CheckFromTensorType(const std::shared_ptr& tensor, ASSERT_TRUE(generated_ext_type->Equals(ext_type)); } -TEST_F(TestExtensionType, TestFromTensorType) { +TEST_F(TestFixedShapeTensorType, TestFromTensorType) { auto values = Buffer::Wrap(values_); auto shapes = std::vector>{{3, 3, 4}, {3, 3, 4}, {3, 4, 3}, {3, 4, 3}}; @@ -356,7 +370,7 @@ void CheckTensorRoundtrip(const std::shared_ptr& tensor) { ASSERT_TRUE(tensor->Equals(*tensor_from_array)); } -TEST_F(TestExtensionType, RoundtripTensor) { +TEST_F(TestFixedShapeTensorType, RoundtripTensor) { auto values = Buffer::Wrap(values_); auto shapes = std::vector>{ @@ -377,7 +391,7 @@ TEST_F(TestExtensionType, RoundtripTensor) { } } -TEST_F(TestExtensionType, SliceTensor) { +TEST_F(TestFixedShapeTensorType, SliceTensor) { ASSERT_OK_AND_ASSIGN(auto tensor, Tensor::Make(value_type_, Buffer::Wrap(values_), shape_)); ASSERT_OK_AND_ASSIGN( @@ -404,7 +418,7 @@ TEST_F(TestExtensionType, SliceTensor) { ASSERT_EQ(sliced->length(), partial->length()); } -TEST_F(TestExtensionType, RoundtripBatchFromTensor) { +TEST_F(TestFixedShapeTensorType, RoundtripBatchFromTensor) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); ASSERT_OK_AND_ASSIGN(auto tensor, Tensor::Make(value_type_, Buffer::Wrap(values_), shape_, {}, {"n", "x", "y"})); @@ -421,7 +435,7 @@ TEST_F(TestExtensionType, RoundtripBatchFromTensor) { CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); } -TEST_F(TestExtensionType, ComputeStrides) { +TEST_F(TestFixedShapeTensorType, ComputeStrides) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); auto ext_type_1 = internal::checked_pointer_cast( @@ -455,7 +469,7 @@ TEST_F(TestExtensionType, ComputeStrides) { ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); } -TEST_F(TestExtensionType, ToString) { +TEST_F(TestFixedShapeTensorType, FixedShapeTensoToString) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); auto ext_type_1 = internal::checked_pointer_cast( @@ -723,4 +737,43 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { ASSERT_TRUE(tensor->Equals(*t)); } +TEST_F(TestVariableShapeTensorType, ToString) { + auto exact_ext_type = + internal::checked_pointer_cast(ext_type_); + + auto uniform_shape = std::vector>{ + std::nullopt, std::optional(1), std::nullopt}; + auto ext_type_1 = internal::checked_pointer_cast( + variable_shape_tensor(int16(), 3)); + auto ext_type_2 = internal::checked_pointer_cast( + variable_shape_tensor(int32(), 3, {1, 0, 2})); + auto ext_type_3 = internal::checked_pointer_cast( + variable_shape_tensor(int64(), 3, {}, {"C", "H", "W"})); + auto ext_type_4 = internal::checked_pointer_cast( + variable_shape_tensor(int64(), 3, {}, {}, uniform_shape)); + + std::string result_1 = ext_type_1->ToString(); + std::string expected_1 = + "extension"; + ASSERT_EQ(expected_1, result_1); + + std::string result_2 = ext_type_2->ToString(); + std::string expected_2 = + "extension"; + ASSERT_EQ(expected_2, result_2); + + std::string result_3 = ext_type_3->ToString(); + std::string expected_3 = + "extension"; + ASSERT_EQ(expected_3, result_3); + + std::string result_4 = ext_type_4->ToString(); + std::string expected_4 = + "extension"; + ASSERT_EQ(expected_4, result_4); +} + } // namespace arrow diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 66aa8ae82e5..4cab83117ce 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -27,7 +27,9 @@ #include "arrow/tensor.h" #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" +#include "arrow/util/print.h" #include "arrow/util/sort.h" +#include "arrow/util/string.h" #include #include @@ -64,6 +66,32 @@ bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const (uniform_shape_ == other_ext.uniform_shape()) && permutation_equivalent; } +std::string VariableShapeTensorType::ToString() const { + std::stringstream ss; + ss << "extension<" << this->extension_name() + << "[value_type=" << value_type_->ToString() << ", ndim=" << ndim_; + + if (!permutation_.empty()) { + ss << ", permutation=" << ::arrow::internal::PrintVector{permutation_, ","}; + } + if (!dim_names_.empty()) { + ss << ", dim_names=[" << internal::JoinStrings(dim_names_, ",") << "]"; + } + if (!uniform_shape_.empty()) { + std::vector uniform_shape; + for (const auto& v : uniform_shape_) { + if (v.has_value()) { + uniform_shape.emplace_back(std::to_string(v.value())); + } else { + uniform_shape.emplace_back("null"); + } + } + ss << ", uniform_shape=[" << internal::JoinStrings(uniform_shape, ",") << "]"; + } + ss << "]>"; + return ss.str(); +} + std::string VariableShapeTensorType::Serialize() const { rj::Document document; document.SetObject(); diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index 203424ee873..f5403e91443 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -45,6 +45,7 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { uniform_shape_(std::move(uniform_shape)) {} std::string extension_name() const override { return "arrow.variable_shape_tensor"; } + std::string ToString() const override; /// Number of dimensions of tensor elements int32_t ndim() const { return ndim_; } From e1be1b5888d3dea577ea81bb0800b9e3e3253f7b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 14 Dec 2023 19:34:47 +0100 Subject: [PATCH 40/62] Review feedback --- cpp/src/arrow/extension/fixed_shape_tensor.h | 10 ---------- cpp/src/arrow/extension/tensor_internal.h | 1 + 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 3eaeab236d5..80a602021c6 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -20,16 +20,6 @@ #include "arrow/extension_type.h" namespace arrow { -namespace internal { - -ARROW_EXPORT -Status ComputeStrides(const std::shared_ptr& value_type, - const std::vector& shape, - const std::vector& permutation, - std::vector* strides); - -} // namespace internal - namespace extension { class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { diff --git a/cpp/src/arrow/extension/tensor_internal.h b/cpp/src/arrow/extension/tensor_internal.h index a9583991db8..bba62e7d63b 100644 --- a/cpp/src/arrow/extension/tensor_internal.h +++ b/cpp/src/arrow/extension/tensor_internal.h @@ -44,6 +44,7 @@ inline Status IsPermutationValid(const std::vector& permutation) { return Status::OK(); } +ARROW_EXPORT inline Status ComputeStrides(const std::shared_ptr& value_type, const std::vector& shape, const std::vector& permutation, From e601dc82b6e778f94efd43807d50af5f15130093 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 23 Dec 2023 01:45:26 +0100 Subject: [PATCH 41/62] GetTensor->MakeTensor, static --- .../extension/tensor_extension_array_test.cc | 10 +++++----- .../arrow/extension/variable_shape_tensor.cc | 19 +++++++++++-------- .../arrow/extension/variable_shape_tensor.h | 4 ++-- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 2624bbd286f..85c068e013c 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -666,7 +666,7 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { ASSERT_OK_AND_ASSIGN(auto scalar, ext_array->GetScalar(0)); auto ext_scalar = internal::checked_pointer_cast(scalar); - ASSERT_OK_AND_ASSIGN(t, exact_ext_type->GetTensor(ext_scalar)); + ASSERT_OK_AND_ASSIGN(t, exact_ext_type->MakeTensor(ext_scalar)); ASSERT_EQ(t->shape(), (std::vector{2, 3, 1})); ASSERT_EQ(t->strides(), (std::vector{24, 8, 8})); @@ -681,13 +681,13 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { ASSERT_OK_AND_ASSIGN(scalar, ext_array->GetScalar(1)); ext_scalar = internal::checked_pointer_cast(scalar); - ASSERT_OK_AND_ASSIGN(t, exact_ext_type->GetTensor(ext_scalar)); + ASSERT_OK_AND_ASSIGN(t, exact_ext_type->MakeTensor(ext_scalar)); ASSERT_EQ(t->shape(), (std::vector{2, 1, 2})); ASSERT_EQ(t->strides(), (std::vector{16, 16, 8})); ASSERT_OK_AND_ASSIGN(scalar, ext_array->GetScalar(2)); ext_scalar = internal::checked_pointer_cast(scalar); - ASSERT_OK_AND_ASSIGN(t, exact_ext_type->GetTensor(ext_scalar)); + ASSERT_OK_AND_ASSIGN(t, exact_ext_type->MakeTensor(ext_scalar)); ASSERT_EQ(t->shape(), (std::vector{3, 1, 3})); ASSERT_EQ(t->strides(), (std::vector{24, 24, 8})); @@ -708,7 +708,7 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { ASSERT_OK_AND_ASSIGN(auto sc, ext_arr->GetScalar(2)); auto s = internal::checked_pointer_cast(sc); - ASSERT_OK_AND_ASSIGN(t, exact_ext_type->GetTensor(s)); + ASSERT_OK_AND_ASSIGN(t, exact_ext_type->MakeTensor(s)); ASSERT_EQ(tensor->strides(), t->strides()); ASSERT_EQ(tensor->shape(), t->shape()); ASSERT_EQ(tensor->dim_names(), t->dim_names()); @@ -727,7 +727,7 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { ASSERT_OK_AND_ASSIGN(sc, ext_arr->GetScalar(3)); ASSERT_OK_AND_ASSIGN( - t, exact_ext_type->GetTensor(internal::checked_pointer_cast(sc))); + t, exact_ext_type->MakeTensor(internal::checked_pointer_cast(sc))); ASSERT_EQ(tensor->strides(), t->strides()); ASSERT_EQ(tensor->shape(), t->shape()); ASSERT_EQ(tensor->dim_names(), t->dim_names()); diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 4cab83117ce..ec0537868d8 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -212,9 +212,12 @@ std::shared_ptr VariableShapeTensorType::MakeArray( return std::make_shared(data); } -Result> VariableShapeTensorType::GetTensor( - const std::shared_ptr& scalar) const { +Result> VariableShapeTensorType::MakeTensor( + const std::shared_ptr& scalar) { + const auto ext_scalar = internal::checked_pointer_cast(scalar); const auto tensor_scalar = internal::checked_pointer_cast(scalar->value); + const auto ext_type = + internal::checked_pointer_cast(scalar->type); ARROW_ASSIGN_OR_RAISE(const auto shape_scalar, tensor_scalar->field(0)); ARROW_ASSIGN_OR_RAISE(const auto data_scalar, tensor_scalar->field(1)); @@ -224,7 +227,7 @@ Result> VariableShapeTensorType::GetTensor( internal::checked_pointer_cast(data_scalar)->value; const auto value_type = - internal::checked_pointer_cast(this->value_type()); + internal::checked_pointer_cast(ext_type->value_type()); if (!is_fixed_width(*value_type)) { return Status::Invalid("Cannot convert non-fixed-width values to Tensor."); @@ -233,15 +236,15 @@ Result> VariableShapeTensorType::GetTensor( return Status::Invalid("Cannot convert data with nulls values to Tensor."); } - auto permutation = this->permutation(); + auto permutation = ext_type->permutation(); if (permutation.empty()) { - for (int64_t j = 0; j < static_cast(this->ndim()); ++j) { + for (int64_t j = 0; j < static_cast(ext_type->ndim()); ++j) { permutation.emplace_back(j); } } std::vector shape; - for (int64_t j = 0; j < static_cast(this->ndim()); ++j) { + for (int64_t j = 0; j < static_cast(ext_type->ndim()); ++j) { ARROW_ASSIGN_OR_RAISE(const auto size, shape_array->GetScalar(j)); auto size_value = internal::checked_pointer_cast(size)->value; if (size_value < 0) { @@ -251,7 +254,7 @@ Result> VariableShapeTensorType::GetTensor( } internal::Permute(permutation, &shape); - std::vector dim_names = this->dim_names(); + std::vector dim_names = ext_type->dim_names(); if (!dim_names.empty()) { internal::Permute(permutation, &dim_names); } @@ -269,7 +272,7 @@ Result> VariableShapeTensorType::GetTensor( SliceBufferSafe(data_array->data()->buffers[1], start_position, size * byte_width)); return Tensor::Make(value_type, std::move(buffer), std::move(shape), std::move(strides), - this->dim_names()); + ext_type->dim_names()); } Result> VariableShapeTensorType::Make( diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index f5403e91443..0a22db56ff3 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -80,8 +80,8 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { /// This method will return a Tensor from ExtensionScalar with strides derived /// from shape and permutation stored. Shape and dim_names will be permuted /// according to permutation stored in the VariableShapeTensorType. - Result> GetTensor( - const std::shared_ptr&) const; + static Result> MakeTensor( + const std::shared_ptr&); /// \brief Create a VariableShapeTensorType instance static Result> Make( From 88601cc6c5701e378b2d26a865ff4fcf7eacfc54 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 23 Dec 2023 02:17:24 +0100 Subject: [PATCH 42/62] Better permutations check --- .../extension/tensor_extension_array_test.cc | 10 ++++ .../arrow/extension/variable_shape_tensor.cc | 47 ++++++++++++------- .../arrow/extension/variable_shape_tensor.h | 6 +-- 3 files changed, 42 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 85c068e013c..b11ce97f210 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -577,6 +577,16 @@ TEST_F(TestVariableShapeTensorType, CreateExtensionType) { EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr("Invalid: dim_names size must match ndim."), VariableShapeTensorType::Make(value_type_, ndim_, {}, {"x"})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr("Invalid: Permutation indices for 3 dimensional tensors must be " + "unique and within [0, 2] range. Got: [0,0,2]"), + VariableShapeTensorType::Make(value_type_, 3, {2, 0, 0}, {"C", "H", "W"})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr("Invalid: Permutation indices for 3 dimensional tensors must be " + "unique and within [0, 2] range. Got: [1,2,3]"), + VariableShapeTensorType::Make(value_type_, 3, {1, 2, 3}, {"C", "H", "W"})); } TEST_F(TestVariableShapeTensorType, EqualsCases) { diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index ec0537868d8..37ccc3cd110 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -277,24 +277,8 @@ Result> VariableShapeTensorType::MakeTensor( Result> VariableShapeTensorType::Make( const std::shared_ptr& value_type, const int32_t ndim, - const std::vector permutation, const std::vector dim_names, - const std::vector> uniform_shape) { - if (!permutation.empty()) { - if (permutation.size() != static_cast(ndim)) { - return Status::Invalid("permutation size must match ndim. Expected: ", ndim, - " Got: ", permutation.size()); - } - const std::set permutation_set(permutation.begin(), permutation.end()); - if (permutation_set.size() != permutation.size()) { - return Status::Invalid("permutation must be a valid permutation vector"); - } - for (auto p : permutation) { - if (p < 0 || ndim <= p) { - return Status::Invalid("permutation must be a valid permutation vector"); - } - } - } - + const std::vector& permutation, const std::vector& dim_names, + const std::vector>& uniform_shape) { if (!dim_names.empty() && dim_names.size() != static_cast(ndim)) { return Status::Invalid("dim_names size must match ndim. Expected: ", ndim, " Got: ", dim_names.size()); @@ -310,6 +294,33 @@ Result> VariableShapeTensorType::Make( } } } + if (!permutation.empty()) { + if (permutation.size() != static_cast(ndim)) { + return Status::Invalid("permutation size must match ndim. Expected: ", ndim, + " Got: ", permutation.size()); + } + + std::vector sorted_permutation = permutation; + std::sort(sorted_permutation.begin(), sorted_permutation.end()); + const auto max_index = std::max(static_cast(ndim - 1), 0); + + if (sorted_permutation[0] != 0) { + return Status::Invalid( + "Permutation indices for ", ndim, + " dimensional tensors must be unique and within [0, ", max_index, + "] range. Got: ", ::arrow::internal::PrintVector{sorted_permutation, ","}); + } + + for (size_t i = 1; i < sorted_permutation.size(); ++i) { + if (sorted_permutation[i - 1] + 1 != sorted_permutation[i]) { + return Status::Invalid( + "Permutation indices for ", ndim, + " dimensional tensors must be unique and within [0, ", max_index, + "] range. Got: ", ::arrow::internal::PrintVector{sorted_permutation, ","}); + } + } + } + return std::make_shared( value_type, std::move(ndim), std::move(permutation), std::move(dim_names), std::move(uniform_shape)); diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index 0a22db56ff3..d26b207d8dd 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -86,9 +86,9 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { /// \brief Create a VariableShapeTensorType instance static Result> Make( const std::shared_ptr& value_type, const int32_t ndim, - const std::vector permutation = {}, - const std::vector dim_names = {}, - const std::vector> uniform_shape = {}); + const std::vector& permutation = {}, + const std::vector& dim_names = {}, + const std::vector>& uniform_shape = {}); private: std::shared_ptr storage_type_; From f1a7f528d74f49bd6fe34b92796a1be908b3e3a9 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 8 Feb 2024 15:03:22 +0100 Subject: [PATCH 43/62] post rebase changes --- .../extension/tensor_extension_array_test.cc | 228 ++++++++++++++++-- cpp/src/arrow/extension/tensor_internal.h | 7 +- 2 files changed, 209 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index b11ce97f210..4300db73611 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -48,34 +48,34 @@ class TestFixedShapeTensorType : public ::testing::Test { public: void SetUp() override { shape_ = {3, 3, 4}; - cell_shape_ = {3, 4}; + element_shape_ = {3, 4}; value_type_ = int64(); - cell_type_ = fixed_size_list(value_type_, 12); + element_type_ = fixed_size_list(value_type_, 12); dim_names_ = {"x", "y"}; ext_type_ = internal::checked_pointer_cast( - fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_)); + fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_)); values_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; values_partial_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; shape_partial_ = {2, 3, 4}; tensor_strides_ = {96, 32, 8}; - cell_strides_ = {32, 8}; + element_strides_ = {32, 8}; serialized_ = R"({"shape":[3,4],"dim_names":["x","y"]})"; } protected: std::vector shape_; std::vector shape_partial_; - std::vector cell_shape_; + std::vector element_shape_; std::shared_ptr value_type_; - std::shared_ptr cell_type_; + std::shared_ptr element_type_; std::vector dim_names_; std::shared_ptr ext_type_; std::vector values_; std::vector values_partial_; std::vector tensor_strides_; - std::vector cell_strides_; + std::vector element_strides_; std::string serialized_; }; @@ -105,8 +105,8 @@ TEST_F(TestFixedShapeTensorType, CreateExtensionType) { // Test ExtensionType methods ASSERT_EQ(ext_type_->extension_name(), "arrow.fixed_shape_tensor"); ASSERT_TRUE(ext_type_->Equals(*exact_ext_type)); - ASSERT_FALSE(ext_type_->Equals(*cell_type_)); - ASSERT_TRUE(ext_type_->storage_type()->Equals(*cell_type_)); + ASSERT_FALSE(ext_type_->Equals(*element_type_)); + ASSERT_TRUE(ext_type_->storage_type()->Equals(*element_type_)); ASSERT_EQ(ext_type_->Serialize(), serialized_); ASSERT_OK_AND_ASSIGN(auto ds, ext_type_->Deserialize(ext_type_->storage_type(), serialized_)); @@ -115,18 +115,28 @@ TEST_F(TestFixedShapeTensorType, CreateExtensionType) { // Test FixedShapeTensorType methods ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); - ASSERT_EQ(exact_ext_type->ndim(), cell_shape_.size()); - ASSERT_EQ(exact_ext_type->shape(), cell_shape_); + ASSERT_EQ(exact_ext_type->ndim(), element_shape_.size()); + ASSERT_EQ(exact_ext_type->shape(), element_shape_); ASSERT_EQ(exact_ext_type->value_type(), value_type_); - ASSERT_EQ(exact_ext_type->strides(), cell_strides_); + ASSERT_EQ(exact_ext_type->strides(), element_strides_); ASSERT_EQ(exact_ext_type->dim_names(), dim_names_); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr("Invalid: permutation size must match shape size."), - FixedShapeTensorType::Make(value_type_, cell_shape_, {0})); + FixedShapeTensorType::Make(value_type_, element_shape_, {0})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr("Invalid: dim_names size must match shape size."), - FixedShapeTensorType::Make(value_type_, cell_shape_, {}, {"x"})); + FixedShapeTensorType::Make(value_type_, element_shape_, {}, {"x"})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr("Invalid: Permutation indices for 2 dimensional tensors must be " + "unique and within [0, 1] range. Got: [3,0]"), + FixedShapeTensorType::Make(value_type_, {5, 6}, {3, 0})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr("Invalid: Permutation indices for 3 dimensional tensors must be " + "unique and within [0, 2] range. Got: [0,1,1]"), + FixedShapeTensorType::Make(value_type_, {1, 2, 3}, {0, 1, 1})); } TEST_F(TestFixedShapeTensorType, EqualsCases) { @@ -157,7 +167,7 @@ TEST_F(TestFixedShapeTensorType, CreateFromArray) { std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; auto arr_data = std::make_shared(value_type_, values_.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); - ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type_)); auto ext_arr = ExtensionType::WrapArray(ext_type_, fsla_arr); ASSERT_EQ(ext_arr->length(), shape_[0]); ASSERT_EQ(ext_arr->null_count(), 0); @@ -235,7 +245,7 @@ TEST_F(TestFixedShapeTensorType, RoundtripBatch) { std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; auto arr_data = std::make_shared(value_type_, values_.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); - ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type_)); auto ext_arr = ExtensionType::WrapArray(ext_type_, fsla_arr); // Pass extension array, expect getting back extension array @@ -250,7 +260,7 @@ TEST_F(TestFixedShapeTensorType, RoundtripBatch) { auto ext_metadata = key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, {"ARROW:extension:metadata", serialized_}}); - ext_field = field(/*name=*/"f0", /*type=*/cell_type_, /*nullable=*/true, + ext_field = field(/*name=*/"f0", /*type=*/element_type_, /*nullable=*/true, /*metadata=*/ext_metadata); auto batch2 = RecordBatch::Make(schema({ext_field}), fsla_arr->length(), {fsla_arr}); ASSERT_OK(RoundtripBatch(batch2, &read_batch2)); @@ -305,7 +315,7 @@ TEST_F(TestFixedShapeTensorType, CreateFromTensor) { auto ext_arr_5 = std::static_pointer_cast( ExtensionType::WrapArray(ext_type_5, fsla_arr)); EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("binary is not valid data type for a tensor"), + TypeError, testing::HasSubstr("binary is not valid data type for a tensor"), ext_arr_5->ToTensor()); auto ext_type_6 = internal::checked_pointer_cast( @@ -313,6 +323,10 @@ TEST_F(TestFixedShapeTensorType, CreateFromTensor) { auto arr_with_null = ArrayFromJSON(int64(), "[1, 0, null, null, 1, 2]"); ASSERT_OK_AND_ASSIGN(auto fsla_arr_6, FixedSizeListArray::FromArrays( arr_with_null, fixed_size_list(int64(), 2))); + + auto ext_type_7 = internal::checked_pointer_cast( + fixed_shape_tensor(int64(), {3, 4}, {})); + ASSERT_OK_AND_ASSIGN(auto ext_arr_7, FixedShapeTensorArray::FromTensor(tensor)); } void CheckFromTensorType(const std::shared_ptr& tensor, @@ -343,7 +357,7 @@ TEST_F(TestFixedShapeTensorType, TestFromTensorType) { auto dim_names = std::vector>{ {"y", "z"}, {"z", "y"}, {"y", "z"}, {"z", "y"}, {"y", "z"}, {"y", "z"}, {"y", "z"}, {"y", "z"}}; - auto cell_shapes = std::vector>{{3, 4}, {4, 3}, {4, 3}, {3, 4}}; + auto element_shapes = std::vector>{{3, 4}, {4, 3}, {4, 3}, {3, 4}}; auto permutations = std::vector>{{0, 1}, {1, 0}, {0, 1}, {1, 0}}; for (size_t i = 0; i < shapes.size(); i++) { @@ -351,11 +365,82 @@ TEST_F(TestFixedShapeTensorType, TestFromTensorType) { strides[i], tensor_dim_names[i])); ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); auto ext_type = - fixed_shape_tensor(value_type_, cell_shapes[i], permutations[i], dim_names[i]); + fixed_shape_tensor(value_type_, element_shapes[i], permutations[i], dim_names[i]); CheckFromTensorType(tensor, ext_type); } } +template +void CheckToTensor(const std::vector& values, const std::shared_ptr typ, + const int32_t& element_size, const std::vector& element_shape, + const std::vector& element_permutation, + const std::vector& element_dim_names, + const std::vector& tensor_shape, + const std::vector& tensor_dim_names, + const std::vector& tensor_strides) { + auto buffer = Buffer::Wrap(values); + const std::shared_ptr element_type = fixed_size_list(typ, element_size); + std::vector> buffers = {nullptr, buffer}; + auto arr_data = std::make_shared(typ, values.size(), buffers); + auto arr = std::make_shared(arr_data); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type)); + + ASSERT_OK_AND_ASSIGN( + auto expected_tensor, + Tensor::Make(typ, buffer, tensor_shape, tensor_strides, tensor_dim_names)); + const auto ext_type = + fixed_shape_tensor(typ, element_shape, element_permutation, element_dim_names); + + auto ext_arr = ExtensionType::WrapArray(ext_type, fsla_arr); + const auto tensor_array = std::static_pointer_cast(ext_arr); + ASSERT_OK_AND_ASSIGN(const auto actual_tensor, tensor_array->ToTensor()); + ASSERT_OK(actual_tensor->Validate()); + + ASSERT_EQ(actual_tensor->type(), expected_tensor->type()); + ASSERT_EQ(actual_tensor->shape(), expected_tensor->shape()); + ASSERT_EQ(actual_tensor->strides(), expected_tensor->strides()); + ASSERT_EQ(actual_tensor->dim_names(), expected_tensor->dim_names()); + ASSERT_TRUE(actual_tensor->data()->Equals(*expected_tensor->data())); + ASSERT_TRUE(actual_tensor->Equals(*expected_tensor)); +} + +TEST_F(TestFixedShapeTensorType, ToTensor) { + std::vector float_values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; + + auto element_sizes = std::vector{6, 6, 18, 18, 18, 18}; + + auto element_shapes = std::vector>{{2, 3}, {3, 2}, {3, 6}, + {6, 3}, {3, 2, 3}, {3, 2, 3}}; + auto tensor_shapes = std::vector>{ + {6, 2, 3}, {6, 2, 3}, {2, 3, 6}, {2, 3, 6}, {2, 3, 2, 3}, {2, 3, 2, 3}}; + + auto element_permutations = std::vector>{ + {0, 1}, {1, 0}, {0, 1}, {1, 0}, {0, 1, 2}, {2, 1, 0}}; + auto tensor_strides_32 = + std::vector>{{24, 12, 4}, {24, 4, 8}, {72, 24, 4}, + {72, 4, 12}, {72, 24, 12, 4}, {72, 4, 12, 24}}; + auto tensor_strides_64 = + std::vector>{{48, 24, 8}, {48, 8, 16}, {144, 48, 8}, + {144, 8, 24}, {144, 48, 24, 8}, {144, 8, 24, 48}}; + + auto element_dim_names = std::vector>{ + {"y", "z"}, {"z", "y"}, {"y", "z"}, {"z", "y"}, {"H", "W", "C"}, {"H", "W", "C"}}; + auto tensor_dim_names = std::vector>{ + {"", "y", "z"}, {"", "y", "z"}, {"", "y", "z"}, + {"", "y", "z"}, {"", "H", "W", "C"}, {"", "C", "W", "H"}}; + + for (size_t i = 0; i < element_shapes.size(); i++) { + CheckToTensor(float_values, float32(), element_sizes[i], element_shapes[i], + element_permutations[i], element_dim_names[i], tensor_shapes[i], + tensor_dim_names[i], tensor_strides_32[i]); + CheckToTensor(values_, int64(), element_sizes[i], element_shapes[i], + element_permutations[i], element_dim_names[i], tensor_shapes[i], + tensor_dim_names[i], tensor_strides_64[i]); + } +} + void CheckTensorRoundtrip(const std::shared_ptr& tensor) { ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); ASSERT_OK_AND_ASSIGN(auto tensor_from_array, ext_arr->ToTensor()); @@ -399,7 +484,7 @@ TEST_F(TestFixedShapeTensorType, SliceTensor) { Tensor::Make(value_type_, Buffer::Wrap(values_partial_), shape_partial_)); ASSERT_EQ(tensor->strides(), tensor_strides_); ASSERT_EQ(tensor_partial->strides(), tensor_strides_); - auto ext_type = fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_); + auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_); auto exact_ext_type = internal::checked_pointer_cast(ext_type_); ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); @@ -439,11 +524,11 @@ TEST_F(TestFixedShapeTensorType, ComputeStrides) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); auto ext_type_1 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_)); + fixed_shape_tensor(int64(), element_shape_, {}, dim_names_)); auto ext_type_2 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_)); + fixed_shape_tensor(int64(), element_shape_, {}, dim_names_)); auto ext_type_3 = internal::checked_pointer_cast( - fixed_shape_tensor(int32(), cell_shape_, {}, dim_names_)); + fixed_shape_tensor(int32(), element_shape_, {}, dim_names_)); ASSERT_TRUE(ext_type_1->Equals(*ext_type_2)); ASSERT_FALSE(ext_type_1->Equals(*ext_type_3)); @@ -497,6 +582,98 @@ TEST_F(TestFixedShapeTensorType, FixedShapeTensoToString) { ASSERT_EQ(expected_3, result_3); } +TEST_F(TestFixedShapeTensorType, GetTensor) { + auto arr = ArrayFromJSON(element_type_, + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]," + "[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]"); + auto element_values = + std::vector>{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}}; + + auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_); + auto permuted_ext_type = fixed_shape_tensor(value_type_, {3, 4}, {1, 0}, {"x", "y"}); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); + auto exact_permuted_ext_type = + internal::checked_pointer_cast(permuted_ext_type); + + auto array = std::static_pointer_cast( + ExtensionType::WrapArray(ext_type, arr)); + auto permuted_array = std::static_pointer_cast( + ExtensionType::WrapArray(permuted_ext_type, arr)); + + for (size_t i = 0; i < element_values.size(); i++) { + // Get tensor from extension array with trivial permutation + ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(i)); + auto actual_ext_scalar = internal::checked_pointer_cast(scalar); + ASSERT_OK_AND_ASSIGN(auto actual_tensor, + exact_ext_type->MakeTensor(actual_ext_scalar)); + ASSERT_OK(actual_tensor->Validate()); + ASSERT_OK_AND_ASSIGN(auto expected_tensor, + Tensor::Make(value_type_, Buffer::Wrap(element_values[i]), + {3, 4}, {}, {"x", "y"})); + ASSERT_EQ(expected_tensor->shape(), actual_tensor->shape()); + ASSERT_EQ(expected_tensor->dim_names(), actual_tensor->dim_names()); + ASSERT_EQ(expected_tensor->strides(), actual_tensor->strides()); + ASSERT_EQ(actual_tensor->strides(), std::vector({32, 8})); + ASSERT_EQ(expected_tensor->type(), actual_tensor->type()); + ASSERT_TRUE(expected_tensor->Equals(*actual_tensor)); + + // Get tensor from extension array with non-trivial permutation + ASSERT_OK_AND_ASSIGN(auto expected_permuted_tensor, + Tensor::Make(value_type_, Buffer::Wrap(element_values[i]), + {4, 3}, {8, 24}, {"y", "x"})); + ASSERT_OK_AND_ASSIGN(scalar, permuted_array->GetScalar(i)); + ASSERT_OK_AND_ASSIGN(auto actual_permuted_tensor, + exact_permuted_ext_type->MakeTensor( + internal::checked_pointer_cast(scalar))); + ASSERT_OK(actual_permuted_tensor->Validate()); + ASSERT_EQ(expected_permuted_tensor->strides(), actual_permuted_tensor->strides()); + ASSERT_EQ(expected_permuted_tensor->shape(), actual_permuted_tensor->shape()); + ASSERT_EQ(expected_permuted_tensor->dim_names(), actual_permuted_tensor->dim_names()); + ASSERT_EQ(expected_permuted_tensor->type(), actual_permuted_tensor->type()); + ASSERT_EQ(expected_permuted_tensor->is_contiguous(), + actual_permuted_tensor->is_contiguous()); + ASSERT_EQ(expected_permuted_tensor->is_column_major(), + actual_permuted_tensor->is_column_major()); + ASSERT_TRUE(expected_permuted_tensor->Equals(*actual_permuted_tensor)); + } + + // Test null values fail + auto element_type = fixed_size_list(int64(), 1); + auto fsla_arr = ArrayFromJSON(element_type, "[[1], [null], null]"); + ext_type = fixed_shape_tensor(int64(), {1}); + exact_ext_type = internal::checked_pointer_cast(ext_type); + auto ext_arr = ExtensionType::WrapArray(ext_type, fsla_arr); + auto tensor_array = internal::checked_pointer_cast(ext_arr); + + ASSERT_OK_AND_ASSIGN(auto scalar, tensor_array->GetScalar(0)); + ASSERT_OK_AND_ASSIGN(auto tensor, + exact_ext_type->MakeTensor( + internal::checked_pointer_cast(scalar))); + + ASSERT_OK_AND_ASSIGN(scalar, tensor_array->GetScalar(1)); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Invalid: Cannot convert data with nulls to Tensor."), + exact_ext_type->MakeTensor( + internal::checked_pointer_cast(scalar))); + + ASSERT_OK_AND_ASSIGN(scalar, tensor_array->GetScalar(2)); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Invalid: Cannot convert data with nulls to Tensor."), + exact_ext_type->MakeTensor( + internal::checked_pointer_cast(scalar))); + + element_type = list(utf8()); + ext_type = fixed_shape_tensor(utf8(), {1}); + exact_ext_type = internal::checked_pointer_cast(ext_type); + scalar = std::make_shared(ArrayFromJSON(element_type, R"([["a", "b"]])")); + auto ext_scalar = std::make_shared(scalar, ext_type); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + testing::HasSubstr("Type error: Cannot convert non-fixed-width values to Tensor."), + exact_ext_type->MakeTensor(ext_scalar)); +} + class TestVariableShapeTensorType : public ::testing::Test { public: void SetUp() override { @@ -509,6 +686,8 @@ class TestVariableShapeTensorType : public ::testing::Test { uniform_shape_ = {std::nullopt, std::optional(1), std::nullopt}; ext_type_ = internal::checked_pointer_cast(variable_shape_tensor( value_type_, ndim_, permutation_, dim_names_, uniform_shape_)); + values_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; shapes_ = ArrayFromJSON(fixed_size_list(int32(), ndim_), "[[2,1,3],[2,1,2],[3,1,3]]"); data_ = ArrayFromJSON(list(value_type_), "[[0,1,2,3,4,5],[6,7,8,9],[10,11,12,13,14,15,16,17,18]]"); @@ -530,6 +709,7 @@ class TestVariableShapeTensorType : public ::testing::Test { std::vector> uniform_shape_; std::vector dim_names_; std::shared_ptr ext_type_; + std::vector values_; std::shared_ptr shapes_; std::shared_ptr data_; std::string serialized_; diff --git a/cpp/src/arrow/extension/tensor_internal.h b/cpp/src/arrow/extension/tensor_internal.h index bba62e7d63b..effbba635ba 100644 --- a/cpp/src/arrow/extension/tensor_internal.h +++ b/cpp/src/arrow/extension/tensor_internal.h @@ -18,16 +18,19 @@ #pragma once #include "arrow/extension/tensor_internal.h" +#include "arrow/array/array_nested.h" +#include +#include #include "arrow/tensor.h" +#include "arrow/status.h" #include "arrow/util/checked_cast.h" #include "arrow/util/int_util_overflow.h" #include "arrow/util/sort_internal.h" - -#include "arrow/status.h" #include "arrow/util/print_internal.h" namespace arrow::internal { +ARROW_EXPORT inline Status IsPermutationValid(const std::vector& permutation) { const auto size = static_cast(permutation.size()); std::vector dim_seen(size, 0); From a88d1d9274b857cd51f4b0fe777a31025414df40 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 10 Feb 2024 00:54:43 +0100 Subject: [PATCH 44/62] work --- cpp/src/arrow/extension/variable_shape_tensor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index d26b207d8dd..750f26af475 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -45,7 +45,7 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { uniform_shape_(std::move(uniform_shape)) {} std::string extension_name() const override { return "arrow.variable_shape_tensor"; } - std::string ToString() const override; + std::string ToString() const; /// Number of dimensions of tensor elements int32_t ndim() const { return ndim_; } From 674d8708fdf4f380998fad69299bc98bf7fd4914 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 4 Mar 2024 15:19:14 +0100 Subject: [PATCH 45/62] ToString new parameter --- cpp/src/arrow/extension/variable_shape_tensor.cc | 2 +- cpp/src/arrow/extension/variable_shape_tensor.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 37ccc3cd110..b9ee1d0a898 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -66,7 +66,7 @@ bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const (uniform_shape_ == other_ext.uniform_shape()) && permutation_equivalent; } -std::string VariableShapeTensorType::ToString() const { +std::string VariableShapeTensorType::ToString(bool show_metadata) const { std::stringstream ss; ss << "extension<" << this->extension_name() << "[value_type=" << value_type_->ToString() << ", ndim=" << ndim_; diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index 750f26af475..8c0d3a850e0 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -45,7 +45,7 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { uniform_shape_(std::move(uniform_shape)) {} std::string extension_name() const override { return "arrow.variable_shape_tensor"; } - std::string ToString() const; + std::string ToString(bool show_metadata = false) const override; /// Number of dimensions of tensor elements int32_t ndim() const { return ndim_; } From d69a30703a69d12e5ae4feaba1ce33216937f2cd Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 4 Mar 2024 17:36:18 +0100 Subject: [PATCH 46/62] Remove Python bindings --- cpp/src/arrow/acero/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/acero/CMakeLists.txt b/cpp/src/arrow/acero/CMakeLists.txt index db5ec595e32..9c0052d87bd 100644 --- a/cpp/src/arrow/acero/CMakeLists.txt +++ b/cpp/src/arrow/acero/CMakeLists.txt @@ -122,7 +122,7 @@ if(ARROW_TESTING) # test_nodes.cc isn't used by all tests but link to it for simple # CMakeLists.txt. add_library(arrow_acero_testing OBJECT test_nodes.cc test_util_internal.cc - ../extension/tensor_extension_array_test.cc) + ../extension/tensor_extension_array_test.cc) # Even though this is still just an object library we still need to "link" our # dependencies so that include paths are configured correctly target_link_libraries(arrow_acero_testing PRIVATE ${ARROW_ACERO_TEST_LINK_LIBS}) From 84a93f01cd9c9bf60c2b9120ef7ae4a35274ca2b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 16 Mar 2024 22:00:49 +0100 Subject: [PATCH 47/62] Review feedback --- cpp/src/arrow/acero/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/acero/CMakeLists.txt b/cpp/src/arrow/acero/CMakeLists.txt index 9c0052d87bd..dc18afa9797 100644 --- a/cpp/src/arrow/acero/CMakeLists.txt +++ b/cpp/src/arrow/acero/CMakeLists.txt @@ -121,8 +121,7 @@ endforeach() if(ARROW_TESTING) # test_nodes.cc isn't used by all tests but link to it for simple # CMakeLists.txt. - add_library(arrow_acero_testing OBJECT test_nodes.cc test_util_internal.cc - ../extension/tensor_extension_array_test.cc) + add_library(arrow_acero_testing OBJECT test_nodes.cc test_util_internal.cc) # Even though this is still just an object library we still need to "link" our # dependencies so that include paths are configured correctly target_link_libraries(arrow_acero_testing PRIVATE ${ARROW_ACERO_TEST_LINK_LIBS}) From 3500b0902c4e82e3afae0224e8d607270791a3d5 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 16 Mar 2024 23:43:01 +0100 Subject: [PATCH 48/62] Use TensorFromJSON --- .../extension/tensor_extension_array_test.cc | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 4300db73611..fe691d2a413 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -860,13 +860,11 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { ASSERT_EQ(t->shape(), (std::vector{2, 3, 1})); ASSERT_EQ(t->strides(), (std::vector{24, 8, 8})); - std::vector shape = {2, 3, 1}; std::vector strides = {sizeof(int64_t) * 3, sizeof(int64_t) * 1, sizeof(int64_t) * 1}; - std::vector values = {1, 1, 2, 3, 4, 5}; - auto data_buffer = Buffer::Wrap(values); - ASSERT_OK_AND_ASSIGN(tensor, - Tensor::Make(int64(), data_buffer, shape, strides, dim_names_)); + tensor = TensorFromJSON(int64(), R"([1,1,2,3,4,5])", {2, 3, 1}, strides, + dim_names_); + ASSERT_TRUE(tensor->Equals(*t)); ASSERT_OK_AND_ASSIGN(scalar, ext_array->GetScalar(1)); @@ -881,12 +879,9 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { ASSERT_EQ(t->shape(), (std::vector{3, 1, 3})); ASSERT_EQ(t->strides(), (std::vector{24, 24, 8})); - shape = {3, 1, 3}; strides = {sizeof(int64_t) * 3, sizeof(int64_t) * 3, sizeof(int64_t) * 1}; - values = {10, 11, 12, 13, 14, 15, 16, 17, 18}; - data_buffer = Buffer::Wrap(values); - ASSERT_OK_AND_ASSIGN(tensor, - Tensor::Make(int64(), data_buffer, shape, strides, dim_names_)); + tensor = TensorFromJSON(int64(), R"([10,11,12,13,14,15,16,17,18])", {3, 1, 3}, strides, + dim_names_); ASSERT_EQ(tensor->strides(), t->strides()); ASSERT_EQ(tensor->shape(), t->shape()); @@ -908,12 +903,9 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { ASSERT_TRUE(tensor->Equals(*t)); // Null value in VariableShapeTensorArray produces a tensor with shape {0, 0, 0} - shape = {0, 0, 0}; strides = {sizeof(int64_t), sizeof(int64_t), sizeof(int64_t)}; - values = {}; - data_buffer = Buffer::Wrap(values); - ASSERT_OK_AND_ASSIGN(tensor, - Tensor::Make(int64(), data_buffer, shape, strides, dim_names_)); + tensor = TensorFromJSON(int64(), R"([10,11,12,13,14,15,16,17,18])", {0, 0, 0}, strides, + dim_names_); ASSERT_OK_AND_ASSIGN(sc, ext_arr->GetScalar(3)); ASSERT_OK_AND_ASSIGN( From 3ca3f2e8e2059c8b745166e8c552e8ba34411c6b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sun, 17 Mar 2024 15:41:18 +0100 Subject: [PATCH 49/62] lint --- cpp/src/arrow/extension/tensor_extension_array_test.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index fe691d2a413..e3dbb13421d 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -862,8 +862,7 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { std::vector strides = {sizeof(int64_t) * 3, sizeof(int64_t) * 1, sizeof(int64_t) * 1}; - tensor = TensorFromJSON(int64(), R"([1,1,2,3,4,5])", {2, 3, 1}, strides, - dim_names_); + tensor = TensorFromJSON(int64(), R"([1,1,2,3,4,5])", {2, 3, 1}, strides, dim_names_); ASSERT_TRUE(tensor->Equals(*t)); From 24056e0348382ed341cb403df2434c06c4fd1a0c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 27 Mar 2024 17:23:04 +0100 Subject: [PATCH 50/62] Apply suggestions from code review Co-authored-by: Antoine Pitrou --- .../extension/tensor_extension_array_test.cc | 4 +-- cpp/src/arrow/extension/tensor_internal.h | 6 ++-- .../arrow/extension/variable_shape_tensor.cc | 28 ++++++++++--------- .../arrow/extension/variable_shape_tensor.h | 12 ++++++-- 4 files changed, 30 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index e3dbb13421d..374951f349a 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -197,7 +197,7 @@ TEST_F(TestExtensionType, MakeArrayCanGetCorrectScalarType) { template void CheckSerializationRoundtrip(const std::shared_ptr& ext_type) { - auto type = internal::checked_pointer_cast(ext_type); + auto type = internal::checked_pointer_cast(ext_type); auto serialized = type->Serialize(); ASSERT_OK_AND_ASSIGN(auto deserialized, type->Deserialize(type->storage_type(), serialized)); @@ -720,7 +720,7 @@ class TestVariableShapeTensorType : public ::testing::Test { TEST_F(TestVariableShapeTensorType, CheckDummyRegistration) { // We need a registered dummy type at runtime to allow for IPC deserialization auto registered_type = GetExtensionType("arrow.variable_shape_tensor"); - ASSERT_TRUE(registered_type->type_id == Type::EXTENSION); + ASSERT_EQ(registered_type->type_id, Type::EXTENSION); } TEST_F(TestVariableShapeTensorType, CreateExtensionType) { diff --git a/cpp/src/arrow/extension/tensor_internal.h b/cpp/src/arrow/extension/tensor_internal.h index effbba635ba..651372d01cf 100644 --- a/cpp/src/arrow/extension/tensor_internal.h +++ b/cpp/src/arrow/extension/tensor_internal.h @@ -18,15 +18,17 @@ #pragma once #include "arrow/extension/tensor_internal.h" -#include "arrow/array/array_nested.h" #include #include + +#include "arrow/array/array_nested.h" #include "arrow/tensor.h" #include "arrow/status.h" #include "arrow/util/checked_cast.h" #include "arrow/util/int_util_overflow.h" #include "arrow/util/sort_internal.h" #include "arrow/util/print_internal.h" +#include "arrow/de" namespace arrow::internal { @@ -83,7 +85,7 @@ inline Status ComputeStrides(const std::shared_ptr& value_type, strides->push_back(remaining); } } - internal::Permute(permutation, strides); + Permute(permutation, strides); return Status::OK(); } diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index b9ee1d0a898..6ef1bec85f9 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -43,7 +43,7 @@ bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const if (extension_name() != other.extension_name()) { return false; } - const auto& other_ext = static_cast(other); + const auto& other_ext = checked_cast(other); if (this->ndim() != other_ext.ndim()) { return false; } @@ -150,16 +150,15 @@ Result> VariableShapeTensorType::Deserialize( return Status::Invalid("Expected List storage type, got ", storage_type->field(1)->type()->ToString()); } - if (std::static_pointer_cast(storage_type->field(0)->type()) + if (checked_cast(*storage_type->field(0)->type()) ->value_type() != int32()) { return Status::Invalid("Expected FixedSizeList value type int32, got ", storage_type->field(0)->type()->ToString()); } const auto value_type = storage_type->field(1)->type()->field(0)->type(); - const size_t ndim = - std::static_pointer_cast(storage_type->field(0)->type()) - ->list_size(); + const int32_t ndim = + checked_cast(*storage_type->field(0)->type()).list_size(); rj::Document document; if (document.Parse(serialized_data.data(), serialized_data.length()).HasParseError()) { @@ -168,6 +167,7 @@ Result> VariableShapeTensorType::Deserialize( std::vector permutation; if (document.HasMember("permutation")) { + permutation.reserve(ndim); for (auto& x : document["permutation"].GetArray()) { permutation.emplace_back(x.GetInt64()); } @@ -177,7 +177,8 @@ Result> VariableShapeTensorType::Deserialize( } std::vector dim_names; if (document.HasMember("dim_names")) { - for (auto& x : document["dim_names"].GetArray()) { + dim_names.reserve(ndim); + for (const auto& x : document["dim_names"].GetArray()) { dim_names.emplace_back(x.GetString()); } if (dim_names.size() != ndim) { @@ -187,7 +188,8 @@ Result> VariableShapeTensorType::Deserialize( std::vector> uniform_shape; if (document.HasMember("uniform_shape")) { - for (auto& x : document["uniform_shape"].GetArray()) { + uniform_shape.reserve(ndim); + for (const auto& x : document["uniform_shape"].GetArray()) { if (x.IsNull()) { uniform_shape.emplace_back(std::nullopt); } else { @@ -200,8 +202,8 @@ Result> VariableShapeTensorType::Deserialize( } } - return variable_shape_tensor(value_type, static_cast(ndim), permutation, - dim_names, uniform_shape); + return variable_shape_tensor(value_type, static_cast(ndim), std::move(permutation), + std::move(dim_names), std::move(uniform_shape)); } std::shared_ptr VariableShapeTensorType::MakeArray( @@ -233,17 +235,17 @@ Result> VariableShapeTensorType::MakeTensor( return Status::Invalid("Cannot convert non-fixed-width values to Tensor."); } if (data_array->null_count() > 0) { - return Status::Invalid("Cannot convert data with nulls values to Tensor."); + return Status::Invalid("Cannot convert data with nulls to Tensor."); } auto permutation = ext_type->permutation(); if (permutation.empty()) { - for (int64_t j = 0; j < static_cast(ext_type->ndim()); ++j) { - permutation.emplace_back(j); - } + permutation.resize(ndim); + std::iota(permutation.begin(), permutation.end(), 0); } std::vector shape; + shape.reserve(ndim); for (int64_t j = 0; j < static_cast(ext_type->ndim()); ++j) { ARROW_ASSIGN_OR_RAISE(const auto size, shape_array->GetScalar(j)); auto size_value = internal::checked_pointer_cast(size)->value; diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index 8c0d3a850e0..85b3ed9289e 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -17,6 +17,12 @@ #pragma once +#include +#include +#include +#include +#include + #include "arrow/extension_type.h" namespace arrow { @@ -86,9 +92,9 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { /// \brief Create a VariableShapeTensorType instance static Result> Make( const std::shared_ptr& value_type, const int32_t ndim, - const std::vector& permutation = {}, - const std::vector& dim_names = {}, - const std::vector>& uniform_shape = {}); + const std::vector permutation = {}, + const std::vector dim_names = {}, + const std::vector> uniform_shape = {}); private: std::shared_ptr storage_type_; From 616a1fb1601edb7b54c90fe2b2383842c0795852 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 27 Mar 2024 17:23:50 +0100 Subject: [PATCH 51/62] Update cpp/src/arrow/extension/variable_shape_tensor.cc Co-authored-by: Antoine Pitrou --- cpp/src/arrow/extension/variable_shape_tensor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 6ef1bec85f9..6667aba07aa 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -168,7 +168,7 @@ Result> VariableShapeTensorType::Deserialize( std::vector permutation; if (document.HasMember("permutation")) { permutation.reserve(ndim); - for (auto& x : document["permutation"].GetArray()) { + for (const auto& x : document["permutation"].GetArray()) { permutation.emplace_back(x.GetInt64()); } if (permutation.size() != ndim) { From 1fad2934babbee449c26049f08a294323771a6fb Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 27 Mar 2024 17:43:47 +0100 Subject: [PATCH 52/62] fix --- cpp/src/arrow/extension/tensor_internal.h | 1 - .../arrow/extension/variable_shape_tensor.cc | 20 ++++++++++--------- .../arrow/extension/variable_shape_tensor.h | 6 +++--- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_internal.h b/cpp/src/arrow/extension/tensor_internal.h index 651372d01cf..cd583ec790a 100644 --- a/cpp/src/arrow/extension/tensor_internal.h +++ b/cpp/src/arrow/extension/tensor_internal.h @@ -28,7 +28,6 @@ #include "arrow/util/int_util_overflow.h" #include "arrow/util/sort_internal.h" #include "arrow/util/print_internal.h" -#include "arrow/de" namespace arrow::internal { diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 6667aba07aa..d5e58cce402 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -43,7 +43,7 @@ bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const if (extension_name() != other.extension_name()) { return false; } - const auto& other_ext = checked_cast(other); + const auto& other_ext = internal::checked_cast(other); if (this->ndim() != other_ext.ndim()) { return false; } @@ -150,15 +150,16 @@ Result> VariableShapeTensorType::Deserialize( return Status::Invalid("Expected List storage type, got ", storage_type->field(1)->type()->ToString()); } - if (checked_cast(*storage_type->field(0)->type()) - ->value_type() != int32()) { + if (internal::checked_cast(*storage_type->field(0)->type()) + .value_type() != int32()) { return Status::Invalid("Expected FixedSizeList value type int32, got ", storage_type->field(0)->type()->ToString()); } const auto value_type = storage_type->field(1)->type()->field(0)->type(); - const int32_t ndim = - checked_cast(*storage_type->field(0)->type()).list_size(); + const uint32_t ndim = + internal::checked_cast(*storage_type->field(0)->type()) + .list_size(); rj::Document document; if (document.Parse(serialized_data.data(), serialized_data.length()).HasParseError()) { @@ -202,8 +203,9 @@ Result> VariableShapeTensorType::Deserialize( } } - return variable_shape_tensor(value_type, static_cast(ndim), std::move(permutation), - std::move(dim_names), std::move(uniform_shape)); + return variable_shape_tensor(value_type, static_cast(ndim), + std::move(permutation), std::move(dim_names), + std::move(uniform_shape)); } std::shared_ptr VariableShapeTensorType::MakeArray( @@ -240,12 +242,12 @@ Result> VariableShapeTensorType::MakeTensor( auto permutation = ext_type->permutation(); if (permutation.empty()) { - permutation.resize(ndim); + // permutation.resize(ndim); std::iota(permutation.begin(), permutation.end(), 0); } std::vector shape; - shape.reserve(ndim); + // shape.reserve(ndim); for (int64_t j = 0; j < static_cast(ext_type->ndim()); ++j) { ARROW_ASSIGN_OR_RAISE(const auto size, shape_array->GetScalar(j)); auto size_value = internal::checked_pointer_cast(size)->value; diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index 85b3ed9289e..44edeac9cab 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -92,9 +92,9 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { /// \brief Create a VariableShapeTensorType instance static Result> Make( const std::shared_ptr& value_type, const int32_t ndim, - const std::vector permutation = {}, - const std::vector dim_names = {}, - const std::vector> uniform_shape = {}); + const std::vector& permutation = {}, + const std::vector& dim_names = {}, + const std::vector>& uniform_shape = {}); private: std::shared_ptr storage_type_; From 2195f2ecc39a94609889b055e066669a3cc4adef Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 27 Mar 2024 19:26:55 +0100 Subject: [PATCH 53/62] Review feedback --- cpp/src/arrow/CMakeLists.txt | 2 +- .../extension/tensor_extension_array_test.cc | 47 +++++----- cpp/src/arrow/extension/tensor_internal.cc | 91 +++++++++++++++++++ .../arrow/extension/variable_shape_tensor.cc | 80 +++++----------- .../arrow/extension/variable_shape_tensor.h | 4 +- 5 files changed, 141 insertions(+), 83 deletions(-) create mode 100644 cpp/src/arrow/extension/tensor_internal.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index d40022b4a5b..7841d37666a 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -983,7 +983,7 @@ endif() if(ARROW_JSON) arrow_add_object_library(ARROW_JSON extension/fixed_shape_tensor.cc - extension/opaque.cc + extension/tensor_internal.cc extension/variable_shape_tensor.cc json/options.cc json/chunked_builder.cc diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 374951f349a..b395da34a8a 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -173,6 +173,7 @@ TEST_F(TestFixedShapeTensorType, CreateFromArray) { ASSERT_EQ(ext_arr->null_count(), 0); } +<<<<<<< HEAD TEST_F(TestExtensionType, MakeArrayCanGetCorrectScalarType) { ASSERT_OK_AND_ASSIGN(auto tensor, Tensor::Make(value_type_, Buffer::Wrap(values_), shape_)); @@ -214,14 +215,13 @@ void CheckDeserializationRaises(const std::shared_ptr& extension_type, } TEST_F(TestFixedShapeTensorType, MetadataSerializationRoundtrip) { - using T = FixedShapeTensorType; - CheckSerializationRoundtrip(ext_type_); - CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {}, {}, {})); - CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {0}, {}, {})); - CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {1}, {0}, {"x"})); - CheckSerializationRoundtrip( + CheckSerializationRoundtrip(ext_type_); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {}, {}, {})); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {0}, {}, {})); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {1}, {0}, {"x"})); + CheckSerializationRoundtrip( fixed_shape_tensor(value_type_, {256, 256, 3}, {0, 1, 2}, {"H", "W", "C"})); - CheckSerializationRoundtrip( + CheckSerializationRoundtrip( fixed_shape_tensor(value_type_, {256, 256, 3}, {2, 0, 1}, {"C", "H", "W"})); auto storage_type = fixed_size_list(int64(), 12); @@ -554,7 +554,7 @@ TEST_F(TestFixedShapeTensorType, ComputeStrides) { ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); } -TEST_F(TestFixedShapeTensorType, FixedShapeTensoToString) { +TEST_F(TestFixedShapeTensorType, FixedShapeTensorToString) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); auto ext_type_1 = internal::checked_pointer_cast( @@ -695,7 +695,7 @@ class TestVariableShapeTensorType : public ::testing::Test { R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"uniform_shape":[null,1,null]})"; storage_arr_ = ArrayFromJSON( ext_type_->storage_type(), - R"([[[2,3,1],[0,1,2,3,4,5]],[[1,2,2],[6,7,8,9]],[[3,1,3],[10,11,12,13,14,15,16,17,18]]])"); + R"([[[0,1,2,3,4,5],[2,3,1]],[[6,7,8,9],[1,2,2]],[[10,11,12,13,14,15,16,17,18],[3,1,3]]])"); ext_arr_ = internal::checked_pointer_cast( ExtensionType::WrapArray(ext_type_, storage_arr_)); } @@ -730,10 +730,9 @@ TEST_F(TestVariableShapeTensorType, CreateExtensionType) { // Test ExtensionType methods ASSERT_EQ(ext_type_->extension_name(), "arrow.variable_shape_tensor"); ASSERT_TRUE(ext_type_->Equals(*exact_ext_type)); - auto expected_type = struct_({ - ::arrow::field("shape", fixed_size_list(int32(), ndim_)), - ::arrow::field("data", list(value_type_)), - }); + auto expected_type = + struct_({::arrow::field("data", list(value_type_)), + ::arrow::field("shape", fixed_size_list(int32(), ndim_))}); ASSERT_TRUE(ext_type_->storage_type()->Equals(*expected_type)); ASSERT_EQ(ext_type_->Serialize(), serialized_); @@ -760,7 +759,7 @@ TEST_F(TestVariableShapeTensorType, CreateExtensionType) { EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr("Invalid: Permutation indices for 3 dimensional tensors must be " - "unique and within [0, 2] range. Got: [0,0,2]"), + "unique and within [0, 2] range. Got: [2,0,0]"), VariableShapeTensorType::Make(value_type_, 3, {2, 0, 0}, {"C", "H", "W"})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, @@ -792,18 +791,16 @@ TEST_F(TestVariableShapeTensorType, EqualsCases) { } TEST_F(TestVariableShapeTensorType, MetadataSerializationRoundtrip) { - using T = VariableShapeTensorType; - - CheckSerializationRoundtrip(ext_type_); - CheckSerializationRoundtrip( + CheckSerializationRoundtrip(ext_type_); + CheckSerializationRoundtrip( variable_shape_tensor(value_type_, 3, {1, 2, 0}, {"x", "y", "z"})); - CheckSerializationRoundtrip(variable_shape_tensor(value_type_, 0, {}, {})); - CheckSerializationRoundtrip(variable_shape_tensor(value_type_, 1, {0}, {"x"})); - CheckSerializationRoundtrip( + CheckSerializationRoundtrip(variable_shape_tensor(value_type_, 0, {}, {})); + CheckSerializationRoundtrip(variable_shape_tensor(value_type_, 1, {0}, {"x"})); + CheckSerializationRoundtrip( variable_shape_tensor(value_type_, 3, {0, 1, 2}, {"H", "W", "C"})); - CheckSerializationRoundtrip( + CheckSerializationRoundtrip( variable_shape_tensor(value_type_, 3, {2, 0, 1}, {"C", "H", "W"})); - CheckSerializationRoundtrip( + CheckSerializationRoundtrip( variable_shape_tensor(value_type_, 3, {2, 0, 1}, {"C", "H", "W"}, {0, 1, 2})); auto storage_type = ext_type_->storage_type(); @@ -812,9 +809,9 @@ TEST_F(TestVariableShapeTensorType, MetadataSerializationRoundtrip) { CheckDeserializationRaises(ext_type_, storage_type, R"({"shape":(3,4)})", "Invalid serialized JSON data"); CheckDeserializationRaises(ext_type_, storage_type, R"({"permutation":[1,0]})", - "Invalid permutation"); + "Invalid: permutation"); CheckDeserializationRaises(ext_type_, storage_type, R"({"dim_names":["x","y"]})", - "Invalid dim_names"); + "Invalid: dim_names"); } TEST_F(TestVariableShapeTensorType, RoudtripBatch) { diff --git a/cpp/src/arrow/extension/tensor_internal.cc b/cpp/src/arrow/extension/tensor_internal.cc new file mode 100644 index 00000000000..d021553702e --- /dev/null +++ b/cpp/src/arrow/extension/tensor_internal.cc @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension/tensor_internal.h" + +#include "arrow/tensor.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/int_util_overflow.h" +#include "arrow/util/sort.h" + +#include "arrow/status.h" +#include "arrow/util/logging.h" +#include "arrow/util/print.h" + +namespace arrow::internal { + +Status IsPermutationValid(const std::vector& permutation) { + const auto size = static_cast(permutation.size()); + std::vector dim_seen(size, 0); + + for (const auto p : permutation) { + if (p < 0 || p >= size || dim_seen[p] != 0) { + return Status::Invalid( + "Permutation indices for ", size, + " dimensional tensors must be unique and within [0, ", size - 1, + "] range. Got: ", ::arrow::internal::PrintVector{permutation, ","}); + } + dim_seen[p] = 1; + } + return Status::OK(); +} + +Result> ComputeStrides(const std::shared_ptr& value_type, + const std::vector& shape, + const std::vector& permutation) { + const auto fixed_width_type = + internal::checked_pointer_cast(value_type); + + std::vector strides; + if (permutation.empty()) { + ARROW_DCHECK_OK( + internal::ComputeRowMajorStrides(*fixed_width_type.get(), shape, &strides)); + return strides; + } + const int byte_width = value_type->byte_width(); + + int64_t remaining = 0; + if (!shape.empty() && shape.front() > 0) { + remaining = byte_width; + for (auto i : permutation) { + if (i > 0) { + if (internal::MultiplyWithOverflow(remaining, shape[i], &remaining)) { + return Status::Invalid( + "Strides computed from shape would not fit in 64-bit integer"); + } + } + } + } + + if (remaining == 0) { + strides.assign(shape.size(), byte_width); + return strides; + } + + strides.push_back(remaining); + for (auto i : permutation) { + if (i > 0) { + remaining /= shape[i]; + strides.push_back(remaining); + } + } + DCHECK_EQ(strides.back(), byte_width); + Permute(permutation, &strides); + + return strides; +} +} // namespace arrow::internal diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index d5e58cce402..243d44fd3df 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -137,28 +137,27 @@ Result> VariableShapeTensorType::Deserialize( return Status::Invalid("Expected Struct storage type, got ", storage_type->ToString()); } - if (storage_type->num_fields() != 2) { return Status::Invalid("Expected Struct storage type with 2 fields, got ", storage_type->num_fields()); } - if (storage_type->field(0)->type()->id() != Type::FIXED_SIZE_LIST) { - return Status::Invalid("Expected FixedSizeList storage type, got ", - storage_type->field(0)->type()->ToString()); - } - if (storage_type->field(1)->type()->id() != Type::LIST) { + if (storage_type->field(0)->type()->id() != Type::LIST) { return Status::Invalid("Expected List storage type, got ", storage_type->field(1)->type()->ToString()); } - if (internal::checked_cast(*storage_type->field(0)->type()) + if (storage_type->field(1)->type()->id() != Type::FIXED_SIZE_LIST) { + return Status::Invalid("Expected FixedSizeList storage type, got ", + storage_type->field(0)->type()->ToString()); + } + if (internal::checked_cast(*storage_type->field(1)->type()) .value_type() != int32()) { return Status::Invalid("Expected FixedSizeList value type int32, got ", - storage_type->field(0)->type()->ToString()); + storage_type->field(1)->type()->ToString()); } - const auto value_type = storage_type->field(1)->type()->field(0)->type(); + const auto value_type = storage_type->field(0)->type()->field(0)->type(); const uint32_t ndim = - internal::checked_cast(*storage_type->field(0)->type()) + internal::checked_cast(*storage_type->field(1)->type()) .list_size(); rj::Document document; @@ -172,9 +171,6 @@ Result> VariableShapeTensorType::Deserialize( for (const auto& x : document["permutation"].GetArray()) { permutation.emplace_back(x.GetInt64()); } - if (permutation.size() != ndim) { - return Status::Invalid("Invalid permutation"); - } } std::vector dim_names; if (document.HasMember("dim_names")) { @@ -182,9 +178,6 @@ Result> VariableShapeTensorType::Deserialize( for (const auto& x : document["dim_names"].GetArray()) { dim_names.emplace_back(x.GetString()); } - if (dim_names.size() != ndim) { - return Status::Invalid("Invalid dim_names"); - } } std::vector> uniform_shape; @@ -197,15 +190,10 @@ Result> VariableShapeTensorType::Deserialize( uniform_shape.emplace_back(x.GetInt64()); } } - if (uniform_shape.size() != ndim) { - return Status::Invalid("uniform_shape size must match ndim. Expected: ", ndim, - " Got: ", uniform_shape.size()); - } } - return variable_shape_tensor(value_type, static_cast(ndim), - std::move(permutation), std::move(dim_names), - std::move(uniform_shape)); + return VariableShapeTensorType::Make(value_type, ndim, permutation, dim_names, + uniform_shape); } std::shared_ptr VariableShapeTensorType::MakeArray( @@ -218,13 +206,13 @@ std::shared_ptr VariableShapeTensorType::MakeArray( Result> VariableShapeTensorType::MakeTensor( const std::shared_ptr& scalar) { - const auto ext_scalar = internal::checked_pointer_cast(scalar); const auto tensor_scalar = internal::checked_pointer_cast(scalar->value); const auto ext_type = internal::checked_pointer_cast(scalar->type); - ARROW_ASSIGN_OR_RAISE(const auto shape_scalar, tensor_scalar->field(0)); ARROW_ASSIGN_OR_RAISE(const auto data_scalar, tensor_scalar->field(1)); + ARROW_ASSIGN_OR_RAISE(const auto shape_scalar, tensor_scalar->field(0)); + ARROW_CHECK(tensor_scalar->is_valid); const auto shape_array = internal::checked_pointer_cast( internal::checked_pointer_cast(shape_scalar)->value); const auto data_array = @@ -233,24 +221,21 @@ Result> VariableShapeTensorType::MakeTensor( const auto value_type = internal::checked_pointer_cast(ext_type->value_type()); - if (!is_fixed_width(*value_type)) { - return Status::Invalid("Cannot convert non-fixed-width values to Tensor."); - } if (data_array->null_count() > 0) { return Status::Invalid("Cannot convert data with nulls to Tensor."); } auto permutation = ext_type->permutation(); if (permutation.empty()) { - // permutation.resize(ndim); + permutation.resize(ext_type->ndim()); std::iota(permutation.begin(), permutation.end(), 0); } + ARROW_CHECK_EQ(shape_array->length(), ext_type->ndim()); std::vector shape; - // shape.reserve(ndim); + shape.reserve(ext_type->ndim()); for (int64_t j = 0; j < static_cast(ext_type->ndim()); ++j) { - ARROW_ASSIGN_OR_RAISE(const auto size, shape_array->GetScalar(j)); - auto size_value = internal::checked_pointer_cast(size)->value; + const auto size_value = shape_array->Value(j); if (size_value < 0) { return Status::Invalid("shape must have non-negative values"); } @@ -263,14 +248,14 @@ Result> VariableShapeTensorType::MakeTensor( internal::Permute(permutation, &dim_names); } - std::vector strides; - ARROW_CHECK_OK(internal::ComputeStrides(value_type, shape, permutation, &strides)); + ARROW_ASSIGN_OR_RAISE(std::vector strides, + internal::ComputeStrides(value_type, shape, permutation)); const auto byte_width = value_type->byte_width(); const auto start_position = data_array->offset() * byte_width; const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), std::multiplies<>()); - + ARROW_CHECK_EQ(size * byte_width, data_array->length() * byte_width); ARROW_ASSIGN_OR_RAISE( const auto buffer, SliceBufferSafe(data_array->data()->buffers[1], start_position, size * byte_width)); @@ -283,6 +268,10 @@ Result> VariableShapeTensorType::Make( const std::shared_ptr& value_type, const int32_t ndim, const std::vector& permutation, const std::vector& dim_names, const std::vector>& uniform_shape) { + if (!is_fixed_width(*value_type)) { + return Status::Invalid("Cannot convert non-fixed-width values to Tensor."); + } + if (!dim_names.empty() && dim_names.size() != static_cast(ndim)) { return Status::Invalid("dim_names size must match ndim. Expected: ", ndim, " Got: ", dim_names.size()); @@ -303,26 +292,7 @@ Result> VariableShapeTensorType::Make( return Status::Invalid("permutation size must match ndim. Expected: ", ndim, " Got: ", permutation.size()); } - - std::vector sorted_permutation = permutation; - std::sort(sorted_permutation.begin(), sorted_permutation.end()); - const auto max_index = std::max(static_cast(ndim - 1), 0); - - if (sorted_permutation[0] != 0) { - return Status::Invalid( - "Permutation indices for ", ndim, - " dimensional tensors must be unique and within [0, ", max_index, - "] range. Got: ", ::arrow::internal::PrintVector{sorted_permutation, ","}); - } - - for (size_t i = 1; i < sorted_permutation.size(); ++i) { - if (sorted_permutation[i - 1] + 1 != sorted_permutation[i]) { - return Status::Invalid( - "Permutation indices for ", ndim, - " dimensional tensors must be unique and within [0, ", max_index, - "] range. Got: ", ::arrow::internal::PrintVector{sorted_permutation, ","}); - } - } + RETURN_NOT_OK(internal::IsPermutationValid(permutation)); } return std::make_shared( diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index 44edeac9cab..7b3e14fbc7e 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -42,8 +42,8 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const std::vector permutation = {}, const std::vector dim_names = {}, const std::vector> uniform_shape = {}) - : ExtensionType(struct_({::arrow::field("shape", fixed_size_list(int32(), ndim)), - ::arrow::field("data", list(value_type))})), + : ExtensionType(struct_({::arrow::field("data", list(value_type)), + ::arrow::field("shape", fixed_size_list(int32(), ndim))})), value_type_(value_type), ndim_(std::move(ndim)), permutation_(std::move(permutation)), From d0e16f4ad5866b000553a309f8047d29609dd752 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 28 Mar 2024 14:34:26 +0100 Subject: [PATCH 54/62] mingw64 issue --- cpp/src/arrow/extension/tensor_extension_array_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index b395da34a8a..a00955cfeaa 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -96,7 +96,7 @@ auto RoundtripBatch = [](const std::shared_ptr& batch, TEST_F(TestFixedShapeTensorType, CheckDummyRegistration) { // We need a registered dummy type at runtime to allow for IPC deserialization auto registered_type = GetExtensionType("arrow.fixed_shape_tensor"); - ASSERT_TRUE(registered_type->type_id == Type::EXTENSION); + ASSERT_EQ(registered_type->id(), Type::EXTENSION); } TEST_F(TestFixedShapeTensorType, CreateExtensionType) { @@ -720,7 +720,7 @@ class TestVariableShapeTensorType : public ::testing::Test { TEST_F(TestVariableShapeTensorType, CheckDummyRegistration) { // We need a registered dummy type at runtime to allow for IPC deserialization auto registered_type = GetExtensionType("arrow.variable_shape_tensor"); - ASSERT_EQ(registered_type->type_id, Type::EXTENSION); + ASSERT_EQ(registered_type->id(), Type::EXTENSION); } TEST_F(TestVariableShapeTensorType, CreateExtensionType) { From adec2045d705afa318ef5e1e8081679677ff0074 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 29 Mar 2024 23:56:03 +0100 Subject: [PATCH 55/62] refactor ComputeStrides --- .../extension/tensor_extension_array_test.cc | 8 ++-- cpp/src/arrow/extension/tensor_internal.cc | 39 ++++--------------- 2 files changed, 11 insertions(+), 36 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index a00955cfeaa..7ad3988b6f0 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -541,16 +541,16 @@ TEST_F(TestFixedShapeTensorType, ComputeStrides) { auto ext_type_5 = internal::checked_pointer_cast( fixed_shape_tensor(int64(), {3, 4, 7}, {1, 0, 2})); - ASSERT_EQ(ext_type_5->strides(), (std::vector{56, 224, 8})); + ASSERT_EQ(ext_type_5->strides(), (std::vector{56, 168, 8})); ASSERT_EQ(ext_type_5->Serialize(), R"({"shape":[3,4,7],"permutation":[1,0,2]})"); auto ext_type_6 = internal::checked_pointer_cast( fixed_shape_tensor(int64(), {3, 4, 7}, {1, 2, 0}, {})); - ASSERT_EQ(ext_type_6->strides(), (std::vector{56, 8, 224})); + ASSERT_EQ(ext_type_6->strides(), (std::vector{32, 8, 96})); ASSERT_EQ(ext_type_6->Serialize(), R"({"shape":[3,4,7],"permutation":[1,2,0]})"); auto ext_type_7 = internal::checked_pointer_cast( fixed_shape_tensor(int32(), {3, 4, 7}, {2, 0, 1}, {})); - ASSERT_EQ(ext_type_7->strides(), (std::vector{4, 112, 16})); + ASSERT_EQ(ext_type_7->strides(), (std::vector{4, 84, 12})); ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); } @@ -621,7 +621,7 @@ TEST_F(TestFixedShapeTensorType, GetTensor) { // Get tensor from extension array with non-trivial permutation ASSERT_OK_AND_ASSIGN(auto expected_permuted_tensor, Tensor::Make(value_type_, Buffer::Wrap(element_values[i]), - {4, 3}, {8, 24}, {"y", "x"})); + {4, 3}, {8, 32}, {"y", "x"})); ASSERT_OK_AND_ASSIGN(scalar, permuted_array->GetScalar(i)); ASSERT_OK_AND_ASSIGN(auto actual_permuted_tensor, exact_permuted_ext_type->MakeTensor( diff --git a/cpp/src/arrow/extension/tensor_internal.cc b/cpp/src/arrow/extension/tensor_internal.cc index d021553702e..b3d3f626441 100644 --- a/cpp/src/arrow/extension/tensor_internal.cc +++ b/cpp/src/arrow/extension/tensor_internal.cc @@ -47,43 +47,18 @@ Status IsPermutationValid(const std::vector& permutation) { Result> ComputeStrides(const std::shared_ptr& value_type, const std::vector& shape, const std::vector& permutation) { - const auto fixed_width_type = - internal::checked_pointer_cast(value_type); - + const auto& fw_type = checked_cast(*value_type); std::vector strides; - if (permutation.empty()) { - ARROW_DCHECK_OK( - internal::ComputeRowMajorStrides(*fixed_width_type.get(), shape, &strides)); - return strides; - } - const int byte_width = value_type->byte_width(); - - int64_t remaining = 0; - if (!shape.empty() && shape.front() > 0) { - remaining = byte_width; - for (auto i : permutation) { - if (i > 0) { - if (internal::MultiplyWithOverflow(remaining, shape[i], &remaining)) { - return Status::Invalid( - "Strides computed from shape would not fit in 64-bit integer"); - } - } - } - } - if (remaining == 0) { - strides.assign(shape.size(), byte_width); + if (permutation.empty()) { + ARROW_DCHECK_OK(internal::ComputeRowMajorStrides(fw_type, shape, &strides)); return strides; } - strides.push_back(remaining); - for (auto i : permutation) { - if (i > 0) { - remaining /= shape[i]; - strides.push_back(remaining); - } - } - DCHECK_EQ(strides.back(), byte_width); + auto permuted_shape = std::move(shape); + auto reverse_permutation = internal::ArgSort(permutation, std::less<>()); + Permute(reverse_permutation, &permuted_shape); + ARROW_DCHECK_OK(internal::ComputeRowMajorStrides(fw_type, permuted_shape, &strides)); Permute(permutation, &strides); return strides; From 419b7c2c743830eb3c7e125946e23ede9ac63c39 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 1 Apr 2024 18:37:48 +0200 Subject: [PATCH 56/62] Change to ComputeStrides --- cpp/src/arrow/extension/tensor_extension_array_test.cc | 4 ++-- cpp/src/arrow/extension/tensor_internal.cc | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 7ad3988b6f0..7e532f9ed45 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -546,11 +546,11 @@ TEST_F(TestFixedShapeTensorType, ComputeStrides) { auto ext_type_6 = internal::checked_pointer_cast( fixed_shape_tensor(int64(), {3, 4, 7}, {1, 2, 0}, {})); - ASSERT_EQ(ext_type_6->strides(), (std::vector{32, 8, 96})); + ASSERT_EQ(ext_type_6->strides(), (std::vector{8, 168, 24})); ASSERT_EQ(ext_type_6->Serialize(), R"({"shape":[3,4,7],"permutation":[1,2,0]})"); auto ext_type_7 = internal::checked_pointer_cast( fixed_shape_tensor(int32(), {3, 4, 7}, {2, 0, 1}, {})); - ASSERT_EQ(ext_type_7->strides(), (std::vector{4, 84, 12})); + ASSERT_EQ(ext_type_7->strides(), (std::vector{16, 4, 48})); ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); } diff --git a/cpp/src/arrow/extension/tensor_internal.cc b/cpp/src/arrow/extension/tensor_internal.cc index b3d3f626441..4a495da81ba 100644 --- a/cpp/src/arrow/extension/tensor_internal.cc +++ b/cpp/src/arrow/extension/tensor_internal.cc @@ -56,10 +56,11 @@ Result> ComputeStrides(const std::shared_ptr& val } auto permuted_shape = std::move(shape); - auto reverse_permutation = internal::ArgSort(permutation, std::less<>()); - Permute(reverse_permutation, &permuted_shape); + auto reverse_permutation = internal::ArgSort(permutation, std::greater<>()); + Permute(permutation, &permuted_shape); ARROW_DCHECK_OK(internal::ComputeRowMajorStrides(fw_type, permuted_shape, &strides)); - Permute(permutation, &strides); + Permute(reverse_permutation, &strides); + std::reverse(strides.begin(), strides.end()); return strides; } From c6993818c9a24c7dfe24aac35eaa1443ac345966 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 1 Apr 2024 23:39:16 +0200 Subject: [PATCH 57/62] Change ToTensor --- python/pyarrow/tests/test_extension_type.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index a44a52f3998..4344aef3fbb 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1482,6 +1482,17 @@ def test_tensor_class_methods(np_type_str): assert result.to_tensor().shape == (1, 3, 2, 2) assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, 2 * bw) + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[2, 1, 0]) + result = pa.ExtensionArray.from_storage(tensor_type, storage) + expected = as_strided(flat_arr, shape=(1, 3, 2, 2), + strides=(bw * 12, bw, bw * 3, bw * 6)) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + assert result.type.permutation == [2, 1, 0] + assert result.type.shape == [2, 2, 3] + assert result.to_tensor().shape == (1, 3, 2, 2) + assert result.to_tensor().strides == (12 * bw, 1 * bw, 3 * bw, 6 * bw) + @pytest.mark.numpy @pytest.mark.parametrize("np_type_str", ("int8", "int64", "float32")) From 00c3c3945ce494a96ea42ac6f22c46f672b66657 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 2 Apr 2024 02:01:34 +0200 Subject: [PATCH 58/62] Refactoring ComputeStrides --- .../extension/tensor_extension_array_test.cc | 12 ++++++------ cpp/src/arrow/extension/tensor_internal.cc | 16 +++------------- cpp/src/arrow/extension/variable_shape_tensor.cc | 14 +++++++------- 3 files changed, 16 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 7e532f9ed45..26b7874a4c8 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -541,16 +541,16 @@ TEST_F(TestFixedShapeTensorType, ComputeStrides) { auto ext_type_5 = internal::checked_pointer_cast( fixed_shape_tensor(int64(), {3, 4, 7}, {1, 0, 2})); - ASSERT_EQ(ext_type_5->strides(), (std::vector{56, 168, 8})); + ASSERT_EQ(ext_type_5->strides(), (std::vector{56, 224, 8})); ASSERT_EQ(ext_type_5->Serialize(), R"({"shape":[3,4,7],"permutation":[1,0,2]})"); auto ext_type_6 = internal::checked_pointer_cast( fixed_shape_tensor(int64(), {3, 4, 7}, {1, 2, 0}, {})); - ASSERT_EQ(ext_type_6->strides(), (std::vector{8, 168, 24})); + ASSERT_EQ(ext_type_6->strides(), (std::vector{56, 8, 224})); ASSERT_EQ(ext_type_6->Serialize(), R"({"shape":[3,4,7],"permutation":[1,2,0]})"); auto ext_type_7 = internal::checked_pointer_cast( fixed_shape_tensor(int32(), {3, 4, 7}, {2, 0, 1}, {})); - ASSERT_EQ(ext_type_7->strides(), (std::vector{16, 4, 48})); + ASSERT_EQ(ext_type_7->strides(), (std::vector{4, 112, 28})); ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); } @@ -841,9 +841,9 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) { auto shapes = ArrayFromJSON(shape_type_, "[[2,3,1],[2,1,2],[3,1,3],null]"); auto data = ArrayFromJSON( data_type_, "[[1,1,2,3,4,5],[2,7,8,9],[10,11,12,13,14,15,16,17,18],null]"); - std::vector> fields = {field("shapes", shape_type_), - field("data", data_type_)}; - ASSERT_OK_AND_ASSIGN(auto storage_arr, StructArray::Make({shapes, data}, fields)); + std::vector> fields = {field("data", data_type_), + field("shapes", shape_type_)}; + ASSERT_OK_AND_ASSIGN(auto storage_arr, StructArray::Make({data, shapes}, fields)); auto ext_arr = ExtensionType::WrapArray(ext_type_, storage_arr); auto exact_ext_type = internal::checked_pointer_cast(ext_type_); diff --git a/cpp/src/arrow/extension/tensor_internal.cc b/cpp/src/arrow/extension/tensor_internal.cc index 4a495da81ba..02a3fdbbae7 100644 --- a/cpp/src/arrow/extension/tensor_internal.cc +++ b/cpp/src/arrow/extension/tensor_internal.cc @@ -49,19 +49,9 @@ Result> ComputeStrides(const std::shared_ptr& val const std::vector& permutation) { const auto& fw_type = checked_cast(*value_type); std::vector strides; - - if (permutation.empty()) { - ARROW_DCHECK_OK(internal::ComputeRowMajorStrides(fw_type, shape, &strides)); - return strides; - } - - auto permuted_shape = std::move(shape); - auto reverse_permutation = internal::ArgSort(permutation, std::greater<>()); - Permute(permutation, &permuted_shape); - ARROW_DCHECK_OK(internal::ComputeRowMajorStrides(fw_type, permuted_shape, &strides)); - Permute(reverse_permutation, &strides); - std::reverse(strides.begin(), strides.end()); - + ARROW_DCHECK_OK(internal::ComputeRowMajorStrides(fw_type, shape, &strides)); + // If the permutation is empty, the strides are already in the correct order. + internal::Permute(permutation, &strides); return strides; } } // namespace arrow::internal diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 243d44fd3df..739f2c8edf2 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -143,11 +143,11 @@ Result> VariableShapeTensorType::Deserialize( } if (storage_type->field(0)->type()->id() != Type::LIST) { return Status::Invalid("Expected List storage type, got ", - storage_type->field(1)->type()->ToString()); + storage_type->field(0)->type()->ToString()); } if (storage_type->field(1)->type()->id() != Type::FIXED_SIZE_LIST) { return Status::Invalid("Expected FixedSizeList storage type, got ", - storage_type->field(0)->type()->ToString()); + storage_type->field(1)->type()->ToString()); } if (internal::checked_cast(*storage_type->field(1)->type()) .value_type() != int32()) { @@ -210,13 +210,13 @@ Result> VariableShapeTensorType::MakeTensor( const auto ext_type = internal::checked_pointer_cast(scalar->type); - ARROW_ASSIGN_OR_RAISE(const auto data_scalar, tensor_scalar->field(1)); - ARROW_ASSIGN_OR_RAISE(const auto shape_scalar, tensor_scalar->field(0)); + ARROW_ASSIGN_OR_RAISE(const auto data_scalar, tensor_scalar->field(0)); + ARROW_ASSIGN_OR_RAISE(const auto shape_scalar, tensor_scalar->field(1)); ARROW_CHECK(tensor_scalar->is_valid); - const auto shape_array = internal::checked_pointer_cast( - internal::checked_pointer_cast(shape_scalar)->value); const auto data_array = internal::checked_pointer_cast(data_scalar)->value; + const auto shape_array = internal::checked_pointer_cast( + internal::checked_pointer_cast(shape_scalar)->value); const auto value_type = internal::checked_pointer_cast(ext_type->value_type()); @@ -241,7 +241,6 @@ Result> VariableShapeTensorType::MakeTensor( } shape.push_back(std::move(size_value)); } - internal::Permute(permutation, &shape); std::vector dim_names = ext_type->dim_names(); if (!dim_names.empty()) { @@ -250,6 +249,7 @@ Result> VariableShapeTensorType::MakeTensor( ARROW_ASSIGN_OR_RAISE(std::vector strides, internal::ComputeStrides(value_type, shape, permutation)); + internal::Permute(permutation, &shape); const auto byte_width = value_type->byte_width(); const auto start_position = data_array->offset() * byte_width; From faa527fbc01ed18e5baf5798598a91455cd62a94 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 15 Apr 2024 00:13:12 +0200 Subject: [PATCH 59/62] Move RoundtripBatch to gtest_util.cc --- .../arrow/extension/tensor_extension_array_test.cc | 14 -------------- cpp/src/arrow/testing/gtest_util.cc | 14 ++++++++++++++ cpp/src/arrow/testing/gtest_util.h | 3 +++ 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 26b7874a4c8..9d50bc994f9 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -79,20 +79,6 @@ class TestFixedShapeTensorType : public ::testing::Test { std::string serialized_; }; -auto RoundtripBatch = [](const std::shared_ptr& batch, - std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); - ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), - out_stream.get())); - - ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); - - io::BufferReader reader(complete_ipc_stream); - std::shared_ptr batch_reader; - ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); - ASSERT_OK(batch_reader->ReadNext(out)); -}; - TEST_F(TestFixedShapeTensorType, CheckDummyRegistration) { // We need a registered dummy type at runtime to allow for IPC deserialization auto registered_type = GetExtensionType("arrow.fixed_shape_tensor"); diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 1acc47a99d4..284ac8775a9 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -568,6 +568,20 @@ void ApproxCompareBatch(const RecordBatch& left, const RecordBatch& right, }); } +void RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out) { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + ASSERT_OK(batch_reader->ReadNext(out)); +} + std::shared_ptr TweakValidityBit(const std::shared_ptr& array, int64_t index, bool validity) { auto data = array->data()->Copy(); diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 62bf907a2d8..628aa7d58db 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -318,6 +318,9 @@ ARROW_TESTING_EXPORT void ApproxCompareBatch( const RecordBatch& left, const RecordBatch& right, bool compare_metadata = true, const EqualOptions& options = TestingEqualOptions()); +ARROW_TESTING_EXPORT void RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out); + // Check if the padding of the buffers of the array is zero. // Also cause valgrind warnings if the padding bytes are uninitialized. ARROW_TESTING_EXPORT void AssertZeroPadded(const Array& array); From 9649d2e87e5328c4e261fbd21e877fdfc5c125a1 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 6 Jun 2024 15:36:31 +0200 Subject: [PATCH 60/62] Post rebase changes --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/extension/fixed_shape_tensor.cc | 62 +++---------------- cpp/src/arrow/extension/fixed_shape_tensor.h | 6 +- .../extension/tensor_extension_array_test.cc | 5 +- cpp/src/arrow/extension/tensor_internal.cc | 48 +++++++++++--- cpp/src/arrow/extension/uuid_test.cc | 2 - .../arrow/extension/variable_shape_tensor.cc | 45 +++++++------- cpp/src/arrow/extension_type_test.cc | 2 - 8 files changed, 73 insertions(+), 98 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 7841d37666a..6b1c25dc604 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -983,6 +983,7 @@ endif() if(ARROW_JSON) arrow_add_object_library(ARROW_JSON extension/fixed_shape_tensor.cc + extension/opaque.cc extension/tensor_internal.cc extension/variable_shape_tensor.cc json/options.cc diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 45c211b0cee..e7df91f5892 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -37,53 +37,7 @@ namespace rj = arrow::rapidjson; -namespace arrow { - -namespace extension { - -namespace { - -Status ComputeStrides(const std::shared_ptr& value_type, - const std::vector& shape, - const std::vector& permutation, - std::vector* strides) { - auto fixed_width_type = internal::checked_pointer_cast(value_type); - if (permutation.empty()) { - return internal::ComputeRowMajorStrides(*fixed_width_type.get(), shape, strides); - } - const int byte_width = value_type->byte_width(); - - int64_t remaining = 0; - if (!shape.empty() && shape.front() > 0) { - remaining = byte_width; - for (auto i : permutation) { - if (i > 0) { - if (internal::MultiplyWithOverflow(remaining, shape[i], &remaining)) { - return Status::Invalid( - "Strides computed from shape would not fit in 64-bit integer"); - } - } - } - } - - if (remaining == 0) { - strides->assign(shape.size(), byte_width); - return Status::OK(); - } - - strides->push_back(remaining); - for (auto i : permutation) { - if (i > 0) { - remaining /= shape[i]; - strides->push_back(remaining); - } - } - internal::Permute(permutation, strides); - - return Status::OK(); -} - -} // namespace +namespace arrow::extension { bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const { if (extension_name() != other.extension_name()) { @@ -238,7 +192,8 @@ Result> FixedShapeTensorType::MakeTensor( } std::vector strides; - RETURN_NOT_OK(ComputeStrides(value_type, shape, permutation, &strides)); + RETURN_NOT_OK( + internal::ComputeStrides(ext_type.value_type(), shape, permutation, &strides)); const auto start_position = array->offset() * byte_width; const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), std::multiplies<>()); @@ -377,9 +332,8 @@ const Result> FixedShapeTensorArray::ToTensor() const { internal::Permute(permutation, &shape); std::vector tensor_strides; - const auto* fw_value_type = internal::checked_cast(value_type.get()); ARROW_RETURN_NOT_OK( - ComputeStrides(*fw_value_type, shape, permutation, &tensor_strides)); + internal::ComputeStrides(value_type, shape, permutation, &tensor_strides)); const auto& raw_buffer = this->storage()->data()->child_data[0]->buffers[1]; ARROW_ASSIGN_OR_RAISE( @@ -413,10 +367,9 @@ Result> FixedShapeTensorType::Make( const std::vector& FixedShapeTensorType::strides() { if (strides_.empty()) { - auto value_type = internal::checked_cast(this->value_type_.get()); std::vector tensor_strides; - ARROW_CHECK_OK( - ComputeStrides(*value_type, this->shape(), this->permutation(), &tensor_strides)); + ARROW_CHECK_OK(internal::ComputeStrides(this->value_type_, this->shape(), + this->permutation(), &tensor_strides)); strides_ = tensor_strides; } return strides_; @@ -431,5 +384,4 @@ std::shared_ptr fixed_shape_tensor(const std::shared_ptr& va return maybe_type.MoveValueUnsafe(); } -} // namespace extension -} // namespace arrow +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 80a602021c6..5098da0405f 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -19,8 +19,7 @@ #include "arrow/extension_type.h" -namespace arrow { -namespace extension { +namespace arrow::extension { class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { public: @@ -126,5 +125,4 @@ ARROW_EXPORT std::shared_ptr fixed_shape_tensor( const std::vector& permutation = {}, const std::vector& dim_names = {}); -} // namespace extension -} // namespace arrow +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 9d50bc994f9..440e90e42ef 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -36,7 +36,6 @@ namespace arrow { using FixedShapeTensorType = extension::FixedShapeTensorType; -using arrow::ipc::test::RoundtripBatch; using extension::fixed_shape_tensor; using extension::FixedShapeTensorArray; @@ -536,7 +535,7 @@ TEST_F(TestFixedShapeTensorType, ComputeStrides) { ASSERT_EQ(ext_type_6->Serialize(), R"({"shape":[3,4,7],"permutation":[1,2,0]})"); auto ext_type_7 = internal::checked_pointer_cast( fixed_shape_tensor(int32(), {3, 4, 7}, {2, 0, 1}, {})); - ASSERT_EQ(ext_type_7->strides(), (std::vector{4, 112, 28})); + ASSERT_EQ(ext_type_7->strides(), (std::vector{4, 112, 16})); ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); } @@ -607,7 +606,7 @@ TEST_F(TestFixedShapeTensorType, GetTensor) { // Get tensor from extension array with non-trivial permutation ASSERT_OK_AND_ASSIGN(auto expected_permuted_tensor, Tensor::Make(value_type_, Buffer::Wrap(element_values[i]), - {4, 3}, {8, 32}, {"y", "x"})); + {4, 3}, {8, 24}, {"y", "x"})); ASSERT_OK_AND_ASSIGN(scalar, permuted_array->GetScalar(i)); ASSERT_OK_AND_ASSIGN(auto actual_permuted_tensor, exact_permuted_ext_type->MakeTensor( diff --git a/cpp/src/arrow/extension/tensor_internal.cc b/cpp/src/arrow/extension/tensor_internal.cc index 02a3fdbbae7..2f3d8ae5d05 100644 --- a/cpp/src/arrow/extension/tensor_internal.cc +++ b/cpp/src/arrow/extension/tensor_internal.cc @@ -44,14 +44,44 @@ Status IsPermutationValid(const std::vector& permutation) { return Status::OK(); } -Result> ComputeStrides(const std::shared_ptr& value_type, - const std::vector& shape, - const std::vector& permutation) { - const auto& fw_type = checked_cast(*value_type); - std::vector strides; - ARROW_DCHECK_OK(internal::ComputeRowMajorStrides(fw_type, shape, &strides)); - // If the permutation is empty, the strides are already in the correct order. - internal::Permute(permutation, &strides); - return strides; +Status ComputeStrides(const std::shared_ptr& value_type, + const std::vector& shape, + const std::vector& permutation, + std::vector* strides) { + auto fixed_width_type = internal::checked_pointer_cast(value_type); + if (permutation.empty()) { + return internal::ComputeRowMajorStrides(*fixed_width_type.get(), shape, strides); + } + const int byte_width = value_type->byte_width(); + + int64_t remaining = 0; + if (!shape.empty() && shape.front() > 0) { + remaining = byte_width; + for (auto i : permutation) { + if (i > 0) { + if (internal::MultiplyWithOverflow(remaining, shape[i], &remaining)) { + return Status::Invalid( + "Strides computed from shape would not fit in 64-bit integer"); + } + } + } + } + + if (remaining == 0) { + strides->assign(shape.size(), byte_width); + return Status::OK(); + } + + strides->push_back(remaining); + for (auto i : permutation) { + if (i > 0) { + remaining /= shape[i]; + strides->push_back(remaining); + } + } + internal::Permute(permutation, strides); + + return Status::OK(); } + } // namespace arrow::internal diff --git a/cpp/src/arrow/extension/uuid_test.cc b/cpp/src/arrow/extension/uuid_test.cc index 1c1ffb6eb8e..e3fff0d49b9 100644 --- a/cpp/src/arrow/extension/uuid_test.cc +++ b/cpp/src/arrow/extension/uuid_test.cc @@ -29,8 +29,6 @@ namespace arrow { -using arrow::ipc::test::RoundtripBatch; - TEST(TestUuuidExtensionType, ExtensionTypeTest) { auto type = uuid(); ASSERT_EQ(type->id(), Type::EXTENSION); diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 739f2c8edf2..96e82689183 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -36,8 +36,7 @@ namespace rj = arrow::rapidjson; -namespace arrow { -namespace extension { +namespace arrow::extension { bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const { if (extension_name() != other.extension_name()) { @@ -206,35 +205,35 @@ std::shared_ptr VariableShapeTensorType::MakeArray( Result> VariableShapeTensorType::MakeTensor( const std::shared_ptr& scalar) { - const auto tensor_scalar = internal::checked_pointer_cast(scalar->value); - const auto ext_type = - internal::checked_pointer_cast(scalar->type); + const auto& tensor_scalar = internal::checked_cast(*scalar->value); + const auto& ext_type = + internal::checked_cast(*scalar->type); - ARROW_ASSIGN_OR_RAISE(const auto data_scalar, tensor_scalar->field(0)); - ARROW_ASSIGN_OR_RAISE(const auto shape_scalar, tensor_scalar->field(1)); - ARROW_CHECK(tensor_scalar->is_valid); + ARROW_ASSIGN_OR_RAISE(const auto data_scalar, tensor_scalar.field(0)); + ARROW_ASSIGN_OR_RAISE(const auto shape_scalar, tensor_scalar.field(1)); + ARROW_CHECK(tensor_scalar.is_valid); const auto data_array = internal::checked_pointer_cast(data_scalar)->value; const auto shape_array = internal::checked_pointer_cast( internal::checked_pointer_cast(shape_scalar)->value); - const auto value_type = - internal::checked_pointer_cast(ext_type->value_type()); + const auto& value_type = + internal::checked_cast(*ext_type.value_type()); if (data_array->null_count() > 0) { return Status::Invalid("Cannot convert data with nulls to Tensor."); } - auto permutation = ext_type->permutation(); + auto permutation = ext_type.permutation(); if (permutation.empty()) { - permutation.resize(ext_type->ndim()); + permutation.resize(ext_type.ndim()); std::iota(permutation.begin(), permutation.end(), 0); } - ARROW_CHECK_EQ(shape_array->length(), ext_type->ndim()); + ARROW_CHECK_EQ(shape_array->length(), ext_type.ndim()); std::vector shape; - shape.reserve(ext_type->ndim()); - for (int64_t j = 0; j < static_cast(ext_type->ndim()); ++j) { + shape.reserve(ext_type.ndim()); + for (int64_t j = 0; j < static_cast(ext_type.ndim()); ++j) { const auto size_value = shape_array->Value(j); if (size_value < 0) { return Status::Invalid("shape must have non-negative values"); @@ -242,16 +241,17 @@ Result> VariableShapeTensorType::MakeTensor( shape.push_back(std::move(size_value)); } - std::vector dim_names = ext_type->dim_names(); + std::vector dim_names = ext_type.dim_names(); if (!dim_names.empty()) { internal::Permute(permutation, &dim_names); } - ARROW_ASSIGN_OR_RAISE(std::vector strides, - internal::ComputeStrides(value_type, shape, permutation)); + std::vector strides; + ARROW_RETURN_NOT_OK( + internal::ComputeStrides(ext_type.value_type(), shape, permutation, &strides)); internal::Permute(permutation, &shape); - const auto byte_width = value_type->byte_width(); + const auto byte_width = value_type.byte_width(); const auto start_position = data_array->offset() * byte_width; const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), std::multiplies<>()); @@ -260,8 +260,8 @@ Result> VariableShapeTensorType::MakeTensor( const auto buffer, SliceBufferSafe(data_array->data()->buffers[1], start_position, size * byte_width)); - return Tensor::Make(value_type, std::move(buffer), std::move(shape), std::move(strides), - ext_type->dim_names()); + return Tensor::Make(ext_type.value_type(), std::move(buffer), std::move(shape), + std::move(strides), ext_type.dim_names()); } Result> VariableShapeTensorType::Make( @@ -311,5 +311,4 @@ std::shared_ptr variable_shape_tensor( return maybe_type.MoveValueUnsafe(); } -} // namespace extension -} // namespace arrow +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index 23c1ff731da..9c017f1ad19 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -42,8 +42,6 @@ namespace arrow { -using arrow::ipc::test::RoundtripBatch; - class Parametric1Array : public ExtensionArray { public: using ExtensionArray::ExtensionArray; From 70326530ab6ff74da2bcaef1254c11859f2e8eb4 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 11 Sep 2024 14:00:53 +0200 Subject: [PATCH 61/62] Post rebase changes --- .../arrow/extension/tensor_extension_array_test.cc | 5 +++-- cpp/src/arrow/extension/uuid_test.cc | 2 ++ cpp/src/arrow/extension_type.cc | 6 +++--- cpp/src/arrow/extension_type_test.cc | 2 ++ cpp/src/arrow/testing/gtest_util.cc | 14 -------------- cpp/src/arrow/testing/gtest_util.h | 3 --- 6 files changed, 10 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 440e90e42ef..5978f244148 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -36,6 +36,7 @@ namespace arrow { using FixedShapeTensorType = extension::FixedShapeTensorType; +using arrow::ipc::test::RoundtripBatch; using extension::fixed_shape_tensor; using extension::FixedShapeTensorArray; @@ -807,7 +808,7 @@ TEST_F(TestVariableShapeTensorType, RoudtripBatch) { std::shared_ptr read_batch; auto ext_field = field(/*name=*/"f0", /*type=*/ext_type_); auto batch = RecordBatch::Make(schema({ext_field}), ext_arr_->length(), {ext_arr_}); - RoundtripBatch(batch, &read_batch); + ASSERT_OK(RoundtripBatch(batch, &read_batch)); CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); // Pass extension metadata and storage array, expect getting back extension array @@ -818,7 +819,7 @@ TEST_F(TestVariableShapeTensorType, RoudtripBatch) { ext_field = field(/*name=*/"f0", /*type=*/ext_type_->storage_type(), /*nullable=*/true, /*metadata=*/ext_metadata); auto batch2 = RecordBatch::Make(schema({ext_field}), ext_arr_->length(), {ext_arr_}); - RoundtripBatch(batch2, &read_batch2); + ASSERT_OK(RoundtripBatch(batch2, &read_batch2)); CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true); } diff --git a/cpp/src/arrow/extension/uuid_test.cc b/cpp/src/arrow/extension/uuid_test.cc index e3fff0d49b9..1c1ffb6eb8e 100644 --- a/cpp/src/arrow/extension/uuid_test.cc +++ b/cpp/src/arrow/extension/uuid_test.cc @@ -29,6 +29,8 @@ namespace arrow { +using arrow::ipc::test::RoundtripBatch; + TEST(TestUuuidExtensionType, ExtensionTypeTest) { auto type = uuid(); ASSERT_EQ(type->id(), Type::EXTENSION); diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 31749601ca7..ce88c951741 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -29,9 +29,9 @@ #include "arrow/config.h" #include "arrow/extension/bool8.h" #ifdef ARROW_JSON -#include "arrow/extension/fixed_shape_tensor.h" -#include "arrow/extension/opaque.h" -#include "arrow/extension/variable_shape_tensor.h" +# include "arrow/extension/fixed_shape_tensor.h" +# include "arrow/extension/opaque.h" +# include "arrow/extension/variable_shape_tensor.h" #endif #include "arrow/extension/json.h" #include "arrow/extension/uuid.h" diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index 9c017f1ad19..0b256f1b45b 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -40,6 +40,8 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging_internal.h" +using arrow::ipc::test::RoundtripBatch; + namespace arrow { class Parametric1Array : public ExtensionArray { diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 284ac8775a9..1acc47a99d4 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -568,20 +568,6 @@ void ApproxCompareBatch(const RecordBatch& left, const RecordBatch& right, }); } -void RoundtripBatch(const std::shared_ptr& batch, - std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); - ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), - out_stream.get())); - - ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); - - io::BufferReader reader(complete_ipc_stream); - std::shared_ptr batch_reader; - ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); - ASSERT_OK(batch_reader->ReadNext(out)); -} - std::shared_ptr TweakValidityBit(const std::shared_ptr& array, int64_t index, bool validity) { auto data = array->data()->Copy(); diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 628aa7d58db..62bf907a2d8 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -318,9 +318,6 @@ ARROW_TESTING_EXPORT void ApproxCompareBatch( const RecordBatch& left, const RecordBatch& right, bool compare_metadata = true, const EqualOptions& options = TestingEqualOptions()); -ARROW_TESTING_EXPORT void RoundtripBatch(const std::shared_ptr& batch, - std::shared_ptr* out); - // Check if the padding of the buffers of the array is zero. // Also cause valgrind warnings if the padding bytes are uninitialized. ARROW_TESTING_EXPORT void AssertZeroPadded(const Array& array); From add340390553c953f93f09f30b187595e24d348f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 12 Nov 2025 14:06:13 +0100 Subject: [PATCH 62/62] post rebase fixes --- cpp/src/arrow/compute/kernels/CMakeLists.txt | 5 +- .../extension/tensor_extension_array_test.cc | 8 +-- cpp/src/arrow/extension/tensor_internal.cc | 5 +- cpp/src/arrow/extension/tensor_internal.h | 62 +------------------ .../arrow/extension/variable_shape_tensor.cc | 6 +- 5 files changed, 12 insertions(+), 74 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 92e9a240808..546a3e9ffe2 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -22,9 +22,8 @@ arrow_install_all_headers("arrow/compute/kernels") # Define arrow_compute_kernels_testing object library for common test files if(ARROW_TESTING) - add_library(arrow_compute_kernels_testing OBJECT test_util_internal.cc) - add_library(arrow_compute_kernels_testing OBJECT test_util.cc - ../../extension/tensor_extension_array_test.cc) + add_library(arrow_compute_kernels_testing OBJECT + test_util_internal.cc ../../extension/tensor_extension_array_test.cc) # Even though this is still just an object library we still need to "link" our # dependencies so that include paths are configured correctly target_link_libraries(arrow_compute_kernels_testing PUBLIC ${ARROW_GTEST_GMOCK}) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 5978f244148..2305d2a9e9d 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -29,9 +29,7 @@ #include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/key_value_metadata.h" -#include "arrow/util/sort.h" -#include "arrow/util/logging.h" - +#include "arrow/util/sort_internal.h" namespace arrow { @@ -159,8 +157,7 @@ TEST_F(TestFixedShapeTensorType, CreateFromArray) { ASSERT_EQ(ext_arr->null_count(), 0); } -<<<<<<< HEAD -TEST_F(TestExtensionType, MakeArrayCanGetCorrectScalarType) { +TEST_F(TestFixedShapeTensorType, MakeArrayCanGetCorrectScalarType) { ASSERT_OK_AND_ASSIGN(auto tensor, Tensor::Make(value_type_, Buffer::Wrap(values_), shape_)); @@ -182,7 +179,6 @@ TEST_F(TestExtensionType, MakeArrayCanGetCorrectScalarType) { ASSERT_TRUE(tensor->Equals(*tensor_from_array)); } -template void CheckSerializationRoundtrip(const std::shared_ptr& ext_type) { auto type = internal::checked_pointer_cast(ext_type); auto serialized = type->Serialize(); diff --git a/cpp/src/arrow/extension/tensor_internal.cc b/cpp/src/arrow/extension/tensor_internal.cc index 2f3d8ae5d05..a875adc55fc 100644 --- a/cpp/src/arrow/extension/tensor_internal.cc +++ b/cpp/src/arrow/extension/tensor_internal.cc @@ -20,11 +20,10 @@ #include "arrow/tensor.h" #include "arrow/util/checked_cast.h" #include "arrow/util/int_util_overflow.h" -#include "arrow/util/sort.h" +#include "arrow/util/sort_internal.h" #include "arrow/status.h" -#include "arrow/util/logging.h" -#include "arrow/util/print.h" +#include "arrow/util/print_internal.h" namespace arrow::internal { diff --git a/cpp/src/arrow/extension/tensor_internal.h b/cpp/src/arrow/extension/tensor_internal.h index cd583ec790a..1a0bd0b29c2 100644 --- a/cpp/src/arrow/extension/tensor_internal.h +++ b/cpp/src/arrow/extension/tensor_internal.h @@ -16,77 +16,21 @@ // under the License. #pragma once -#include "arrow/extension/tensor_internal.h" #include #include #include "arrow/array/array_nested.h" -#include "arrow/tensor.h" -#include "arrow/status.h" -#include "arrow/util/checked_cast.h" -#include "arrow/util/int_util_overflow.h" -#include "arrow/util/sort_internal.h" -#include "arrow/util/print_internal.h" namespace arrow::internal { ARROW_EXPORT -inline Status IsPermutationValid(const std::vector& permutation) { - const auto size = static_cast(permutation.size()); - std::vector dim_seen(size, 0); - - for (const auto p : permutation) { - if (p < 0 || p >= size || dim_seen[p] != 0) { - return Status::Invalid( - "Permutation indices for ", size, - " dimensional tensors must be unique and within [0, ", size - 1, - "] range. Got: ", ::arrow::internal::PrintVector{permutation, ","}); - } - dim_seen[p] = 1; - } - return Status::OK(); -} +Status IsPermutationValid(const std::vector& permutation); ARROW_EXPORT -inline Status ComputeStrides(const std::shared_ptr& value_type, +Status ComputeStrides(const std::shared_ptr& value_type, const std::vector& shape, const std::vector& permutation, - std::vector* strides) { - auto fixed_width_type = internal::checked_pointer_cast(value_type); - if (permutation.empty()) { - return internal::ComputeRowMajorStrides(*fixed_width_type.get(), shape, strides); - } - const int byte_width = value_type->byte_width(); - - int64_t remaining = 0; - if (!shape.empty() && shape.front() > 0) { - remaining = byte_width; - for (auto i : permutation) { - if (i > 0) { - if (internal::MultiplyWithOverflow(remaining, shape[i], &remaining)) { - return Status::Invalid( - "Strides computed from shape would not fit in 64-bit integer"); - } - } - } - } - - if (remaining == 0) { - strides->assign(shape.size(), byte_width); - return Status::OK(); - } - - strides->push_back(remaining); - for (auto i : permutation) { - if (i > 0) { - remaining /= shape[i]; - strides->push_back(remaining); - } - } - Permute(permutation, strides); - - return Status::OK(); -} + std::vector* strides); } // namespace arrow::internal diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 96e82689183..ee2dd165c56 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -26,9 +26,9 @@ #include "arrow/scalar.h" #include "arrow/tensor.h" #include "arrow/util/int_util_overflow.h" -#include "arrow/util/logging.h" -#include "arrow/util/print.h" -#include "arrow/util/sort.h" +#include "arrow/util/logging_internal.h" +#include "arrow/util/print_internal.h" +#include "arrow/util/sort_internal.h" #include "arrow/util/string.h" #include