From b205109a1afa2612188a6cf3360f96e1f044832a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 15 Aug 2023 03:23:25 +0200 Subject: [PATCH 01/18] Initial commit --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/extension/CMakeLists.txt | 4 +- ...test.cc => tensor_extension_array_test.cc} | 199 ++++++++++++++++-- .../arrow/extension/variable_shape_tensor.cc | 166 +++++++++++++++ .../arrow/extension/variable_shape_tensor.h | 91 ++++++++ cpp/src/arrow/extension_type.cc | 11 +- docs/source/format/CanonicalExtensions.rst | 70 ++++++ 7 files changed, 517 insertions(+), 25 deletions(-) rename cpp/src/arrow/extension/{fixed_shape_tensor_test.cc => tensor_extension_array_test.cc} (70%) create mode 100644 cpp/src/arrow/extension/variable_shape_tensor.cc create mode 100644 cpp/src/arrow/extension/variable_shape_tensor.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 9a611701153..20bb550bc5c 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -548,6 +548,7 @@ if(ARROW_JSON) list(APPEND ARROW_SRCS extension/fixed_shape_tensor.cc + extension/variable_shape_tensor.cc json/options.cc json/chunked_builder.cc json/chunker.cc diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index c15c42874d4..2e29e1f2b2e 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -17,8 +17,8 @@ add_arrow_test(test SOURCES - fixed_shape_tensor_test.cc + tensor_extension_array_test.cc PREFIX - "arrow-fixed-shape-tensor") + "arrow-canonical-extensions") arrow_install_all_headers("arrow/extension") diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc similarity index 70% rename from cpp/src/arrow/extension/fixed_shape_tensor_test.cc rename to cpp/src/arrow/extension/tensor_extension_array_test.cc index c3c97bc6e57..139fe8c119e 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc 
+++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -16,6 +16,7 @@ // under the License. #include "arrow/extension/fixed_shape_tensor.h" +#include "arrow/extension/variable_shape_tensor.h" #include "arrow/testing/matchers.h" @@ -35,6 +36,10 @@ using FixedShapeTensorType = extension::FixedShapeTensorType; using extension::fixed_shape_tensor; using extension::FixedShapeTensorArray; +using VariableShapeTensorType = extension::VariableShapeTensorType; +using extension::variable_shape_tensor; +using extension::VariableShapeTensorArray; + class TestExtensionType : public ::testing::Test { public: void SetUp() override { @@ -154,43 +159,47 @@ TEST_F(TestExtensionType, CreateFromArray) { ASSERT_EQ(ext_arr->null_count(), 0); } +template void CheckSerializationRoundtrip(const std::shared_ptr& ext_type) { - auto fst_type = internal::checked_pointer_cast(ext_type); - auto serialized = fst_type->Serialize(); + auto type = internal::checked_pointer_cast(ext_type); + auto serialized = type->Serialize(); ASSERT_OK_AND_ASSIGN(auto deserialized, - fst_type->Deserialize(fst_type->storage_type(), serialized)); - ASSERT_TRUE(fst_type->Equals(*deserialized)); + type->Deserialize(type->storage_type(), serialized)); + ASSERT_TRUE(type->Equals(*deserialized)); } -void CheckDeserializationRaises(const std::shared_ptr& storage_type, +void CheckDeserializationRaises(const std::shared_ptr& extension_type, + const std::shared_ptr& storage_type, const std::string& serialized, const std::string& expected_message) { - auto fst_type = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), {3, 4})); + auto ext_type = internal::checked_pointer_cast(extension_type); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr(expected_message), - fst_type->Deserialize(storage_type, serialized)); + ext_type->Deserialize(storage_type, serialized)); } TEST_F(TestExtensionType, MetadataSerializationRoundtrip) { - CheckSerializationRoundtrip(ext_type_); - 
CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {}, {}, {})); - CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {0}, {}, {})); - CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {1}, {0}, {"x"})); - CheckSerializationRoundtrip( + using T = FixedShapeTensorType; + CheckSerializationRoundtrip(ext_type_); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {}, {}, {})); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {0}, {}, {})); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {1}, {0}, {"x"})); + CheckSerializationRoundtrip( fixed_shape_tensor(value_type_, {256, 256, 3}, {0, 1, 2}, {"H", "W", "C"})); - CheckSerializationRoundtrip( + CheckSerializationRoundtrip( fixed_shape_tensor(value_type_, {256, 256, 3}, {2, 0, 1}, {"C", "H", "W"})); auto storage_type = fixed_size_list(int64(), 12); - CheckDeserializationRaises(boolean(), R"({"shape":[3,4]})", + CheckDeserializationRaises(ext_type_, boolean(), R"({"shape":[3,4]})", "Expected FixedSizeList storage type, got bool"); - CheckDeserializationRaises(storage_type, R"({"dim_names":["x","y"]})", + CheckDeserializationRaises(ext_type_, storage_type, R"({"dim_names":["x","y"]})", "Invalid serialized JSON data"); - CheckDeserializationRaises(storage_type, R"({"shape":(3,4)})", + CheckDeserializationRaises(ext_type_, storage_type, R"({"shape":(3,4)})", "Invalid serialized JSON data"); - CheckDeserializationRaises(storage_type, R"({"shape":[3,4],"permutation":[1,0,2]})", + CheckDeserializationRaises(ext_type_, storage_type, + R"({"shape":[3,4],"permutation":[1,0,2]})", "Invalid permutation"); - CheckDeserializationRaises(storage_type, R"({"shape":[3],"dim_names":["x","y"]})", + CheckDeserializationRaises(ext_type_, storage_type, + R"({"shape":[3],"dim_names":["x","y"]})", "Invalid dim_names"); } @@ -434,4 +443,156 @@ TEST_F(TestExtensionType, ComputeStrides) { ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); } +class 
TestVariableShapeTensorType : public ::testing::Test { + public: + void SetUp() override { + ndim_ = 3; + value_type_ = int64(); + permutation_ = {0, 1, 2}; + dim_names_ = {"x", "y", "z"}; + ext_type_ = internal::checked_pointer_cast( + variable_shape_tensor(value_type_, ndim_, permutation_, dim_names_)); + shapes_ = + ArrayFromJSON(fixed_size_list(uint32(), ndim_), "[[2,3,1],[1,2,2],[3,1,3]]"); + data_ = ArrayFromJSON(list(value_type_), + "[[0,1,2,3,4,5],[6,7,8,9],[10,11,12,13,14,15,16,17,18]]"); + serialized_ = R"({"permutation":[0,1,2],"dim_names":["x","y","z"]})"; + storage_arr_ = ArrayFromJSON( + ext_type_->storage_type(), + R"([[[2,3,1],[0,1,2,3,4,5]],[[1,2,2],[6,7,8,9]],[[3,1,3],[10,11,12,13,14,15,16,17,18]]])"); + ext_arr_ = internal::checked_pointer_cast( + ExtensionType::WrapArray(ext_type_, storage_arr_)); + } + + protected: + uint32_t ndim_; + std::shared_ptr value_type_; + std::vector permutation_; + std::vector dim_names_; + std::shared_ptr ext_type_; + std::shared_ptr shapes_; + std::shared_ptr data_; + std::string serialized_; + std::shared_ptr storage_arr_; + std::shared_ptr ext_arr_; +}; + +TEST_F(TestVariableShapeTensorType, CheckDummyRegistration) { + // We need a registered dummy type at runtime to allow for IPC deserialization + auto registered_type = GetExtensionType("arrow.variable_shape_tensor"); + ASSERT_TRUE(registered_type->type_id == Type::EXTENSION); +} + +TEST_F(TestVariableShapeTensorType, CreateExtensionType) { + auto exact_ext_type = + internal::checked_pointer_cast(ext_type_); + + // Test ExtensionType methods + ASSERT_EQ(ext_type_->extension_name(), "arrow.variable_shape_tensor"); + ASSERT_TRUE(ext_type_->Equals(*exact_ext_type)); + auto expected_type = struct_({ + ::arrow::field("shape", fixed_size_list(uint32(), ndim_)), + ::arrow::field("data", list(value_type_)), + }); + + ASSERT_TRUE(ext_type_->storage_type()->Equals(*expected_type)); + ASSERT_EQ(ext_type_->Serialize(), serialized_); + ASSERT_OK_AND_ASSIGN(auto ds, + 
ext_type_->Deserialize(ext_type_->storage_type(), serialized_)); + auto deserialized = internal::checked_pointer_cast(ds); + ASSERT_TRUE(deserialized->Equals(*exact_ext_type)); + ASSERT_TRUE(deserialized->Equals(*ext_type_)); + + // Test FixedShapeTensorType methods + ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); + ASSERT_EQ(exact_ext_type->ndim(), ndim_); + ASSERT_EQ(exact_ext_type->value_type(), value_type_); + ASSERT_EQ(exact_ext_type->permutation(), permutation_); + ASSERT_EQ(exact_ext_type->dim_names(), dim_names_); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr("Invalid: permutation size must match ndim. Expected: 3 Got: 1"), + VariableShapeTensorType::Make(value_type_, ndim_, {0})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Invalid: dim_names size must match ndim."), + VariableShapeTensorType::Make(value_type_, ndim_, {}, {"x"})); +} + +TEST_F(TestVariableShapeTensorType, EqualsCases) { + auto ext_type_permutation_1 = variable_shape_tensor(int64(), 2, {0, 1}, {"x", "y"}); + auto ext_type_permutation_2 = variable_shape_tensor(int64(), 2, {1, 0}, {"x", "y"}); + auto ext_type_no_permutation = variable_shape_tensor(int64(), 2, {}, {"x", "y"}); + + ASSERT_TRUE(ext_type_permutation_1->Equals(ext_type_permutation_1)); + + ASSERT_FALSE( + variable_shape_tensor(int32(), 2, {}, {"x", "y"})->Equals(ext_type_no_permutation)); + ASSERT_FALSE(variable_shape_tensor(int64(), 2, {}, {}) + ->Equals(variable_shape_tensor(int64(), 3, {}, {}))); + ASSERT_FALSE( + variable_shape_tensor(int64(), 2, {}, {"H", "W"})->Equals(ext_type_no_permutation)); + + ASSERT_TRUE(ext_type_no_permutation->Equals(ext_type_permutation_1)); + ASSERT_TRUE(ext_type_permutation_1->Equals(ext_type_no_permutation)); + ASSERT_FALSE(ext_type_no_permutation->Equals(ext_type_permutation_2)); + ASSERT_FALSE(ext_type_permutation_2->Equals(ext_type_no_permutation)); + ASSERT_FALSE(ext_type_permutation_1->Equals(ext_type_permutation_2)); + 
ASSERT_FALSE(ext_type_permutation_2->Equals(ext_type_permutation_1));
+}
+
+TEST_F(TestVariableShapeTensorType, CreateFromArray) {
+  std::vector<std::string> field_names = {"shape", "data"};
+  ASSERT_OK_AND_ASSIGN(auto storage_arr,
+                       StructArray::Make({shapes_, data_}, field_names));
+  auto arr = ExtensionType::WrapArray(ext_type_, storage_arr);
+  ASSERT_TRUE(ext_arr_->Equals(*arr));
+}
+
+TEST_F(TestVariableShapeTensorType, MetadataSerializationRoundtrip) {
+  using T = VariableShapeTensorType;
+
+  CheckSerializationRoundtrip<T>(ext_type_);
+  CheckSerializationRoundtrip<T>(variable_shape_tensor(value_type_, {}, {}, {}));
+  CheckSerializationRoundtrip<T>(variable_shape_tensor(value_type_, {0}, {}, {}));
+  CheckSerializationRoundtrip<T>(variable_shape_tensor(value_type_, {1}, {0}, {"x"}));
+  CheckSerializationRoundtrip<T>(
+      variable_shape_tensor(value_type_, 3, {0, 1, 2}, {"H", "W", "C"}));
+  CheckSerializationRoundtrip<T>(
+      variable_shape_tensor(value_type_, 3, {2, 0, 1}, {"C", "H", "W"}));
+
+  auto storage_type = ext_type_->storage_type();
+  CheckDeserializationRaises<T>(ext_type_, boolean(), R"({"shape":[3,4]})",
+                                "Expected Struct storage type, got bool");
+  CheckDeserializationRaises<T>(ext_type_, storage_type, R"({"shape":(3,4)})",
+                                "Invalid serialized JSON data");
+  CheckDeserializationRaises<T>(ext_type_, storage_type, R"({"permutation":[1,0]})",
+                                "Invalid permutation");
+  CheckDeserializationRaises<T>(ext_type_, storage_type, R"({"dim_names":["x","y"]})",
+                                "Invalid dim_names");
+}
+
+TEST_F(TestVariableShapeTensorType, RoundtripBatch) {
+  auto exact_ext_type =
+      internal::checked_pointer_cast<VariableShapeTensorType>(ext_type_);
+
+  // Pass extension array, expect getting back extension array
+  std::shared_ptr<RecordBatch> read_batch;
+  auto ext_field = field(/*name=*/"f0", /*type=*/ext_type_);
+  auto batch = RecordBatch::Make(schema({ext_field}), ext_arr_->length(), {ext_arr_});
+  RoundtripBatch(batch, &read_batch);
+  CompareBatch(*batch, *read_batch, /*compare_metadata=*/true);
+
+  // Pass extension metadata and storage array,
expect getting back extension array + std::shared_ptr read_batch2; + auto ext_metadata = + key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, + {"ARROW:extension:metadata", serialized_}}); + ext_field = field(/*name=*/"f0", /*type=*/ext_type_->storage_type(), /*nullable=*/true, + /*metadata=*/ext_metadata); + auto batch2 = RecordBatch::Make(schema({ext_field}), ext_arr_->length(), {ext_arr_}); + RoundtripBatch(batch2, &read_batch2); + CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true); +} + } // namespace arrow diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc new file mode 100644 index 00000000000..65062132e5c --- /dev/null +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "arrow/extension/variable_shape_tensor.h" + +#include "arrow/array/array_nested.h" +#include "arrow/array/array_primitive.h" +#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep +#include "arrow/tensor.h" +#include "arrow/util/int_util_overflow.h" +#include "arrow/util/logging.h" +#include "arrow/util/sort.h" + +#include +#include + +namespace rj = arrow::rapidjson; + +namespace arrow { + +namespace extension { + +namespace {} // namespace + +bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const { + if (extension_name() != other.extension_name()) { + return false; + } + const auto& other_ext = static_cast(other); + if (this->ndim() != other_ext.ndim()) { + return false; + } + + auto is_permutation_trivial = [](const std::vector& permutation) { + for (size_t i = 1; i < permutation.size(); ++i) { + if (permutation[i - 1] + 1 != permutation[i]) { + return false; + } + } + return true; + }; + const bool permutation_equivalent = + ((permutation_ == other_ext.permutation()) || + (permutation_.empty() && is_permutation_trivial(other_ext.permutation())) || + (is_permutation_trivial(permutation_) && other_ext.permutation().empty())); + + return (storage_type()->Equals(other_ext.storage_type())) && + (dim_names_ == other_ext.dim_names()) && permutation_equivalent; +} + +std::string VariableShapeTensorType::Serialize() const { + rj::Document document; + document.SetObject(); + rj::Document::AllocatorType& allocator = document.GetAllocator(); + + if (!permutation_.empty()) { + rj::Value permutation(rj::kArrayType); + for (auto v : permutation_) { + permutation.PushBack(v, allocator); + } + document.AddMember(rj::Value("permutation", allocator), permutation, allocator); + } + + if (!dim_names_.empty()) { + rj::Value dim_names(rj::kArrayType); + for (std::string v : dim_names_) { + dim_names.PushBack(rj::Value{}.SetString(v.c_str(), allocator), allocator); + } + document.AddMember(rj::Value("dim_names", allocator), 
dim_names, allocator); + } + + rj::StringBuffer buffer; + rj::Writer writer(buffer); + document.Accept(writer); + return buffer.GetString(); +} + +Result> VariableShapeTensorType::Deserialize( + std::shared_ptr storage_type, const std::string& serialized_data) const { + if (storage_type->id() != Type::STRUCT) { + return Status::Invalid("Expected Struct storage type, got ", + storage_type->ToString()); + } + auto value_type = storage_type->field(1)->type()->field(0)->type(); + const size_t ndim = + std::static_pointer_cast(storage_type->field(0)->type()) + ->list_size(); + + rj::Document document; + if (document.Parse(serialized_data.data(), serialized_data.length()).HasParseError()) { + return Status::Invalid("Invalid serialized JSON data: ", serialized_data); + } + + std::vector permutation; + if (document.HasMember("permutation")) { + for (auto& x : document["permutation"].GetArray()) { + permutation.emplace_back(x.GetInt64()); + } + if (permutation.size() != ndim) { + return Status::Invalid("Invalid permutation"); + } + } + std::vector dim_names; + if (document.HasMember("dim_names")) { + for (auto& x : document["dim_names"].GetArray()) { + dim_names.emplace_back(x.GetString()); + } + if (dim_names.size() != ndim) { + return Status::Invalid("Invalid dim_names"); + } + } + + return variable_shape_tensor(value_type, static_cast(ndim), permutation, + dim_names); +} + +std::shared_ptr VariableShapeTensorType::MakeArray( + std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("arrow.variable_shape_tensor", + static_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +Result> VariableShapeTensorType::Make( + const std::shared_ptr& value_type, const uint32_t& ndim, + const std::vector& permutation, const std::vector& dim_names) { + if (!permutation.empty() && permutation.size() != ndim) { + return Status::Invalid("permutation size must match ndim. 
Expected: ", ndim, + " Got: ", permutation.size()); + } + if (!dim_names.empty() && dim_names.size() != ndim) { + return Status::Invalid("dim_names size must match ndim. Expected: ", ndim, + " Got: ", dim_names.size()); + } + return std::make_shared(value_type, ndim, permutation, + dim_names); +} + +std::shared_ptr variable_shape_tensor( + const std::shared_ptr& value_type, const uint32_t& ndim, + const std::vector& permutation, const std::vector& dim_names) { + auto maybe_type = + VariableShapeTensorType::Make(value_type, ndim, permutation, dim_names); + ARROW_DCHECK_OK(maybe_type.status()); + return maybe_type.MoveValueUnsafe(); +} + +} // namespace extension +} // namespace arrow diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h new file mode 100644 index 00000000000..2d981222ea8 --- /dev/null +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/extension_type.h" + +namespace arrow { +namespace extension { + +class ARROW_EXPORT VariableShapeTensorArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +/// \brief Concrete type class for variable-shape Tensor data. +/// This is a canonical arrow extension type. +/// See: https://arrow.apache.org/docs/format/CanonicalExtensions.html +class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { + public: + VariableShapeTensorType(const std::shared_ptr& value_type, + const uint32_t& ndim, + const std::vector& permutation = {}, + const std::vector& dim_names = {}) + : ExtensionType(struct_({::arrow::field("shape", fixed_size_list(uint32(), ndim)), + ::arrow::field("data", list(value_type))})), + value_type_(value_type), + permutation_(permutation), + dim_names_(dim_names) {} + + std::string extension_name() const override { return "arrow.variable_shape_tensor"; } + + /// Number of dimensions of tensor elements + uint32_t ndim() const { + std::shared_ptr storage_type = this->storage_type()->field(0)->type(); + return std::static_pointer_cast(storage_type)->list_size(); + } + + /// Value type of tensor elements + const std::shared_ptr value_type() const { return value_type_; } + + /// Permutation mapping from logical to physical memory layout of tensor elements + const std::vector& permutation() const { return permutation_; } + + /// Dimension names of tensor elements. Dimensions are ordered physically. 
+ const std::vector& dim_names() const { return dim_names_; } + + bool ExtensionEquals(const ExtensionType& other) const override; + + std::string Serialize() const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + + /// Create a VariableShapeTensorArray from ArrayData + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + /// \brief Create a VariableShapeTensorType instance + static Result> Make( + const std::shared_ptr& value_type, const uint32_t& ndim, + const std::vector& permutation = {}, + const std::vector& dim_names = {}); + + private: + std::shared_ptr storage_type_; + std::shared_ptr value_type_; + std::vector permutation_; + std::vector dim_names_; +}; + +/// \brief Return a VariableShapeTensorType instance. +ARROW_EXPORT std::shared_ptr variable_shape_tensor( + const std::shared_ptr& value_type, const uint32_t& ndim, + const std::vector& permutation = {}, + const std::vector& dim_names = {}); + +} // namespace extension +} // namespace arrow diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 1199336763d..b3a3dfc6ef6 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -29,6 +29,7 @@ #include "arrow/config.h" #ifdef ARROW_JSON #include "arrow/extension/fixed_shape_tensor.h" +#include "arrow/extension/variable_shape_tensor.h" #endif #include "arrow/status.h" #include "arrow/type.h" @@ -146,10 +147,12 @@ static void CreateGlobalRegistry() { #ifdef ARROW_JSON // Register canonical extension types - auto ext_type = - checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); - - ARROW_CHECK_OK(g_registry->RegisterType(ext_type)); + auto ext_types = {extension::fixed_shape_tensor(int64(), {}), + extension::variable_shape_tensor(int64(), 0)}; + for (const auto& ext_type : ext_types) { + ARROW_CHECK_OK( + g_registry->RegisterType(checked_pointer_cast(ext_type))); + } #endif } diff --git 
a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 9f7948cbfe9..d6d91929edb 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -148,6 +148,76 @@ Fixed shape tensor by this specification. Instead, this extension type lets one use fixed shape tensors as elements in a field of a RecordBatch or a Table. +.. _variable_shape_tensor_extension: + +Variable shape tensor +===================== + +* Extension name: `arrow.variable_shape_tensor`. + +* The storage type of the extension is: ``StructArray`` where struct + is composed of **data** and **shape** fields describing a single + tensor per row: + + * **data** is a ``List`` holding tensor elements of a single tensor. + Data type of the list elements is uniform across the entire column + and also provided in metadata. + * **shape** is a ``FixedSizeList[ndim]`` of the tensor shape where + the size of the list ``ndim`` is equal to the number of dimensions of the + tensor. + +* Extension type parameters: + + * **value_type** = the Arrow data type of individual tensor elements. + + Optional parameters describing the logical layout: + + * **dim_names** = explicit names to tensor dimensions + as an array. The length of it should be equal to the shape + length and equal to the number of dimensions. + + ``dim_names`` can be used if the dimensions have well-known + names and they map to the physical layout (row-major). + + * **permutation** = indices of the desired ordering of the + original dimensions, defined as an array. + + The indices contain a permutation of the values [0, 1, .., N-1] where + N is the number of dimensions. The permutation indicates which + dimension of the logical layout corresponds to which dimension of the + physical tensor (the i-th dimension of the logical view corresponds + to the dimension with number ``permutations[i]`` of the physical tensor). 
+ + Permutation can be useful in case the logical order of + the tensor is a permutation of the physical order (row-major). + + When logical and physical layout are equal, the permutation will always + be ([0, 1, .., N-1]) and can therefore be left out. + +* Description of the serialization: + + The metadata must be a valid JSON object including number of + dimensions of the contained tensors as an integer with key **"ndim"** + plus optional dimension names with keys **"dim_names"** and ordering of + the dimensions with key **"permutation"**. + + - Example with ``dim_names`` metadata for NCHW ordered data: + + ``{ "dim_names": ["C", "H", "W"] }`` + + - Example of permuted 3-dimensional tensor: + + ``{ "permutation": [2, 0, 1] }`` + + This is the physical layout shape and the shape of the logical + layout would given an individual tensor of shape [100, 200, 500] + be ``[500, 100, 200]``. + +.. note:: + + Elements in a variable shape tensor extension array are stored + in row-major/C-contiguous order. + ========================= Community Extension Types ========================= From e317bf41162c0d23af646ad986c7ae0d49d12ca9 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 25 Aug 2023 15:59:44 +0200 Subject: [PATCH 02/18] Python wrapper --- docs/source/python/api/arrays.rst | 1 + python/pyarrow/__init__.py | 5 +- python/pyarrow/array.pxi | 133 ++++++++++++++ python/pyarrow/includes/libarrow.pxd | 21 +++ python/pyarrow/lib.pxd | 5 + python/pyarrow/public-api.pxi | 2 + python/pyarrow/tests/test_extension_type.py | 134 ++++++++++++++ python/pyarrow/types.pxi | 185 ++++++++++++++++++++ 8 files changed, 485 insertions(+), 1 deletion(-) diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index 73b5e063ff1..007a931c652 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -81,6 +81,7 @@ may expose data type-specific methods or properties. 
UnionArray ExtensionArray FixedShapeTensorArray + VariableShapeTensorArray .. _api.scalar: diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index ee0d07bb2c8..f57ca74f91a 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -171,6 +171,7 @@ def print_entry(label, value): dictionary, run_end_encoded, fixed_shape_tensor, + variable_shape_tensor, field, type_for_alias, DataType, DictionaryType, StructType, @@ -180,7 +181,8 @@ def print_entry(label, value): FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, RunEndEncodedType, FixedShapeTensorType, - PyExtensionType, UnknownExtensionType, + VariableShapeTensorType, PyExtensionType, + UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, KeyValueMetadata, @@ -212,6 +214,7 @@ def print_entry(label, value): MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, RunEndEncodedArray, FixedShapeTensorArray, + VariableShapeTensorArray, scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index e36d8b2f043..d427ae2c9a5 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3507,6 +3507,139 @@ class FixedShapeTensorArray(ExtensionArray): ) +class VariableShapeTensorArray(ExtensionArray): + """ + Concrete class for variable shape tensor extension arrays. 
+ + Examples + -------- + Define the extension type for tensor array + + >>> import pyarrow as pa + >>> tensor_type = pa.variable_shape_tensor(pa.int32(), 2) + + Create an extension array + + >>> shapes = pa.array([[2, 3], [1, 2]], pa.list_(pa.uint32(), 2)) + >>> values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(pa.int32())) + >>> arr = pa.StructArray.from_arrays([shapes, values], names=["shape", "data"]) + >>> pa.ExtensionArray.from_storage(tensor_type, arr) + + -- is_valid: all not null + -- child 0 type: fixed_size_list[2] + [ + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + -- child 1 type: list + [ + [ + 1, + 2, + 3, + 4, + 5, + 6 + ], + [ + 7, + 8 + ] + ] + """ + + def to_numpy_ndarray(self): + """ + Convert variable shape tensor extension array to list of numpy arrays. + + Note: ``permutation`` should be trivial (``None`` or ``[0, 1, ..., len(shape)-1]``). + """ + if self.type.permutation is None or self.type.permutation == list(range(len(self.type.shape))): + storage_iterator = zip(self.storage.field(0), self.storage.field(1)) + return [np.array(v.values.to_numpy()).reshape(s.values.to_numpy()) for s, v in storage_iterator] + else: + raise ValueError( + 'Only non-permuted tensors can be converted to numpy tensors.') + + @staticmethod + def from_numpy_ndarray(obj): + """ + Convert a list of numpy arrays ndarrays to a variable shape tensor extension array. + The length of the list will become the length of the variable shape tensor array. + + Numpy arrays needs to be C-contiguous in memory (``obj.flags["C_CONTIGUOUS"]==True``). + + Parameters + ---------- + obj : list(numpy.ndarray) + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> ndarray_list = [ + ... np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32), + ... np.array([[7, 8]], dtype=np.float32), + ... 
] + >>> pa.VariableShapeTensorArray.from_numpy_ndarray(ndarray_list) + + -- is_valid: all not null + -- child 0 type: fixed_size_list[2] + [ + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + -- child 1 type: list + [ + [ + 1, + 2, + 3, + 4, + 5, + 6 + ], + [ + 7, + 8 + ] + ] + """ + if not all([o.flags["C_CONTIGUOUS"] for o in obj]): + raise ValueError('The data in the numpy arrays need to be in a single, ' + 'C-style contiguous segment.') + numpy_type = obj[0].dtype + ndim = obj[0].ndim + + if not all([o.dtype == numpy_type for o in obj]): + raise ValueError('All numpy arrays need to have the same dtype.') + + if not all([o.ndim == ndim for o in obj]): + raise ValueError('All numpy arrays need to have the same ndim.') + + arrow_type = from_numpy_dtype(numpy_type) + values = array([np.ravel(o, order='C') for o in obj], list_(arrow_type)) + shapes = array([o.shape for o in obj], list_(uint32(), list_size=ndim)) + struct_arr = StructArray.from_arrays([shapes, values], names=["shape", "data"]) + + return ExtensionArray.from_storage( + variable_shape_tensor(arrow_type, ndim), + struct_arr + ) + + cdef dict _array_classes = { _Type_NA: NullArray, _Type_BOOL: BooleanArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index f4d6541fa72..bcc145766a9 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2634,6 +2634,27 @@ cdef extern from "arrow/extension_type.h" namespace "arrow": shared_ptr[CArray] storage() +cdef extern from "arrow/extension/variable_shape_tensor.h" namespace "arrow::extension": + cdef cppclass CVariableShapeTensorType \ + " arrow::extension::VariableShapeTensorType"(CExtensionType): + + @staticmethod + CResult[shared_ptr[CDataType]] Make(const shared_ptr[CDataType]& value_type, + const uint32_t ndim, + const vector[int64_t]& permutation, + const vector[c_string]& dim_names) + + CResult[shared_ptr[CDataType]] Deserialize(const shared_ptr[CDataType] storage_type, + const 
c_string& serialized_data) const + + c_string Serialize() const + + const shared_ptr[CDataType] value_type() + const uint32_t ndim() + const vector[int64_t] permutation() + const vector[c_string] dim_names() + + cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension": cdef cppclass CFixedShapeTensorType \ " arrow::extension::FixedShapeTensorType"(CExtensionType): diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 63ebe6aea82..51d6e213986 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -200,6 +200,11 @@ cdef class ExtensionType(BaseExtensionType): const CPyExtensionType* cpy_ext_type +cdef class VariableShapeTensorType(BaseExtensionType): + cdef: + const CVariableShapeTensorType* tensor_ext_type + + cdef class FixedShapeTensorType(BaseExtensionType): cdef: const CFixedShapeTensorType* tensor_ext_type diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 72e16f2cec3..4a1a2958491 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -118,6 +118,8 @@ cdef api object pyarrow_wrap_data_type( cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type) if cpy_ext_type != nullptr: return cpy_ext_type.GetInstance() + elif ext_type.extension_name() == b"arrow.variable_shape_tensor": + out = VariableShapeTensorType.__new__(VariableShapeTensorType) elif ext_type.extension_name() == b"arrow.fixed_shape_tensor": out = FixedShapeTensorType.__new__(FixedShapeTensorType) else: diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 1eb7d5fa761..ac7248eed73 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1212,6 +1212,39 @@ def test_tensor_type(): assert tensor_type.dim_names == ['C', 'H', 'W'] assert tensor_type.permutation is None + tensor_type = pa.variable_shape_tensor(pa.int8(), 2) + expected_storage_type = pa.struct([ + pa.field("shape", 
pa.list_(pa.uint32(), 2)), + pa.field("data", pa.list_(pa.int8())) + ]) + assert tensor_type.extension_name == "arrow.variable_shape_tensor" + assert tensor_type.storage_type == expected_storage_type + assert tensor_type.ndim == 2 + assert tensor_type.dim_names is None + assert tensor_type.permutation is None + + tensor_type = pa.variable_shape_tensor(pa.int64(), 3, dim_names=['C', 'H', 'W']) + expected_storage_type = pa.struct([ + pa.field("shape", pa.list_(pa.uint32(), 3)), + pa.field("data", pa.list_(pa.int64())) + ]) + assert tensor_type.extension_name == "arrow.variable_shape_tensor" + assert tensor_type.storage_type == expected_storage_type + assert tensor_type.ndim == 3 + assert tensor_type.dim_names == ['C', 'H', 'W'] + assert tensor_type.permutation is None + + tensor_type = pa.variable_shape_tensor(pa.bool_(), 2, permutation=[1, 0]) + expected_storage_type = pa.struct([ + pa.field("shape", pa.list_(pa.uint32(), 2)), + pa.field("data", pa.list_(pa.bool_())) + ]) + assert tensor_type.extension_name == "arrow.variable_shape_tensor" + assert tensor_type.storage_type == expected_storage_type + assert tensor_type.ndim == 2 + assert tensor_type.dim_names is None + assert tensor_type.permutation == [1, 0] + def test_tensor_class_methods(): tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3]) @@ -1248,6 +1281,45 @@ def test_tensor_class_methods(): arr.to_numpy_ndarray() +@pytest.mark.parametrize("value_type", (np.int8, np.int32, np.int64, np.float64)) +def test_variable_shape_tensor_class_method(value_type): + ndim = 2 + shape_type = pa.list_(pa.uint32(), ndim) + arrow_type = pa.from_numpy_dtype(value_type) + tensor_type = pa.variable_shape_tensor(arrow_type, ndim) + fields = [pa.field("shape", shape_type), pa.field("data", pa.list_(arrow_type))] + + shapes = pa.array([[2, 3], [1, 2]], shape_type) + values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(arrow_type)) + struct_arr = pa.StructArray.from_arrays([shapes, values], fields=fields) + arr = 
pa.ExtensionArray.from_storage(tensor_type, struct_arr) + + storage = pa.array( + [([2, 3], [1, 2, 3, 4, 5, 6]), ([1, 2], [7, 8])], type=pa.struct(fields) + ) + assert pa.ExtensionArray.from_storage(tensor_type, storage).equals(arr) + + assert arr.type == tensor_type + + ndarray_list = [ + np.array([[1, 2, 3], [4, 5, 6]], dtype=value_type), + np.array([[7, 8]], dtype=value_type), + ] + assert all(zip(x == y for x, y in zip(arr.to_numpy_ndarray(), ndarray_list))) + + from_ndarray_list = pa.VariableShapeTensorArray.from_numpy_ndarray(ndarray_list) + assert from_ndarray_list.equals(arr) + + assert pa.VariableShapeTensorArray.from_numpy_ndarray( + arr.to_numpy_ndarray() + ).equals(arr) + + assert arr.to_pylist() == [ + {"data": [1, 2, 3, 4, 5, 6], "shape": [2, 3]}, + {"data": [7, 8], "shape": [1, 2]}, + ] + + @pytest.mark.parametrize("tensor_type", ( pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]), pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 2, 1]), @@ -1278,6 +1350,46 @@ def test_tensor_type_ipc(tensor_type): assert result.type.shape == [2, 2, 3] +@pytest.mark.parametrize("tensor_type", ( + pa.variable_shape_tensor(pa.int8(), 2), + pa.variable_shape_tensor(pa.int8(), 2, permutation=[1, 0]), + pa.variable_shape_tensor(pa.int8(), 2, dim_names=['H', 'W']) +)) +def test_variable_shape_tensor_type_ipc(tensor_type): + shape_type = tensor_type.storage_type.field(0).type + values_type = tensor_type.storage_type.field(1).type + shapes = pa.array([[2, 3], [1, 2]], shape_type) + values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], values_type) + + struct_arr = pa.StructArray.from_arrays([shapes, values], names=["shape", "data"]) + arr = pa.ExtensionArray.from_storage(tensor_type, struct_arr) + batch = pa.RecordBatch.from_arrays([arr], ["ext"]) + + # check the built array has exactly the expected class + tensor_class = tensor_type.__arrow_ext_class__() + assert isinstance(arr, tensor_class) + + buf = ipc_write_batch(batch) + del batch + batch = ipc_read_batch(buf) + + 
result = batch.column(0) + # check the deserialized array class is the expected one + assert isinstance(result, tensor_class) + assert result.type.extension_name == "arrow.variable_shape_tensor" + assert arr.storage.to_pylist() == [ + {"data": [1, 2, 3, 4, 5, 6], "shape": [2, 3]}, + {"data": [7, 8], "shape": [1, 2]}, + ] + + # we get back an actual TensorType + assert isinstance(result.type, pa.VariableShapeTensorType) + assert result.type.value_type == pa.int8() + assert result.type.ndim == 2 + assert result.type.permutation == tensor_type.permutation + assert result.type.dim_names == tensor_type.dim_names + + def test_tensor_type_equality(): tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]) assert tensor_type.extension_name == "arrow.fixed_shape_tensor" @@ -1287,6 +1399,14 @@ def test_tensor_type_equality(): assert tensor_type == tensor_type2 assert not tensor_type == tensor_type3 + tensor_type = pa.variable_shape_tensor(pa.int8(), 2) + assert tensor_type.extension_name == "arrow.variable_shape_tensor" + + tensor_type2 = pa.variable_shape_tensor(pa.int8(), 2) + tensor_type3 = pa.variable_shape_tensor(pa.uint8(), 2) + assert tensor_type == tensor_type2 + assert not tensor_type == tensor_type3 + @pytest.mark.pandas def test_extension_to_pandas_storage_type(registered_period_type): @@ -1351,3 +1471,17 @@ def test_tensor_type_is_picklable(pickle_module): result = pickle_module.loads(pickle_module.dumps(expected_arr)) assert result == expected_arr + + expected_type = pa.variable_shape_tensor(pa.int32(), 2) + result = pickle_module.loads(pickle_module.dumps(expected_type)) + + assert result == expected_type + + shapes = pa.array([[2, 3], [1, 2]], pa.list_(pa.uint32(), 2)) + values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(pa.int32())) + arr = pa.StructArray.from_arrays([shapes, values], names=["shape", "data"]) + expected_arr = pa.ExtensionArray.from_storage(expected_type, arr) + + result = pickle_module.loads(pickle_module.dumps(expected_arr)) + + 
assert result == expected_arr diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 9f8b347d562..6c73c7166ce 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1548,6 +1548,88 @@ cdef class ExtensionType(BaseExtensionType): return ExtensionScalar +cdef class VariableShapeTensorType(BaseExtensionType): + """ + Concrete class for variable shape tensor extension type. + + Examples + -------- + Create an instance of variable shape tensor extension type: + + >>> import pyarrow as pa + >>> pa.variable_shape_tensor(pa.int32(), 2) + VariableShapeTensorType(extension) + + Create an instance of variable shape tensor extension type with + permutation: + + >>> tensor_type = pa.variable_shape_tensor(pa.int8(), 3, + ... permutation=[0, 2, 1]) + >>> tensor_type.permutation + [0, 2, 1] + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.tensor_ext_type = type.get() + + @property + def value_type(self): + """ + Data type of an individual tensor. + """ + return pyarrow_wrap_data_type(self.tensor_ext_type.value_type()) + + @property + def ndim(self): + """ + Number of dimensions of the tensors. + """ + return self.tensor_ext_type.ndim() + + @property + def dim_names(self): + """ + Explicit names of the dimensions. + """ + list_of_bytes = self.tensor_ext_type.dim_names() + if len(list_of_bytes) != 0: + return [frombytes(x) for x in list_of_bytes] + else: + return None + + @property + def permutation(self): + """ + Indices of the dimensions ordering. + """ + indices = self.tensor_ext_type.permutation() + if len(indices) != 0: + return indices + else: + return None + + def __arrow_ext_serialize__(self): + """ + Serialized representation of metadata to reconstruct the type object. 
+ """ + return self.tensor_ext_type.Serialize() + + @classmethod + def __arrow_ext_deserialize__(self, storage_type, serialized): + """ + Return a VariableShapeTensor type instance from the storage type and serialized + metadata. + """ + return self.tensor_ext_type.Deserialize(storage_type, serialized) + + def __arrow_ext_class__(self): + return VariableShapeTensorArray + + def __reduce__(self): + return variable_shape_tensor, (self.value_type, self.ndim, + self.dim_names, self.permutation) + cdef class FixedShapeTensorType(BaseExtensionType): """ Concrete class for fixed shape tensor extension type. @@ -4817,6 +4899,109 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N return out +def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation=None): + """ + Create instance of variable shape tensor extension type with number of + dimensions and optional names of tensor dimensions and indices of the + desired logical ordering of dimensions. + + Parameters + ---------- + value_type : DataType + Data type of individual tensor elements. + ndim : integer + The number of dimensions of the contained tensors. + dim_names : tuple or list of strings, default None + Explicit names to tensor dimensions. + permutation : tuple or list of integers, default None + Indices of the desired ordering of the original dimensions. + The indices contain a permutation of the values ``[0, 1, .., N-1]`` where + N is the number of dimensions. The permutation indicates which dimension + of the logical layout corresponds to which dimension of the physical tensor. + For more information on this parameter see + :ref:`fixed_shape_tensor_extension`. 
+ + Examples + -------- + Create an instance of variable shape tensor extension type: + + >>> import pyarrow as pa + >>> tensor_type = pa.variable_shape_tensor(pa.int32(), 2) + >>> tensor_type + VariableShapeTensorType(extension) + + Inspect the data type: + + >>> tensor_type.value_type + DataType(int32) + >>> tensor_type.ndim + 2 + + Create a table with variable shape tensor extension array: + + >>> fields = [pa.field("shape", pa.list_(pa.uint32(), 2)), pa.field("data", pa.list_(pa.int32()))] + >>> storage = pa.array([([2, 3], [1, 2, 3, 4, 5, 6]), ([1, 2], [7, 8])], type=pa.struct(fields)) + >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage) + >>> pa.table([tensor], names=["tensor_array"]) + pyarrow.Table + tensor_array: extension + ---- + tensor_array: [ -- is_valid: all not null + -- child 0 type: fixed_size_list[2] + [[2,3],[1,2]] + -- child 1 type: list + [[1,2,3,4,5,6],[7,8]]] + + Create an instance of variable shape tensor extension type with names + of tensor dimensions: + + >>> tensor_type = pa.variable_shape_tensor(pa.int8(), 3, + ... dim_names=['C', 'H', 'W']) + >>> tensor_type.dim_names + ['C', 'H', 'W'] + + Create an instance of variable shape tensor extension type with + permutation: + + >>> tensor_type = pa.variable_shape_tensor(pa.int8(), 3, + ... 
permutation=[0, 2, 1]) + >>> tensor_type.permutation + [0, 2, 1] + + Returns + ------- + type : VariableShapeTensorType + """ + + cdef: + uint32_t c_ndim + vector[int64_t] c_permutation + vector[c_string] c_dim_names + shared_ptr[CDataType] c_tensor_ext_type + + assert value_type is not None + assert ndim is not None + + c_ndim = ndim + + if permutation is not None: + for i in permutation: + c_permutation.push_back(i) + + if dim_names is not None: + for x in dim_names: + c_dim_names.push_back(tobytes(x)) + + cdef VariableShapeTensorType out = VariableShapeTensorType.__new__(VariableShapeTensorType) + + c_tensor_ext_type = GetResultValue(CVariableShapeTensorType.Make( + value_type.sp_type, c_ndim, c_permutation, c_dim_names)) + + out.init(c_tensor_ext_type) + + return out + + cdef dict _type_aliases = { 'null': null, 'bool': bool_, From 1c46c2ebd0c93e114b92e81e9def22c638a859f0 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sun, 3 Sep 2023 22:27:03 +0200 Subject: [PATCH 03/18] Add VariableShapeTensorArray::ToTensor(i) --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 14 ++--- cpp/src/arrow/extension/fixed_shape_tensor.h | 11 ++++ .../extension/tensor_extension_array_test.cc | 54 +++++++++++++++++++ .../arrow/extension/variable_shape_tensor.cc | 34 +++++++++++- .../arrow/extension/variable_shape_tensor.h | 10 ++++ python/pyarrow/array.pxi | 21 ++++---- python/pyarrow/includes/libarrow.pxd | 4 ++ python/pyarrow/lib.pxd | 2 + 8 files changed, 132 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index e4195ea9e66..4d07a61541c 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -35,9 +35,7 @@ namespace rj = arrow::rapidjson; namespace arrow { -namespace extension { - -namespace { +namespace internal { Status ComputeStrides(const FixedWidthType& type, const std::vector& shape, const std::vector& permutation, @@ -78,7 
+76,9 @@ Status ComputeStrides(const FixedWidthType& type, const std::vector& sh return Status::OK(); } -} // namespace +} // namespace internal + +namespace extension { bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const { if (extension_name() != other.extension_name()) { @@ -303,7 +303,7 @@ const Result> FixedShapeTensorArray::ToTensor() const { std::vector tensor_strides; auto value_type = internal::checked_pointer_cast(ext_arr->value_type()); ARROW_RETURN_NOT_OK( - ComputeStrides(*value_type.get(), shape, permutation, &tensor_strides)); + internal::ComputeStrides(*value_type.get(), shape, permutation, &tensor_strides)); ARROW_ASSIGN_OR_RAISE(auto buffers, ext_arr->Flatten()); ARROW_ASSIGN_OR_RAISE( auto tensor, Tensor::Make(ext_arr->value_type(), buffers->data()->buffers[1], shape, @@ -332,8 +332,8 @@ const std::vector& FixedShapeTensorType::strides() { if (strides_.empty()) { auto value_type = internal::checked_pointer_cast(this->value_type_); std::vector tensor_strides; - ARROW_CHECK_OK(ComputeStrides(*value_type.get(), this->shape(), this->permutation(), - &tensor_strides)); + ARROW_CHECK_OK(internal::ComputeStrides(*value_type.get(), this->shape(), + this->permutation(), &tensor_strides)); strides_ = tensor_strides; } return strides_; diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 93837f13002..21631755016 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -15,9 +15,20 @@ // specific language governing permissions and limitations // under the License. 
+#pragma once + #include "arrow/extension_type.h" namespace arrow { +namespace internal { + +ARROW_EXPORT +Status ComputeStrides(const FixedWidthType& type, const std::vector& shape, + const std::vector& permutation, + std::vector* strides); + +} // namespace internal + namespace extension { class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 139fe8c119e..246f7a35a2f 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -29,6 +29,7 @@ #include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" namespace arrow { @@ -448,6 +449,8 @@ class TestVariableShapeTensorType : public ::testing::Test { void SetUp() override { ndim_ = 3; value_type_ = int64(); + data_type_ = list(value_type_); + shape_type_ = fixed_size_list(uint32(), ndim_); permutation_ = {0, 1, 2}; dim_names_ = {"x", "y", "z"}; ext_type_ = internal::checked_pointer_cast( @@ -467,6 +470,8 @@ class TestVariableShapeTensorType : public ::testing::Test { protected: uint32_t ndim_; std::shared_ptr value_type_; + std::shared_ptr data_type_; + std::shared_ptr shape_type_; std::vector permutation_; std::vector dim_names_; std::shared_ptr ext_type_; @@ -595,4 +600,53 @@ TEST_F(TestVariableShapeTensorType, RoudtripBatch) { CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true); } +TEST_F(TestVariableShapeTensorType, ComputeStrides) { + auto shapes = ArrayFromJSON(shape_type_, "[[2,3,1],[2,1,2],[3,1,3]]"); + auto data = + ArrayFromJSON(data_type_, "[[1,1,2,3,4,5],[2,7,8,9],[10,11,12,13,14,15,16,17,18]]"); + std::vector> fields = {field("shapes", shape_type_), + field("data", data_type_)}; + ASSERT_OK_AND_ASSIGN(auto storage_arr, StructArray::Make({shapes, data}, fields)); + auto ext_arr = 
ExtensionType::WrapArray(ext_type_, storage_arr); + auto ext_array = std::static_pointer_cast(ext_arr); + + std::shared_ptr t, tensor; + + ASSERT_OK_AND_ASSIGN(t, ext_array->GetTensor(0)); + ASSERT_EQ(t->shape(), (std::vector{2, 3, 1})); + ASSERT_EQ(t->strides(), (std::vector{24, 8, 8})); + + std::vector shape = {2, 3, 1}; + std::vector strides = {sizeof(int64_t) * 3, sizeof(int64_t) * 1, + sizeof(int64_t) * 1}; + std::vector values = {1, 1, 2, 3, 4, 5}; + auto data_buffer = Buffer::Wrap(values); + ASSERT_OK_AND_ASSIGN(tensor, + Tensor::Make(int64(), data_buffer, shape, strides, dim_names_)); + ASSERT_TRUE(tensor->Equals(*t)); + + ASSERT_OK_AND_ASSIGN(t, ext_array->GetTensor(1)); + ASSERT_EQ(t->shape(), (std::vector{2, 1, 2})); + ASSERT_EQ(t->strides(), (std::vector{16, 16, 8})); + + ASSERT_OK_AND_ASSIGN(t, ext_array->GetTensor(2)); + ASSERT_EQ(t->shape(), (std::vector{3, 1, 3})); + ASSERT_EQ(t->strides(), (std::vector{24, 24, 8})); + + shape = {3, 1, 3}; + strides = {sizeof(int64_t) * 3, sizeof(int64_t) * 3, sizeof(int64_t) * 1}; + values = {10, 11, 12, 13, 14, 15, 16, 17, 18}; + data_buffer = Buffer::Wrap(values); + ASSERT_OK_AND_ASSIGN(tensor, + Tensor::Make(int64(), data_buffer, shape, strides, dim_names_)); + + ASSERT_EQ(tensor->strides(), t->strides()); + ASSERT_EQ(tensor->shape(), t->shape()); + ASSERT_EQ(tensor->dim_names(), t->dim_names()); + ASSERT_EQ(tensor->type(), t->type()); + ASSERT_EQ(tensor->is_contiguous(), t->is_contiguous()); + ASSERT_EQ(tensor->is_column_major(), t->is_column_major()); + ASSERT_TRUE(tensor->Equals(*t)); +} + } // namespace arrow diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 65062132e5c..9803dce0427 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -17,11 +17,13 @@ #include +#include "arrow/extension/fixed_shape_tensor.h" #include "arrow/extension/variable_shape_tensor.h" #include 
"arrow/array/array_nested.h" #include "arrow/array/array_primitive.h" #include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep +#include "arrow/scalar.h" #include "arrow/tensor.h" #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" @@ -33,10 +35,38 @@ namespace rj = arrow::rapidjson; namespace arrow { - namespace extension { -namespace {} // namespace +const Result> VariableShapeTensorArray::GetTensor( + const int64_t i) const { + auto ext_arr = internal::checked_pointer_cast(this->storage()); + auto ext_type = internal::checked_pointer_cast(this->type()); + auto value_type = + internal::checked_pointer_cast(ext_type->value_type()); + auto ndim = ext_type->ndim(); + auto dim_names = ext_type->dim_names(); + auto shapes = + std::static_pointer_cast(ext_arr->field(0))->value_slice(i); + + std::vector shape; + for (int64_t j = 0; j < ndim; ++j) { + ARROW_ASSIGN_OR_RAISE(auto size, shapes->GetScalar(j)); + shape.push_back( + static_cast(std::static_pointer_cast(size)->value)); + } + + std::vector strides; + ARROW_CHECK_OK(internal::ComputeStrides(*value_type.get(), shape, + ext_type->permutation(), &strides)); + + auto list_arr = + std::static_pointer_cast(ext_arr->field(1))->value_slice(i)->data(); + auto bw = value_type->byte_width(); + auto buffer = + SliceBuffer(list_arr->buffers[1], list_arr->offset * bw, list_arr->length * bw); + + return Tensor::Make(ext_type->value_type(), buffer, shape, strides, dim_names); +} bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const { if (extension_name() != other.extension_name()) { diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index 2d981222ea8..accd6cc46a2 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
+#pragma once + #include "arrow/extension_type.h" namespace arrow { @@ -23,6 +25,14 @@ namespace extension { class ARROW_EXPORT VariableShapeTensorArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; + + /// \brief Get a Tensor of VariableShapeTensorArray at i + /// + /// This method will return a Tensor from VariableShapeTensorArray with strides + /// derived from shape and permutation of VariableShapeTensorType. Shape and + /// dim_names will be permuted according to permutation stored in the + /// VariableShapeTensorType metadata. + const Result> GetTensor(const int64_t i) const; }; /// \brief Concrete type class for variable-shape Tensor data. diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index d427ae2c9a5..40f9a44e084 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3507,7 +3507,7 @@ class FixedShapeTensorArray(ExtensionArray): ) -class VariableShapeTensorArray(ExtensionArray): +cdef class VariableShapeTensorArray(ExtensionArray): """ Concrete class for variable shape tensor extension arrays. @@ -3557,15 +3557,18 @@ class VariableShapeTensorArray(ExtensionArray): def to_numpy_ndarray(self): """ Convert variable shape tensor extension array to list of numpy arrays. - - Note: ``permutation`` should be trivial (``None`` or ``[0, 1, ..., len(shape)-1]``). 
""" - if self.type.permutation is None or self.type.permutation == list(range(len(self.type.shape))): - storage_iterator = zip(self.storage.field(0), self.storage.field(1)) - return [np.array(v.values.to_numpy()).reshape(s.values.to_numpy()) for s, v in storage_iterator] - else: - raise ValueError( - 'Only non-permuted tensors can be converted to numpy tensors.') + cdef: + CVariableShapeTensorArray * ext_array = (self.ap) + CResult[shared_ptr[CTensor]] ctensor + + tensors = [] + for i in range(len(self.storage)): + with nogil: + ctensor = ext_array.GetTensor(i) + tensors.append(pyarrow_wrap_tensor(GetResultValue(ctensor))) + + return tensors @staticmethod def from_numpy_ndarray(obj): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index bcc145766a9..284ec009029 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2654,6 +2654,10 @@ cdef extern from "arrow/extension/variable_shape_tensor.h" namespace "arrow::ext const vector[int64_t] permutation() const vector[c_string] dim_names() + cdef cppclass CVariableShapeTensorArray \ + " arrow::extension::VariableShapeTensorArray"(CExtensionArray) nogil: + CResult[shared_ptr[CTensor]] GetTensor(const int64_t i) const + cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension": cdef cppclass CFixedShapeTensorType \ diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 51d6e213986..c30ef95ebb7 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -458,6 +458,8 @@ cdef class DictionaryArray(Array): cdef class ExtensionArray(Array): pass +cdef class VariableShapeTensorArray(ExtensionArray): + pass cdef class MonthDayNanoIntervalArray(Array): pass From 18c88a208f5d92cfd0dbb2640df7128362defc9f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 13 Sep 2023 01:32:11 +0200 Subject: [PATCH 04/18] Add ragged_dimensions --- .../extension/tensor_extension_array_test.cc | 11 ++++-- 
.../arrow/extension/variable_shape_tensor.cc | 36 +++++++++++++++---- .../arrow/extension/variable_shape_tensor.h | 16 ++++++--- docs/source/format/CanonicalExtensions.rst | 12 +++++++ python/pyarrow/array.pxi | 2 +- python/pyarrow/includes/libarrow.pxd | 4 ++- python/pyarrow/tests/test_extension_type.py | 22 ++++++++---- python/pyarrow/types.pxi | 26 ++++++++++++-- 8 files changed, 105 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 246f7a35a2f..a1678cc4e28 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -453,13 +453,15 @@ class TestVariableShapeTensorType : public ::testing::Test { shape_type_ = fixed_size_list(uint32(), ndim_); permutation_ = {0, 1, 2}; dim_names_ = {"x", "y", "z"}; - ext_type_ = internal::checked_pointer_cast( - variable_shape_tensor(value_type_, ndim_, permutation_, dim_names_)); + ragged_dimensions_ = {1}; + ext_type_ = internal::checked_pointer_cast(variable_shape_tensor( + value_type_, ndim_, permutation_, dim_names_, ragged_dimensions_)); shapes_ = ArrayFromJSON(fixed_size_list(uint32(), ndim_), "[[2,3,1],[1,2,2],[3,1,3]]"); data_ = ArrayFromJSON(list(value_type_), "[[0,1,2,3,4,5],[6,7,8,9],[10,11,12,13,14,15,16,17,18]]"); - serialized_ = R"({"permutation":[0,1,2],"dim_names":["x","y","z"]})"; + serialized_ = + R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"ragged_dimensions":[1]})"; storage_arr_ = ArrayFromJSON( ext_type_->storage_type(), R"([[[2,3,1],[0,1,2,3,4,5]],[[1,2,2],[6,7,8,9]],[[3,1,3],[10,11,12,13,14,15,16,17,18]]])"); @@ -473,6 +475,7 @@ class TestVariableShapeTensorType : public ::testing::Test { std::shared_ptr data_type_; std::shared_ptr shape_type_; std::vector permutation_; + std::vector ragged_dimensions_; std::vector dim_names_; std::shared_ptr ext_type_; std::shared_ptr shapes_; @@ -565,6 +568,8 @@ 
TEST_F(TestVariableShapeTensorType, MetadataSerializationRoundtrip) { variable_shape_tensor(value_type_, 3, {0, 1, 2}, {"H", "W", "C"})); CheckSerializationRoundtrip( variable_shape_tensor(value_type_, 3, {2, 0, 1}, {"C", "H", "W"})); + CheckSerializationRoundtrip( + variable_shape_tensor(value_type_, 3, {2, 0, 1}, {"C", "H", "W"}, {0, 1, 2})); auto storage_type = ext_type_->storage_type(); CheckDeserializationRaises(ext_type_, boolean(), R"({"shape":[3,4]})", diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 9803dce0427..3547fa0338f 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -56,6 +56,7 @@ const Result> VariableShapeTensorArray::GetTensor( } std::vector strides; + // TODO: optimize ComputeStrides for ragged tensors ARROW_CHECK_OK(internal::ComputeStrides(*value_type.get(), shape, ext_type->permutation(), &strides)); @@ -115,6 +116,15 @@ std::string VariableShapeTensorType::Serialize() const { document.AddMember(rj::Value("dim_names", allocator), dim_names, allocator); } + if (!ragged_dimensions_.empty()) { + rj::Value ragged_dimensions(rj::kArrayType); + for (auto v : ragged_dimensions_) { + ragged_dimensions.PushBack(v, allocator); + } + document.AddMember(rj::Value("ragged_dimensions", allocator), ragged_dimensions, + allocator); + } + rj::StringBuffer buffer; rj::Writer writer(buffer); document.Accept(writer); @@ -156,8 +166,17 @@ Result> VariableShapeTensorType::Deserialize( } } + std::vector ragged_dimensions; + if (document.HasMember("ragged_dimensions")) { + for (auto& x : document["ragged_dimensions"].GetArray()) { + ragged_dimensions.emplace_back(x.GetInt64()); + } + if (ragged_dimensions.size() > ndim) { + return Status::Invalid("Invalid ragged_dimensions"); + } + } return variable_shape_tensor(value_type, static_cast(ndim), permutation, - dim_names); + dim_names, ragged_dimensions); } std::shared_ptr 
VariableShapeTensorType::MakeArray( @@ -170,7 +189,8 @@ std::shared_ptr VariableShapeTensorType::MakeArray( Result> VariableShapeTensorType::Make( const std::shared_ptr& value_type, const uint32_t& ndim, - const std::vector& permutation, const std::vector& dim_names) { + const std::vector& permutation, const std::vector& dim_names, + const std::vector& ragged_dimensions) { if (!permutation.empty() && permutation.size() != ndim) { return Status::Invalid("permutation size must match ndim. Expected: ", ndim, " Got: ", permutation.size()); @@ -179,15 +199,19 @@ Result> VariableShapeTensorType::Make( return Status::Invalid("dim_names size must match ndim. Expected: ", ndim, " Got: ", dim_names.size()); } + if (ragged_dimensions.size() > ndim) { + return Status::Invalid("ragged_dimensions size must be less or equal ndim."); + } return std::make_shared(value_type, ndim, permutation, - dim_names); + dim_names, ragged_dimensions); } std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, - const std::vector& permutation, const std::vector& dim_names) { - auto maybe_type = - VariableShapeTensorType::Make(value_type, ndim, permutation, dim_names); + const std::vector& permutation, const std::vector& dim_names, + const std::vector& ragged_dimensions) { + auto maybe_type = VariableShapeTensorType::Make(value_type, ndim, permutation, + dim_names, ragged_dimensions); ARROW_DCHECK_OK(maybe_type.status()); return maybe_type.MoveValueUnsafe(); } diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index accd6cc46a2..41baf902586 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -43,12 +43,14 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { VariableShapeTensorType(const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, - const std::vector& dim_names = {}) + const 
std::vector& dim_names = {}, + const std::vector& ragged_dimensions = {}) : ExtensionType(struct_({::arrow::field("shape", fixed_size_list(uint32(), ndim)), ::arrow::field("data", list(value_type))})), value_type_(value_type), permutation_(permutation), - dim_names_(dim_names) {} + dim_names_(dim_names), + ragged_dimensions_(ragged_dimensions) {} std::string extension_name() const override { return "arrow.variable_shape_tensor"; } @@ -67,6 +69,9 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { /// Dimension names of tensor elements. Dimensions are ordered physically. const std::vector& dim_names() const { return dim_names_; } + /// Indexes of ragged dimensions. + const std::vector& ragged_dimensions() const { return ragged_dimensions_; } + bool ExtensionEquals(const ExtensionType& other) const override; std::string Serialize() const override; @@ -82,20 +87,23 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { static Result> Make( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, - const std::vector& dim_names = {}); + const std::vector& dim_names = {}, + const std::vector& ragged_dimensions = {}); private: std::shared_ptr storage_type_; std::shared_ptr value_type_; std::vector permutation_; std::vector dim_names_; + std::vector ragged_dimensions_; }; /// \brief Return a VariableShapeTensorType instance. 
ARROW_EXPORT std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, - const std::vector& dim_names = {}); + const std::vector& dim_names = {}, + const std::vector& ragged_dimensions = {}); } // namespace extension } // namespace arrow diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index d6d91929edb..8bf63eb8531 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -194,6 +194,13 @@ Variable shape tensor When logical and physical layout are equal, the permutation will always be ([0, 1, .., N-1]) and can therefore be left out. + * **ragged_dimensions** = indices of ragged dimensions whose sizes may + differ. Dimensions where all elements have the same size are called + uniform dimensions. Indices are a subset of all possible dimension + indices ([0, 1, .., N-1]). + Ragged dimensions list can be left out. In that case all dimensions + are assumed ragged. 
+ * Description of the serialization: The metadata must be a valid JSON object including number of @@ -205,6 +212,11 @@ Variable shape tensor ``{ "dim_names": ["C", "H", "W"] }`` + - Example with ``ragged_dimensions`` metadata for a set of color images + with variable width: + + ``{ "dim_names": ["H", "W", "C"], "ragged_dimensions": [1] }`` + - Example of permuted 3-dimensional tensor: ``{ "permutation": [2, 0, 1] }`` diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 40f9a44e084..6f5d0eb4e67 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3566,7 +3566,7 @@ cdef class VariableShapeTensorArray(ExtensionArray): for i in range(len(self.storage)): with nogil: ctensor = ext_array.GetTensor(i) - tensors.append(pyarrow_wrap_tensor(GetResultValue(ctensor))) + tensors.append(pyarrow_wrap_tensor(GetResultValue(ctensor)).to_numpy()) return tensors diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 284ec009029..e8450c06775 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2642,7 +2642,8 @@ cdef extern from "arrow/extension/variable_shape_tensor.h" namespace "arrow::ext CResult[shared_ptr[CDataType]] Make(const shared_ptr[CDataType]& value_type, const uint32_t ndim, const vector[int64_t]& permutation, - const vector[c_string]& dim_names) + const vector[c_string]& dim_names, + const vector[int64_t]& ragged_dimensions) CResult[shared_ptr[CDataType]] Deserialize(const shared_ptr[CDataType] storage_type, const c_string& serialized_data) const @@ -2653,6 +2654,7 @@ cdef extern from "arrow/extension/variable_shape_tensor.h" namespace "arrow::ext const uint32_t ndim() const vector[int64_t] permutation() const vector[c_string] dim_names() + const vector[int64_t] ragged_dimensions() cdef cppclass CVariableShapeTensorArray \ " arrow::extension::VariableShapeTensorArray"(CExtensionArray) nogil: diff --git a/python/pyarrow/tests/test_extension_type.py 
b/python/pyarrow/tests/test_extension_type.py index ac7248eed73..2bb553e75e4 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1286,13 +1286,22 @@ def test_variable_shape_tensor_class_method(value_type): ndim = 2 shape_type = pa.list_(pa.uint32(), ndim) arrow_type = pa.from_numpy_dtype(value_type) - tensor_type = pa.variable_shape_tensor(arrow_type, ndim) + tensor_type = pa.variable_shape_tensor( + arrow_type, + ndim, + dim_names=["H", "W"], + permutation=[0, 1], + ragged_dimensions=[0], + ) fields = [pa.field("shape", shape_type), pa.field("data", pa.list_(arrow_type))] shapes = pa.array([[2, 3], [1, 2]], shape_type) values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(arrow_type)) struct_arr = pa.StructArray.from_arrays([shapes, values], fields=fields) arr = pa.ExtensionArray.from_storage(tensor_type, struct_arr) + basic_arr = pa.ExtensionArray.from_storage( + pa.variable_shape_tensor(arrow_type, ndim), struct_arr + ) storage = pa.array( [([2, 3], [1, 2, 3, 4, 5, 6]), ([1, 2], [7, 8])], type=pa.struct(fields) @@ -1307,12 +1316,12 @@ def test_variable_shape_tensor_class_method(value_type): ] assert all(zip(x == y for x, y in zip(arr.to_numpy_ndarray(), ndarray_list))) - from_ndarray_list = pa.VariableShapeTensorArray.from_numpy_ndarray(ndarray_list) - assert from_ndarray_list.equals(arr) - + assert pa.VariableShapeTensorArray.from_numpy_ndarray(ndarray_list).equals( + basic_arr + ) assert pa.VariableShapeTensorArray.from_numpy_ndarray( arr.to_numpy_ndarray() - ).equals(arr) + ).equals(basic_arr) assert arr.to_pylist() == [ {"data": [1, 2, 3, 4, 5, 6], "shape": [2, 3]}, @@ -1353,7 +1362,8 @@ def test_tensor_type_ipc(tensor_type): @pytest.mark.parametrize("tensor_type", ( pa.variable_shape_tensor(pa.int8(), 2), pa.variable_shape_tensor(pa.int8(), 2, permutation=[1, 0]), - pa.variable_shape_tensor(pa.int8(), 2, dim_names=['H', 'W']) + pa.variable_shape_tensor(pa.int8(), 2, dim_names=['H', 'W']), + 
pa.variable_shape_tensor(pa.int8(), 2, ragged_dimensions=[0, 1]), )) def test_variable_shape_tensor_type_ipc(tensor_type): shape_type = tensor_type.storage_type.field(0).type diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 6c73c7166ce..6e82700222e 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1609,6 +1609,17 @@ cdef class VariableShapeTensorType(BaseExtensionType): else: return None + @property + def ragged_dimensions(self): + """ + Indices of ragged dimensions. + """ + ragged_dimensions = self.tensor_ext_type.ragged_dimensions() + if len(ragged_dimensions) != 0: + return ragged_dimensions + else: + return None + def __arrow_ext_serialize__(self): """ Serialized representation of metadata to reconstruct the type object. @@ -1628,7 +1639,8 @@ cdef class VariableShapeTensorType(BaseExtensionType): def __reduce__(self): return variable_shape_tensor, (self.value_type, self.ndim, - self.dim_names, self.permutation) + self.dim_names, self.permutation, + self.ragged_dimensions) cdef class FixedShapeTensorType(BaseExtensionType): """ @@ -4899,7 +4911,7 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N return out -def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation=None): +def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation=None, ragged_dimensions=None): """ Create instance of variable shape tensor extension type with number of dimensions and optional names of tensor dimensions and indices of the @@ -4920,6 +4932,9 @@ def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation of the logical layout corresponds to which dimension of the physical tensor. For more information on this parameter see :ref:`fixed_shape_tensor_extension`. + ragged_dimensions : tuple or list of integers, default None + Indices of the dimensions that are ragged. 
The indices contain a subset + of the values ``[0, 1, .., N-1]`` where N is the number of dimensions. Examples -------- @@ -4977,6 +4992,7 @@ def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation uint32_t c_ndim vector[int64_t] c_permutation vector[c_string] c_dim_names + vector[int64_t] c_ragged_dimensions shared_ptr[CDataType] c_tensor_ext_type assert value_type is not None @@ -4992,10 +5008,14 @@ def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation for x in dim_names: c_dim_names.push_back(tobytes(x)) + if ragged_dimensions is not None: + for i in ragged_dimensions: + c_ragged_dimensions.push_back(i) + cdef VariableShapeTensorType out = VariableShapeTensorType.__new__(VariableShapeTensorType) c_tensor_ext_type = GetResultValue(CVariableShapeTensorType.Make( - value_type.sp_type, c_ndim, c_permutation, c_dim_names)) + value_type.sp_type, c_ndim, c_permutation, c_dim_names, c_ragged_dimensions)) out.init(c_tensor_ext_type) From 4d3eb4459d7efd46904634b0535e14e65402612f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 15 Sep 2023 20:54:27 +0200 Subject: [PATCH 05/18] Replace ragged_dimensions with uniform_dimensions --- .../extension/tensor_extension_array_test.cc | 8 ++--- .../arrow/extension/variable_shape_tensor.cc | 36 +++++++++---------- .../arrow/extension/variable_shape_tensor.h | 12 +++---- docs/source/format/CanonicalExtensions.rst | 19 +++++----- python/pyarrow/includes/libarrow.pxd | 4 +-- python/pyarrow/tests/test_extension_type.py | 4 +-- python/pyarrow/types.pxi | 31 ++++++++-------- 7 files changed, 59 insertions(+), 55 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index a1678cc4e28..69b75b4e3ad 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -453,15 +453,15 @@ class TestVariableShapeTensorType : public ::testing::Test { 
shape_type_ = fixed_size_list(uint32(), ndim_); permutation_ = {0, 1, 2}; dim_names_ = {"x", "y", "z"}; - ragged_dimensions_ = {1}; + uniform_dimensions_ = {1}; ext_type_ = internal::checked_pointer_cast(variable_shape_tensor( - value_type_, ndim_, permutation_, dim_names_, ragged_dimensions_)); + value_type_, ndim_, permutation_, dim_names_, uniform_dimensions_)); shapes_ = ArrayFromJSON(fixed_size_list(uint32(), ndim_), "[[2,3,1],[1,2,2],[3,1,3]]"); data_ = ArrayFromJSON(list(value_type_), "[[0,1,2,3,4,5],[6,7,8,9],[10,11,12,13,14,15,16,17,18]]"); serialized_ = - R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"ragged_dimensions":[1]})"; + R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"uniform_dimensions":[1]})"; storage_arr_ = ArrayFromJSON( ext_type_->storage_type(), R"([[[2,3,1],[0,1,2,3,4,5]],[[1,2,2],[6,7,8,9]],[[3,1,3],[10,11,12,13,14,15,16,17,18]]])"); @@ -475,7 +475,7 @@ class TestVariableShapeTensorType : public ::testing::Test { std::shared_ptr data_type_; std::shared_ptr shape_type_; std::vector permutation_; - std::vector ragged_dimensions_; + std::vector uniform_dimensions_; std::vector dim_names_; std::shared_ptr ext_type_; std::shared_ptr shapes_; diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 3547fa0338f..ec8d292c012 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -116,12 +116,12 @@ std::string VariableShapeTensorType::Serialize() const { document.AddMember(rj::Value("dim_names", allocator), dim_names, allocator); } - if (!ragged_dimensions_.empty()) { - rj::Value ragged_dimensions(rj::kArrayType); - for (auto v : ragged_dimensions_) { - ragged_dimensions.PushBack(v, allocator); + if (!uniform_dimensions_.empty()) { + rj::Value uniform_dimensions(rj::kArrayType); + for (auto v : uniform_dimensions_) { + uniform_dimensions.PushBack(v, allocator); } - 
document.AddMember(rj::Value("ragged_dimensions", allocator), ragged_dimensions, + document.AddMember(rj::Value("uniform_dimensions", allocator), uniform_dimensions, allocator); } @@ -166,17 +166,17 @@ Result> VariableShapeTensorType::Deserialize( } } - std::vector ragged_dimensions; - if (document.HasMember("ragged_dimensions")) { - for (auto& x : document["ragged_dimensions"].GetArray()) { - ragged_dimensions.emplace_back(x.GetInt64()); + std::vector uniform_dimensions; + if (document.HasMember("uniform_dimensions")) { + for (auto& x : document["uniform_dimensions"].GetArray()) { + uniform_dimensions.emplace_back(x.GetInt64()); } - if (ragged_dimensions.size() > ndim) { - return Status::Invalid("Invalid ragged_dimensions"); + if (uniform_dimensions.size() > ndim) { + return Status::Invalid("Invalid uniform_dimensions"); } } return variable_shape_tensor(value_type, static_cast(ndim), permutation, - dim_names, ragged_dimensions); + dim_names, uniform_dimensions); } std::shared_ptr VariableShapeTensorType::MakeArray( @@ -190,7 +190,7 @@ std::shared_ptr VariableShapeTensorType::MakeArray( Result> VariableShapeTensorType::Make( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation, const std::vector& dim_names, - const std::vector& ragged_dimensions) { + const std::vector& uniform_dimensions) { if (!permutation.empty() && permutation.size() != ndim) { return Status::Invalid("permutation size must match ndim. Expected: ", ndim, " Got: ", permutation.size()); @@ -199,19 +199,19 @@ Result> VariableShapeTensorType::Make( return Status::Invalid("dim_names size must match ndim. 
Expected: ", ndim, " Got: ", dim_names.size()); } - if (ragged_dimensions.size() > ndim) { - return Status::Invalid("ragged_dimensions size must be less or equal ndim."); + if (uniform_dimensions.size() > ndim) { + return Status::Invalid("uniform_dimensions size must be less or equal ndim."); } return std::make_shared(value_type, ndim, permutation, - dim_names, ragged_dimensions); + dim_names, uniform_dimensions); } std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation, const std::vector& dim_names, - const std::vector& ragged_dimensions) { + const std::vector& uniform_dimensions) { auto maybe_type = VariableShapeTensorType::Make(value_type, ndim, permutation, - dim_names, ragged_dimensions); + dim_names, uniform_dimensions); ARROW_DCHECK_OK(maybe_type.status()); return maybe_type.MoveValueUnsafe(); } diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index 41baf902586..2a7e41f9b3a 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -44,13 +44,13 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& ragged_dimensions = {}) + const std::vector& uniform_dimensions = {}) : ExtensionType(struct_({::arrow::field("shape", fixed_size_list(uint32(), ndim)), ::arrow::field("data", list(value_type))})), value_type_(value_type), permutation_(permutation), dim_names_(dim_names), - ragged_dimensions_(ragged_dimensions) {} + uniform_dimensions_(uniform_dimensions) {} std::string extension_name() const override { return "arrow.variable_shape_tensor"; } @@ -70,7 +70,7 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const std::vector& dim_names() const { return dim_names_; } /// Indexes of ragged dimensions. 
- const std::vector& ragged_dimensions() const { return ragged_dimensions_; } + const std::vector& uniform_dimensions() const { return uniform_dimensions_; } bool ExtensionEquals(const ExtensionType& other) const override; @@ -88,14 +88,14 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& ragged_dimensions = {}); + const std::vector& uniform_dimensions = {}); private: std::shared_ptr storage_type_; std::shared_ptr value_type_; std::vector permutation_; std::vector dim_names_; - std::vector ragged_dimensions_; + std::vector uniform_dimensions_; }; /// \brief Return a VariableShapeTensorType instance. @@ -103,7 +103,7 @@ ARROW_EXPORT std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& ragged_dimensions = {}); + const std::vector& uniform_dimensions = {}); } // namespace extension } // namespace arrow diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 8bf63eb8531..d4758c3c869 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -194,12 +194,15 @@ Variable shape tensor When logical and physical layout are equal, the permutation will always be ([0, 1, .., N-1]) and can therefore be left out. - * **ragged_dimensions** = indices of ragged dimensions whose sizes may - differ. Dimensions where all elements have the same size are called - uniform dimensions. Indices are a subset of all possible dimension - indices ([0, 1, .., N-1]). - Ragged dimensions list can be left out. In that case all dimensions - are assumed ragged. + * **uniform_dimensions** = indices of dimensions whose sizes are + guaranteed to remain constant. 
Indices are a subset of all possible + dimension indices ([0, 1, .., N-1]). + The uniform dimensions must still be represented in the `shape` field, + and must always be the same value for all tensors in the array -- this + allows code to interpret the tensor correctly without accounting for + uniform dimensions while still permitting optional optimizations that + take advantage of the uniformity. uniform_dimensions can be left out, + in which case it is assumed that all dimensions might be variable. * Description of the serialization: @@ -212,10 +215,10 @@ Variable shape tensor ``{ "dim_names": ["C", "H", "W"] }`` - - Example with ``ragged_dimensions`` metadata for a set of color images + - Example with ``uniform_dimensions`` metadata for a set of color images with variable width: - ``{ "dim_names": ["H", "W", "C"], "ragged_dimensions": [1] }`` + ``{ "dim_names": ["H", "W", "C"], "uniform_dimensions": [1] }`` - Example of permuted 3-dimensional tensor: diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index e8450c06775..7d87cdbd1fc 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2643,7 +2643,7 @@ cdef extern from "arrow/extension/variable_shape_tensor.h" namespace "arrow::ext const uint32_t ndim, const vector[int64_t]& permutation, const vector[c_string]& dim_names, - const vector[int64_t]& ragged_dimensions) + const vector[int64_t]& uniform_dimensions) CResult[shared_ptr[CDataType]] Deserialize(const shared_ptr[CDataType] storage_type, const c_string& serialized_data) const @@ -2654,7 +2654,7 @@ cdef extern from "arrow/extension/variable_shape_tensor.h" namespace "arrow::ext const uint32_t ndim() const vector[int64_t] permutation() const vector[c_string] dim_names() - const vector[int64_t] ragged_dimensions() + const vector[int64_t] uniform_dimensions() cdef cppclass CVariableShapeTensorArray \ " arrow::extension::VariableShapeTensorArray"(CExtensionArray) nogil: diff --git 
a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 2bb553e75e4..82fc340bd0c 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1291,7 +1291,7 @@ def test_variable_shape_tensor_class_method(value_type): ndim, dim_names=["H", "W"], permutation=[0, 1], - ragged_dimensions=[0], + uniform_dimensions=[0], ) fields = [pa.field("shape", shape_type), pa.field("data", pa.list_(arrow_type))] @@ -1363,7 +1363,7 @@ def test_tensor_type_ipc(tensor_type): pa.variable_shape_tensor(pa.int8(), 2), pa.variable_shape_tensor(pa.int8(), 2, permutation=[1, 0]), pa.variable_shape_tensor(pa.int8(), 2, dim_names=['H', 'W']), - pa.variable_shape_tensor(pa.int8(), 2, ragged_dimensions=[0, 1]), + pa.variable_shape_tensor(pa.int8(), 2, uniform_dimensions=[0, 1]), )) def test_variable_shape_tensor_type_ipc(tensor_type): shape_type = tensor_type.storage_type.field(0).type diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 6e82700222e..59c0e57f1c1 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1610,13 +1610,13 @@ cdef class VariableShapeTensorType(BaseExtensionType): return None @property - def ragged_dimensions(self): + def uniform_dimensions(self): """ - Indices of ragged dimensions. + Indices of uniform dimensions. 
""" - ragged_dimensions = self.tensor_ext_type.ragged_dimensions() - if len(ragged_dimensions) != 0: - return ragged_dimensions + uniform_dimensions = self.tensor_ext_type.uniform_dimensions() + if len(uniform_dimensions) != 0: + return uniform_dimensions else: return None @@ -1640,7 +1640,7 @@ cdef class VariableShapeTensorType(BaseExtensionType): def __reduce__(self): return variable_shape_tensor, (self.value_type, self.ndim, self.dim_names, self.permutation, - self.ragged_dimensions) + self.uniform_dimensions) cdef class FixedShapeTensorType(BaseExtensionType): """ @@ -4911,7 +4911,7 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N return out -def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation=None, ragged_dimensions=None): +def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation=None, uniform_dimensions=None): """ Create instance of variable shape tensor extension type with number of dimensions and optional names of tensor dimensions and indices of the @@ -4932,9 +4932,10 @@ def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation of the logical layout corresponds to which dimension of the physical tensor. For more information on this parameter see :ref:`fixed_shape_tensor_extension`. - ragged_dimensions : tuple or list of integers, default None - Indices of the dimensions that are ragged. The indices contain a subset - of the values ``[0, 1, .., N-1]`` where N is the number of dimensions. + uniform_dimensions : tuple or list of integers, default None + Indices of the dimensions that are guaranteed to remain constant over the + whole array. The indices contain a subset of the values ``[0, 1, .., N-1]`` + where N is the number of dimensions. 
Examples -------- @@ -4992,7 +4993,7 @@ def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation uint32_t c_ndim vector[int64_t] c_permutation vector[c_string] c_dim_names - vector[int64_t] c_ragged_dimensions + vector[int64_t] c_uniform_dimensions shared_ptr[CDataType] c_tensor_ext_type assert value_type is not None @@ -5008,14 +5009,14 @@ def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation for x in dim_names: c_dim_names.push_back(tobytes(x)) - if ragged_dimensions is not None: - for i in ragged_dimensions: - c_ragged_dimensions.push_back(i) + if uniform_dimensions is not None: + for i in uniform_dimensions: + c_uniform_dimensions.push_back(i) cdef VariableShapeTensorType out = VariableShapeTensorType.__new__(VariableShapeTensorType) c_tensor_ext_type = GetResultValue(CVariableShapeTensorType.Make( - value_type.sp_type, c_ndim, c_permutation, c_dim_names, c_ragged_dimensions)) + value_type.sp_type, c_ndim, c_permutation, c_dim_names, c_uniform_dimensions)) out.init(c_tensor_ext_type) From 5bc32664228ace0dcf090c57ce10aa43b1604dbe Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 15 Sep 2023 21:13:25 +0200 Subject: [PATCH 06/18] Add example for explanation --- docs/source/format/CanonicalExtensions.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index d4758c3c869..6c6a6014c0d 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -230,6 +230,16 @@ Variable shape tensor .. note:: + With the exception of permutation, all other parameters and storage + of VariableShapeTensor define the *physical* storage of the tensor. + + For example, consider a tensor with: + shape = [10, 20, 30] + dim_names = [x, y, z] + permutation = [2, 0, 1] + + This means the logical tensor has names [z, x, y] and shape [30, 10, 20].
+ Elements in a variable shape tensor extension array are stored in row-major/C-contiguous order. From 02c3108d02c7ea7776d4a7166ea7823c123fc472 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 25 Sep 2023 00:00:53 +0200 Subject: [PATCH 07/18] Add uniform_shape parameter --- .../extension/tensor_extension_array_test.cc | 19 ++++----- .../arrow/extension/variable_shape_tensor.cc | 42 +++++++++++++++---- .../arrow/extension/variable_shape_tensor.h | 16 +++++-- docs/source/format/CanonicalExtensions.rst | 6 +++ python/pyarrow/includes/libarrow.pxd | 4 +- python/pyarrow/tests/test_extension_type.py | 5 ++- python/pyarrow/types.pxi | 26 +++++++++++- 7 files changed, 89 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc index 69b75b4e3ad..af619826f00 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc @@ -454,14 +454,16 @@ class TestVariableShapeTensorType : public ::testing::Test { permutation_ = {0, 1, 2}; dim_names_ = {"x", "y", "z"}; uniform_dimensions_ = {1}; - ext_type_ = internal::checked_pointer_cast(variable_shape_tensor( - value_type_, ndim_, permutation_, dim_names_, uniform_dimensions_)); + uniform_shape_ = {0, 1, 0}; + ext_type_ = internal::checked_pointer_cast( + variable_shape_tensor(value_type_, ndim_, permutation_, dim_names_, + uniform_dimensions_, uniform_shape_)); shapes_ = - ArrayFromJSON(fixed_size_list(uint32(), ndim_), "[[2,3,1],[1,2,2],[3,1,3]]"); + ArrayFromJSON(fixed_size_list(uint32(), ndim_), "[[2,1,3],[2,1,2],[3,1,3]]"); data_ = ArrayFromJSON(list(value_type_), "[[0,1,2,3,4,5],[6,7,8,9],[10,11,12,13,14,15,16,17,18]]"); serialized_ = - R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"uniform_dimensions":[1]})"; + R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"uniform_dimensions":[1],"uniform_shape":[0,1,0]})"; storage_arr_ = ArrayFromJSON( 
ext_type_->storage_type(), R"([[[2,3,1],[0,1,2,3,4,5]],[[1,2,2],[6,7,8,9]],[[3,1,3],[10,11,12,13,14,15,16,17,18]]])"); @@ -476,6 +478,7 @@ class TestVariableShapeTensorType : public ::testing::Test { std::shared_ptr shape_type_; std::vector permutation_; std::vector uniform_dimensions_; + std::vector uniform_shape_; std::vector dim_names_; std::shared_ptr ext_type_; std::shared_ptr shapes_; @@ -549,14 +552,6 @@ TEST_F(TestVariableShapeTensorType, EqualsCases) { ASSERT_FALSE(ext_type_permutation_2->Equals(ext_type_permutation_1)); } -TEST_F(TestVariableShapeTensorType, CreateFromArray) { - std::vector field_names = {"shapes", "data"}; - ASSERT_OK_AND_ASSIGN(auto storage_arr, - StructArray::Make({shapes_, data_}, field_names)); - auto arr = ExtensionType::WrapArray(ext_type_, storage_arr); - ASSERT_TRUE(ext_arr_->Equals(*arr)); -} - TEST_F(TestVariableShapeTensorType, MetadataSerializationRoundtrip) { using T = VariableShapeTensorType; diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index ec8d292c012..89bbf4011f9 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -92,7 +92,9 @@ bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const (is_permutation_trivial(permutation_) && other_ext.permutation().empty())); return (storage_type()->Equals(other_ext.storage_type())) && - (dim_names_ == other_ext.dim_names()) && permutation_equivalent; + (dim_names_ == other_ext.dim_names()) && + (uniform_dimensions_ == other_ext.uniform_dimensions_) && + (uniform_shape_ == other_ext.uniform_shape()) && permutation_equivalent; } std::string VariableShapeTensorType::Serialize() const { @@ -125,6 +127,14 @@ std::string VariableShapeTensorType::Serialize() const { allocator); } + if (!uniform_shape_.empty()) { + rj::Value uniform_shape(rj::kArrayType); + for (auto v : uniform_shape_) { + uniform_shape.PushBack(v, allocator); + } + 
document.AddMember(rj::Value("uniform_shape", allocator), uniform_shape, allocator); + } + rj::StringBuffer buffer; rj::Writer writer(buffer); document.Accept(writer); @@ -175,8 +185,19 @@ Result> VariableShapeTensorType::Deserialize( return Status::Invalid("Invalid uniform_dimensions"); } } + + std::vector uniform_shape; + if (document.HasMember("uniform_shape")) { + for (auto& x : document["uniform_shape"].GetArray()) { + uniform_shape.emplace_back(x.GetInt64()); + } + if (uniform_shape.size() > ndim) { + return Status::Invalid("Invalid uniform_shape"); + } + } + return variable_shape_tensor(value_type, static_cast(ndim), permutation, - dim_names, uniform_dimensions); + dim_names, uniform_dimensions, uniform_shape); } std::shared_ptr VariableShapeTensorType::MakeArray( @@ -190,7 +211,8 @@ std::shared_ptr VariableShapeTensorType::MakeArray( Result> VariableShapeTensorType::Make( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation, const std::vector& dim_names, - const std::vector& uniform_dimensions) { + const std::vector& uniform_dimensions, + const std::vector& uniform_shape) { if (!permutation.empty() && permutation.size() != ndim) { return Status::Invalid("permutation size must match ndim. 
Expected: ", ndim, " Got: ", permutation.size()); @@ -202,16 +224,20 @@ Result> VariableShapeTensorType::Make( if (uniform_dimensions.size() > ndim) { return Status::Invalid("uniform_dimensions size must be less or equal ndim."); } - return std::make_shared(value_type, ndim, permutation, - dim_names, uniform_dimensions); + if (uniform_shape.size() > ndim) { + return Status::Invalid("uniform_shape size must be less or equal ndim."); + } + return std::make_shared( + value_type, ndim, permutation, dim_names, uniform_dimensions, uniform_shape); } std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation, const std::vector& dim_names, - const std::vector& uniform_dimensions) { - auto maybe_type = VariableShapeTensorType::Make(value_type, ndim, permutation, - dim_names, uniform_dimensions); + const std::vector& uniform_dimensions, + const std::vector& uniform_shape) { + auto maybe_type = VariableShapeTensorType::Make( + value_type, ndim, permutation, dim_names, uniform_dimensions, uniform_shape); ARROW_DCHECK_OK(maybe_type.status()); return maybe_type.MoveValueUnsafe(); } diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h index 2a7e41f9b3a..c2c40b364f8 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ b/cpp/src/arrow/extension/variable_shape_tensor.h @@ -44,13 +44,15 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& uniform_dimensions = {}) + const std::vector& uniform_dimensions = {}, + const std::vector& uniform_shape = {}) : ExtensionType(struct_({::arrow::field("shape", fixed_size_list(uint32(), ndim)), ::arrow::field("data", list(value_type))})), value_type_(value_type), permutation_(permutation), dim_names_(dim_names), - uniform_dimensions_(uniform_dimensions) {} + 
uniform_dimensions_(uniform_dimensions), + uniform_shape_(uniform_shape) {} std::string extension_name() const override { return "arrow.variable_shape_tensor"; } @@ -72,6 +74,9 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { /// Indexes of ragged dimensions. const std::vector& uniform_dimensions() const { return uniform_dimensions_; } + /// Shape of uniform dimensions. + const std::vector& uniform_shape() const { return uniform_shape_; } + bool ExtensionEquals(const ExtensionType& other) const override; std::string Serialize() const override; @@ -88,7 +93,8 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& uniform_dimensions = {}); + const std::vector& uniform_dimensions = {}, + const std::vector& uniform_shape = {}); private: std::shared_ptr storage_type_; @@ -96,6 +102,7 @@ class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { std::vector permutation_; std::vector dim_names_; std::vector uniform_dimensions_; + std::vector uniform_shape_; }; /// \brief Return a VariableShapeTensorType instance. @@ -103,7 +110,8 @@ ARROW_EXPORT std::shared_ptr variable_shape_tensor( const std::shared_ptr& value_type, const uint32_t& ndim, const std::vector& permutation = {}, const std::vector& dim_names = {}, - const std::vector& uniform_dimensions = {}); + const std::vector& uniform_dimensions = {}, + const std::vector& uniform_shape = {}); } // namespace extension } // namespace arrow diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 6c6a6014c0d..3af4429e81a 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -204,6 +204,12 @@ Variable shape tensor take advantage of the uniformity. 
uniform_dimensions can be left out, in which case it is assumed that all dimensions might be variable. + * **uniform_shape** = shape over dimensions that are guaranteed to stay + constant over all tensors in the array if all their ragged dimension + sizes were replaced by 0. + An array containing a tensor with shape (2, 3, 4) and uniform dimensions + (0, 2) would have uniform shape (2, 0, 4). + * Description of the serialization: The metadata must be a valid JSON object including number of diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 7d87cdbd1fc..9487372eb8c 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2643,7 +2643,8 @@ cdef extern from "arrow/extension/variable_shape_tensor.h" namespace "arrow::ext const uint32_t ndim, const vector[int64_t]& permutation, const vector[c_string]& dim_names, - const vector[int64_t]& uniform_dimensions) + const vector[int64_t]& uniform_dimensions, + const vector[int64_t]& uniform_shape) CResult[shared_ptr[CDataType]] Deserialize(const shared_ptr[CDataType] storage_type, const c_string& serialized_data) const @@ -2655,6 +2656,7 @@ cdef extern from "arrow/extension/variable_shape_tensor.h" namespace "arrow::ext const vector[int64_t] permutation() const vector[c_string] dim_names() const vector[int64_t] uniform_dimensions() + const vector[int64_t] uniform_shape() cdef cppclass CVariableShapeTensorArray \ " arrow::extension::VariableShapeTensorArray"(CExtensionArray) nogil: diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 82fc340bd0c..77b6d469c99 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1292,10 +1292,11 @@ def test_variable_shape_tensor_class_method(value_type): dim_names=["H", "W"], permutation=[0, 1], uniform_dimensions=[0], + uniform_shape=[2, 0], ) fields = [pa.field("shape", shape_type), pa.field("data",
pa.list_(arrow_type))] - shapes = pa.array([[2, 3], [1, 2]], shape_type) + shapes = pa.array([[2, 3], [2, 1]], shape_type) values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(arrow_type)) struct_arr = pa.StructArray.from_arrays([shapes, values], fields=fields) arr = pa.ExtensionArray.from_storage(tensor_type, struct_arr) @@ -1304,7 +1305,7 @@ def test_variable_shape_tensor_class_method(value_type): ) storage = pa.array( - [([2, 3], [1, 2, 3, 4, 5, 6]), ([1, 2], [7, 8])], type=pa.struct(fields) + [([2, 3], [1, 2, 3, 4, 5, 6]), ([2, 1], [7, 8])], type=pa.struct(fields) ) assert pa.ExtensionArray.from_storage(tensor_type, storage).equals(arr) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 59c0e57f1c1..c98a5536f8d 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1620,6 +1620,17 @@ cdef class VariableShapeTensorType(BaseExtensionType): else: return None + @property + def uniform_shape(self): + """ + Shape over dimensions that are guaranteed to be constant. + """ + uniform_shape = self.tensor_ext_type.uniform_shape() + if len(uniform_shape) != 0: + return uniform_shape + else: + return None + def __arrow_ext_serialize__(self): """ Serialized representation of metadata to reconstruct the type object. @@ -4911,7 +4922,8 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N return out -def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation=None, uniform_dimensions=None): +def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation=None, + uniform_dimensions=None, uniform_shape=None): """ Create instance of variable shape tensor extension type with number of dimensions and optional names of tensor dimensions and indices of the @@ -4936,6 +4948,11 @@ def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation Indices of the dimensions that are guaranteed to remain constant over the whole array. 
The indices contain a subset of the values ``[0, 1, .., N-1]`` where N is the number of dimensions. + uniform_shape : tuple or list of integers, default None + Shape over dimensions that are guaranteed to stay constant over all tensors + in the array if all their ragged dimensions sizes were replaced by 0. + An array containing tensor with shape (2, 3, 4) and uniform dimensions + (0, 2) would have uniform shape (2, 0, 4). Examples -------- @@ -4994,6 +5011,7 @@ def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation vector[int64_t] c_permutation vector[c_string] c_dim_names vector[int64_t] c_uniform_dimensions + vector[int64_t] c_uniform_shape shared_ptr[CDataType] c_tensor_ext_type assert value_type is not None @@ -5013,10 +5031,14 @@ def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation for i in uniform_dimensions: c_uniform_dimensions.push_back(i) + if uniform_shape is not None: + for i in uniform_shape: + c_uniform_shape.push_back(i) + cdef VariableShapeTensorType out = VariableShapeTensorType.__new__(VariableShapeTensorType) c_tensor_ext_type = GetResultValue(CVariableShapeTensorType.Make( - value_type.sp_type, c_ndim, c_permutation, c_dim_names, c_uniform_dimensions)) + value_type.sp_type, c_ndim, c_permutation, c_dim_names, c_uniform_dimensions, c_uniform_shape)) out.init(c_tensor_ext_type) From 10d20a3c1928c498c4f4896508511e4ee238e60b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 25 Sep 2023 15:25:35 +0200 Subject: [PATCH 08/18] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- docs/source/format/CanonicalExtensions.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 3af4429e81a..9dc13fcc0f9 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -160,8 +160,7 @@ Variable shape tensor tensor per row: * 
**data** is a ``List`` holding tensor elements of a single tensor. - Data type of the list elements is uniform across the entire column - and also provided in metadata. + Data type of the list elements is uniform across the entire column. * **shape** is a ``FixedSizeList[ndim]`` of the tensor shape where the size of the list ``ndim`` is equal to the number of dimensions of the tensor. @@ -204,9 +203,9 @@ Variable shape tensor take advantage of the uniformity. uniform_dimensions can be left out, in which case it is assumed that all dimensions might be variable. - * **uniform_shape** = shape over dimensions that are guaranteed to stay - constant over of all tensors in the array if all their ragged dimension - sizes were replaced by 0. + * **uniform_shape** = shape of the dimensions that are guaranteed to stay + constant over all tensors in the array, with the shape of the ragged dimensions + set to 0. An array containing tensor with shape (2, 3, 4) and uniform dimensions (0, 2) would have uniform shape (2, 0, 4). From d73713012a361b7266e8dbc9773830e0bcc12d2d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 25 Sep 2023 16:27:14 +0200 Subject: [PATCH 09/18] Review feedback --- docs/source/format/CanonicalExtensions.rst | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 9dc13fcc0f9..f00368ef93a 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -211,10 +211,16 @@ Variable shape tensor * Description of the serialization: - The metadata must be a valid JSON object including number of - dimensions of the contained tensors as an integer with key **"ndim"** - plus optional dimension names with keys **"dim_names"** and ordering of - the dimensions with key **"permutation"**. 
+ The metadata must be a valid JSON object, that optionally includes + dimension names with keys **"dim_names"**, ordering of + dimensions with key **"permutation"**, indices of dimensions whose sizes + are guaranteed to remain constant with key **"uniform_dimensions"** and + shape of those dimensions with key **"uniform_shape"**. + Minimal metadata is an empty JSON object. + + - Example of minimal metadata is: + + ``{}`` - Example with ``dim_names`` metadata for NCHW ordered data: From e646a79b1dd1b1d1c66b9c8ce55357ac54f4bce6 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 3 Oct 2023 14:25:20 +0200 Subject: [PATCH 10/18] Update docs/source/format/CanonicalExtensions.rst Co-authored-by: Antoine Pitrou --- docs/source/format/CanonicalExtensions.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index f00368ef93a..37d848dc931 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -241,8 +241,8 @@ Variable shape tensor .. note:: - With the exception of permutation all other parameters and storage - of VariableShapeTensor define the *physical* storage of the tensor. + With the exception of ``permutation``, the parameters and storage + of VariableShapeTensor relate to the *physical* storage of the tensor. 
For example, consider a tensor with: shape = [10, 20, 30] From 0fafccae4489b14c2c657c9e3e1bc8e834503235 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 4 Oct 2023 11:43:16 +0200 Subject: [PATCH 11/18] Review feedback --- docs/source/format/CanonicalExtensions.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 37d848dc931..127943a0e68 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -161,7 +161,7 @@ Variable shape tensor * **data** is a ``List`` holding tensor elements of a single tensor. Data type of the list elements is uniform across the entire column. - * **shape** is a ``FixedSizeList[ndim]`` of the tensor shape where + * **shape** is a ``FixedSizeList[ndim]`` of the tensor shape where the size of the list ``ndim`` is equal to the number of dimensions of the tensor. From 18984fe8115064ee5c9e05522dc61624bcedb389 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 5 Oct 2023 13:06:08 +0200 Subject: [PATCH 12/18] Removing implementation --- cpp/src/arrow/CMakeLists.txt | 1 - cpp/src/arrow/extension/CMakeLists.txt | 4 +- cpp/src/arrow/extension/fixed_shape_tensor.cc | 14 +- cpp/src/arrow/extension/fixed_shape_tensor.h | 11 - ...ray_test.cc => fixed_shape_tensor_test.cc} | 253 ++---------------- .../arrow/extension/variable_shape_tensor.cc | 246 ----------------- .../arrow/extension/variable_shape_tensor.h | 117 -------- cpp/src/arrow/extension_type.cc | 11 +- docs/source/python/api/arrays.rst | 1 - python/pyarrow/__init__.py | 5 +- python/pyarrow/array.pxi | 136 ---------- python/pyarrow/includes/libarrow.pxd | 29 -- python/pyarrow/lib.pxd | 7 - python/pyarrow/public-api.pxi | 2 - python/pyarrow/tests/test_extension_type.py | 145 ---------- python/pyarrow/types.pxi | 228 ---------------- 16 files changed, 33 insertions(+), 1177 deletions(-) rename 
cpp/src/arrow/extension/{tensor_extension_array_test.cc => fixed_shape_tensor_test.cc} (64%) delete mode 100644 cpp/src/arrow/extension/variable_shape_tensor.cc delete mode 100644 cpp/src/arrow/extension/variable_shape_tensor.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 20bb550bc5c..9a611701153 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -548,7 +548,6 @@ if(ARROW_JSON) list(APPEND ARROW_SRCS extension/fixed_shape_tensor.cc - extension/variable_shape_tensor.cc json/options.cc json/chunked_builder.cc json/chunker.cc diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index 2e29e1f2b2e..c15c42874d4 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -17,8 +17,8 @@ add_arrow_test(test SOURCES - tensor_extension_array_test.cc + fixed_shape_tensor_test.cc PREFIX - "arrow-canonical-extensions") + "arrow-fixed-shape-tensor") arrow_install_all_headers("arrow/extension") diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 4d07a61541c..e4195ea9e66 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -35,7 +35,9 @@ namespace rj = arrow::rapidjson; namespace arrow { -namespace internal { +namespace extension { + +namespace { Status ComputeStrides(const FixedWidthType& type, const std::vector& shape, const std::vector& permutation, @@ -76,9 +78,7 @@ Status ComputeStrides(const FixedWidthType& type, const std::vector& sh return Status::OK(); } -} // namespace internal - -namespace extension { +} // namespace bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const { if (extension_name() != other.extension_name()) { @@ -303,7 +303,7 @@ const Result> FixedShapeTensorArray::ToTensor() const { std::vector tensor_strides; auto value_type = 
internal::checked_pointer_cast(ext_arr->value_type()); ARROW_RETURN_NOT_OK( - internal::ComputeStrides(*value_type.get(), shape, permutation, &tensor_strides)); + ComputeStrides(*value_type.get(), shape, permutation, &tensor_strides)); ARROW_ASSIGN_OR_RAISE(auto buffers, ext_arr->Flatten()); ARROW_ASSIGN_OR_RAISE( auto tensor, Tensor::Make(ext_arr->value_type(), buffers->data()->buffers[1], shape, @@ -332,8 +332,8 @@ const std::vector& FixedShapeTensorType::strides() { if (strides_.empty()) { auto value_type = internal::checked_pointer_cast(this->value_type_); std::vector tensor_strides; - ARROW_CHECK_OK(internal::ComputeStrides(*value_type.get(), this->shape(), - this->permutation(), &tensor_strides)); + ARROW_CHECK_OK(ComputeStrides(*value_type.get(), this->shape(), this->permutation(), + &tensor_strides)); strides_ = tensor_strides; } return strides_; diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 21631755016..93837f13002 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -15,20 +15,9 @@ // specific language governing permissions and limitations // under the License. 
-#pragma once - #include "arrow/extension_type.h" namespace arrow { -namespace internal { - -ARROW_EXPORT -Status ComputeStrides(const FixedWidthType& type, const std::vector& shape, - const std::vector& permutation, - std::vector* strides); - -} // namespace internal - namespace extension { class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc similarity index 64% rename from cpp/src/arrow/extension/tensor_extension_array_test.cc rename to cpp/src/arrow/extension/fixed_shape_tensor_test.cc index af619826f00..c3c97bc6e57 100644 --- a/cpp/src/arrow/extension/tensor_extension_array_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -16,7 +16,6 @@ // under the License. #include "arrow/extension/fixed_shape_tensor.h" -#include "arrow/extension/variable_shape_tensor.h" #include "arrow/testing/matchers.h" @@ -29,7 +28,6 @@ #include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/key_value_metadata.h" -#include "arrow/util/logging.h" namespace arrow { @@ -37,10 +35,6 @@ using FixedShapeTensorType = extension::FixedShapeTensorType; using extension::fixed_shape_tensor; using extension::FixedShapeTensorArray; -using VariableShapeTensorType = extension::VariableShapeTensorType; -using extension::variable_shape_tensor; -using extension::VariableShapeTensorArray; - class TestExtensionType : public ::testing::Test { public: void SetUp() override { @@ -160,47 +154,43 @@ TEST_F(TestExtensionType, CreateFromArray) { ASSERT_EQ(ext_arr->null_count(), 0); } -template void CheckSerializationRoundtrip(const std::shared_ptr& ext_type) { - auto type = internal::checked_pointer_cast(ext_type); - auto serialized = type->Serialize(); + auto fst_type = internal::checked_pointer_cast(ext_type); + auto serialized = fst_type->Serialize(); ASSERT_OK_AND_ASSIGN(auto deserialized, - type->Deserialize(type->storage_type(), 
serialized)); - ASSERT_TRUE(type->Equals(*deserialized)); + fst_type->Deserialize(fst_type->storage_type(), serialized)); + ASSERT_TRUE(fst_type->Equals(*deserialized)); } -void CheckDeserializationRaises(const std::shared_ptr& extension_type, - const std::shared_ptr& storage_type, +void CheckDeserializationRaises(const std::shared_ptr& storage_type, const std::string& serialized, const std::string& expected_message) { - auto ext_type = internal::checked_pointer_cast(extension_type); + auto fst_type = internal::checked_pointer_cast( + fixed_shape_tensor(int64(), {3, 4})); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr(expected_message), - ext_type->Deserialize(storage_type, serialized)); + fst_type->Deserialize(storage_type, serialized)); } TEST_F(TestExtensionType, MetadataSerializationRoundtrip) { - using T = FixedShapeTensorType; - CheckSerializationRoundtrip(ext_type_); - CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {}, {}, {})); - CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {0}, {}, {})); - CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {1}, {0}, {"x"})); - CheckSerializationRoundtrip( + CheckSerializationRoundtrip(ext_type_); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {}, {}, {})); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {0}, {}, {})); + CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {1}, {0}, {"x"})); + CheckSerializationRoundtrip( fixed_shape_tensor(value_type_, {256, 256, 3}, {0, 1, 2}, {"H", "W", "C"})); - CheckSerializationRoundtrip( + CheckSerializationRoundtrip( fixed_shape_tensor(value_type_, {256, 256, 3}, {2, 0, 1}, {"C", "H", "W"})); auto storage_type = fixed_size_list(int64(), 12); - CheckDeserializationRaises(ext_type_, boolean(), R"({"shape":[3,4]})", + CheckDeserializationRaises(boolean(), R"({"shape":[3,4]})", "Expected FixedSizeList storage type, got bool"); - CheckDeserializationRaises(ext_type_, storage_type, 
R"({"dim_names":["x","y"]})", + CheckDeserializationRaises(storage_type, R"({"dim_names":["x","y"]})", "Invalid serialized JSON data"); - CheckDeserializationRaises(ext_type_, storage_type, R"({"shape":(3,4)})", + CheckDeserializationRaises(storage_type, R"({"shape":(3,4)})", "Invalid serialized JSON data"); - CheckDeserializationRaises(ext_type_, storage_type, - R"({"shape":[3,4],"permutation":[1,0,2]})", + CheckDeserializationRaises(storage_type, R"({"shape":[3,4],"permutation":[1,0,2]})", "Invalid permutation"); - CheckDeserializationRaises(ext_type_, storage_type, - R"({"shape":[3],"dim_names":["x","y"]})", + CheckDeserializationRaises(storage_type, R"({"shape":[3],"dim_names":["x","y"]})", "Invalid dim_names"); } @@ -444,209 +434,4 @@ TEST_F(TestExtensionType, ComputeStrides) { ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})"); } -class TestVariableShapeTensorType : public ::testing::Test { - public: - void SetUp() override { - ndim_ = 3; - value_type_ = int64(); - data_type_ = list(value_type_); - shape_type_ = fixed_size_list(uint32(), ndim_); - permutation_ = {0, 1, 2}; - dim_names_ = {"x", "y", "z"}; - uniform_dimensions_ = {1}; - uniform_shape_ = {0, 1, 0}; - ext_type_ = internal::checked_pointer_cast( - variable_shape_tensor(value_type_, ndim_, permutation_, dim_names_, - uniform_dimensions_, uniform_shape_)); - shapes_ = - ArrayFromJSON(fixed_size_list(uint32(), ndim_), "[[2,1,3],[2,1,2],[3,1,3]]"); - data_ = ArrayFromJSON(list(value_type_), - "[[0,1,2,3,4,5],[6,7,8,9],[10,11,12,13,14,15,16,17,18]]"); - serialized_ = - R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"uniform_dimensions":[1],"uniform_shape":[0,1,0]})"; - storage_arr_ = ArrayFromJSON( - ext_type_->storage_type(), - R"([[[2,3,1],[0,1,2,3,4,5]],[[1,2,2],[6,7,8,9]],[[3,1,3],[10,11,12,13,14,15,16,17,18]]])"); - ext_arr_ = internal::checked_pointer_cast( - ExtensionType::WrapArray(ext_type_, storage_arr_)); - } - - protected: - uint32_t ndim_; - 
std::shared_ptr value_type_; - std::shared_ptr data_type_; - std::shared_ptr shape_type_; - std::vector permutation_; - std::vector uniform_dimensions_; - std::vector uniform_shape_; - std::vector dim_names_; - std::shared_ptr ext_type_; - std::shared_ptr shapes_; - std::shared_ptr data_; - std::string serialized_; - std::shared_ptr storage_arr_; - std::shared_ptr ext_arr_; -}; - -TEST_F(TestVariableShapeTensorType, CheckDummyRegistration) { - // We need a registered dummy type at runtime to allow for IPC deserialization - auto registered_type = GetExtensionType("arrow.variable_shape_tensor"); - ASSERT_TRUE(registered_type->type_id == Type::EXTENSION); -} - -TEST_F(TestVariableShapeTensorType, CreateExtensionType) { - auto exact_ext_type = - internal::checked_pointer_cast(ext_type_); - - // Test ExtensionType methods - ASSERT_EQ(ext_type_->extension_name(), "arrow.variable_shape_tensor"); - ASSERT_TRUE(ext_type_->Equals(*exact_ext_type)); - auto expected_type = struct_({ - ::arrow::field("shape", fixed_size_list(uint32(), ndim_)), - ::arrow::field("data", list(value_type_)), - }); - - ASSERT_TRUE(ext_type_->storage_type()->Equals(*expected_type)); - ASSERT_EQ(ext_type_->Serialize(), serialized_); - ASSERT_OK_AND_ASSIGN(auto ds, - ext_type_->Deserialize(ext_type_->storage_type(), serialized_)); - auto deserialized = internal::checked_pointer_cast(ds); - ASSERT_TRUE(deserialized->Equals(*exact_ext_type)); - ASSERT_TRUE(deserialized->Equals(*ext_type_)); - - // Test FixedShapeTensorType methods - ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); - ASSERT_EQ(exact_ext_type->ndim(), ndim_); - ASSERT_EQ(exact_ext_type->value_type(), value_type_); - ASSERT_EQ(exact_ext_type->permutation(), permutation_); - ASSERT_EQ(exact_ext_type->dim_names(), dim_names_); - - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, - testing::HasSubstr("Invalid: permutation size must match ndim. 
Expected: 3 Got: 1"), - VariableShapeTensorType::Make(value_type_, ndim_, {0})); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("Invalid: dim_names size must match ndim."), - VariableShapeTensorType::Make(value_type_, ndim_, {}, {"x"})); -} - -TEST_F(TestVariableShapeTensorType, EqualsCases) { - auto ext_type_permutation_1 = variable_shape_tensor(int64(), 2, {0, 1}, {"x", "y"}); - auto ext_type_permutation_2 = variable_shape_tensor(int64(), 2, {1, 0}, {"x", "y"}); - auto ext_type_no_permutation = variable_shape_tensor(int64(), 2, {}, {"x", "y"}); - - ASSERT_TRUE(ext_type_permutation_1->Equals(ext_type_permutation_1)); - - ASSERT_FALSE( - variable_shape_tensor(int32(), 2, {}, {"x", "y"})->Equals(ext_type_no_permutation)); - ASSERT_FALSE(variable_shape_tensor(int64(), 2, {}, {}) - ->Equals(variable_shape_tensor(int64(), 3, {}, {}))); - ASSERT_FALSE( - variable_shape_tensor(int64(), 2, {}, {"H", "W"})->Equals(ext_type_no_permutation)); - - ASSERT_TRUE(ext_type_no_permutation->Equals(ext_type_permutation_1)); - ASSERT_TRUE(ext_type_permutation_1->Equals(ext_type_no_permutation)); - ASSERT_FALSE(ext_type_no_permutation->Equals(ext_type_permutation_2)); - ASSERT_FALSE(ext_type_permutation_2->Equals(ext_type_no_permutation)); - ASSERT_FALSE(ext_type_permutation_1->Equals(ext_type_permutation_2)); - ASSERT_FALSE(ext_type_permutation_2->Equals(ext_type_permutation_1)); -} - -TEST_F(TestVariableShapeTensorType, MetadataSerializationRoundtrip) { - using T = VariableShapeTensorType; - - CheckSerializationRoundtrip(ext_type_); - CheckSerializationRoundtrip(variable_shape_tensor(value_type_, {}, {}, {})); - CheckSerializationRoundtrip(variable_shape_tensor(value_type_, {0}, {}, {})); - CheckSerializationRoundtrip(variable_shape_tensor(value_type_, {1}, {0}, {"x"})); - CheckSerializationRoundtrip( - variable_shape_tensor(value_type_, 3, {0, 1, 2}, {"H", "W", "C"})); - CheckSerializationRoundtrip( - variable_shape_tensor(value_type_, 3, {2, 0, 1}, {"C", "H", 
"W"})); - CheckSerializationRoundtrip( - variable_shape_tensor(value_type_, 3, {2, 0, 1}, {"C", "H", "W"}, {0, 1, 2})); - - auto storage_type = ext_type_->storage_type(); - CheckDeserializationRaises(ext_type_, boolean(), R"({"shape":[3,4]})", - "Expected Struct storage type, got bool"); - CheckDeserializationRaises(ext_type_, storage_type, R"({"shape":(3,4)})", - "Invalid serialized JSON data"); - CheckDeserializationRaises(ext_type_, storage_type, R"({"permutation":[1,0]})", - "Invalid permutation"); - CheckDeserializationRaises(ext_type_, storage_type, R"({"dim_names":["x","y"]})", - "Invalid dim_names"); -} - -TEST_F(TestVariableShapeTensorType, RoudtripBatch) { - auto exact_ext_type = - internal::checked_pointer_cast(ext_type_); - - // Pass extension array, expect getting back extension array - std::shared_ptr read_batch; - auto ext_field = field(/*name=*/"f0", /*type=*/ext_type_); - auto batch = RecordBatch::Make(schema({ext_field}), ext_arr_->length(), {ext_arr_}); - RoundtripBatch(batch, &read_batch); - CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); - - // Pass extension metadata and storage array, expect getting back extension array - std::shared_ptr read_batch2; - auto ext_metadata = - key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, - {"ARROW:extension:metadata", serialized_}}); - ext_field = field(/*name=*/"f0", /*type=*/ext_type_->storage_type(), /*nullable=*/true, - /*metadata=*/ext_metadata); - auto batch2 = RecordBatch::Make(schema({ext_field}), ext_arr_->length(), {ext_arr_}); - RoundtripBatch(batch2, &read_batch2); - CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true); -} - -TEST_F(TestVariableShapeTensorType, ComputeStrides) { - auto shapes = ArrayFromJSON(shape_type_, "[[2,3,1],[2,1,2],[3,1,3]]"); - auto data = - ArrayFromJSON(data_type_, "[[1,1,2,3,4,5],[2,7,8,9],[10,11,12,13,14,15,16,17,18]]"); - std::vector> fields = {field("shapes", shape_type_), - field("data", data_type_)}; - 
ASSERT_OK_AND_ASSIGN(auto storage_arr, StructArray::Make({shapes, data}, fields)); - auto ext_arr = ExtensionType::WrapArray(ext_type_, storage_arr); - auto ext_array = std::static_pointer_cast(ext_arr); - - std::shared_ptr t, tensor; - - ASSERT_OK_AND_ASSIGN(t, ext_array->GetTensor(0)); - ASSERT_EQ(t->shape(), (std::vector{2, 3, 1})); - ASSERT_EQ(t->strides(), (std::vector{24, 8, 8})); - - std::vector shape = {2, 3, 1}; - std::vector strides = {sizeof(int64_t) * 3, sizeof(int64_t) * 1, - sizeof(int64_t) * 1}; - std::vector values = {1, 1, 2, 3, 4, 5}; - auto data_buffer = Buffer::Wrap(values); - ASSERT_OK_AND_ASSIGN(tensor, - Tensor::Make(int64(), data_buffer, shape, strides, dim_names_)); - ASSERT_TRUE(tensor->Equals(*t)); - - ASSERT_OK_AND_ASSIGN(t, ext_array->GetTensor(1)); - ASSERT_EQ(t->shape(), (std::vector{2, 1, 2})); - ASSERT_EQ(t->strides(), (std::vector{16, 16, 8})); - - ASSERT_OK_AND_ASSIGN(t, ext_array->GetTensor(2)); - ASSERT_EQ(t->shape(), (std::vector{3, 1, 3})); - ASSERT_EQ(t->strides(), (std::vector{24, 24, 8})); - - shape = {3, 1, 3}; - strides = {sizeof(int64_t) * 3, sizeof(int64_t) * 3, sizeof(int64_t) * 1}; - values = {10, 11, 12, 13, 14, 15, 16, 17, 18}; - data_buffer = Buffer::Wrap(values); - ASSERT_OK_AND_ASSIGN(tensor, - Tensor::Make(int64(), data_buffer, shape, strides, dim_names_)); - - ASSERT_EQ(tensor->strides(), t->strides()); - ASSERT_EQ(tensor->shape(), t->shape()); - ASSERT_EQ(tensor->dim_names(), t->dim_names()); - ASSERT_EQ(tensor->type(), t->type()); - ASSERT_EQ(tensor->is_contiguous(), t->is_contiguous()); - ASSERT_EQ(tensor->is_column_major(), t->is_column_major()); - ASSERT_TRUE(tensor->Equals(*t)); -} - } // namespace arrow diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc deleted file mode 100644 index 89bbf4011f9..00000000000 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ /dev/null @@ -1,246 +0,0 @@ -// Licensed to the Apache Software Foundation 
(ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include - -#include "arrow/extension/fixed_shape_tensor.h" -#include "arrow/extension/variable_shape_tensor.h" - -#include "arrow/array/array_nested.h" -#include "arrow/array/array_primitive.h" -#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep -#include "arrow/scalar.h" -#include "arrow/tensor.h" -#include "arrow/util/int_util_overflow.h" -#include "arrow/util/logging.h" -#include "arrow/util/sort.h" - -#include -#include - -namespace rj = arrow::rapidjson; - -namespace arrow { -namespace extension { - -const Result> VariableShapeTensorArray::GetTensor( - const int64_t i) const { - auto ext_arr = internal::checked_pointer_cast(this->storage()); - auto ext_type = internal::checked_pointer_cast(this->type()); - auto value_type = - internal::checked_pointer_cast(ext_type->value_type()); - auto ndim = ext_type->ndim(); - auto dim_names = ext_type->dim_names(); - auto shapes = - std::static_pointer_cast(ext_arr->field(0))->value_slice(i); - - std::vector shape; - for (int64_t j = 0; j < ndim; ++j) { - ARROW_ASSIGN_OR_RAISE(auto size, shapes->GetScalar(j)); - shape.push_back( - static_cast(std::static_pointer_cast(size)->value)); - } - - std::vector strides; - // TODO: optimize ComputeStrides for ragged 
tensors - ARROW_CHECK_OK(internal::ComputeStrides(*value_type.get(), shape, - ext_type->permutation(), &strides)); - - auto list_arr = - std::static_pointer_cast(ext_arr->field(1))->value_slice(i)->data(); - auto bw = value_type->byte_width(); - auto buffer = - SliceBuffer(list_arr->buffers[1], list_arr->offset * bw, list_arr->length * bw); - - return Tensor::Make(ext_type->value_type(), buffer, shape, strides, dim_names); -} - -bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const { - if (extension_name() != other.extension_name()) { - return false; - } - const auto& other_ext = static_cast(other); - if (this->ndim() != other_ext.ndim()) { - return false; - } - - auto is_permutation_trivial = [](const std::vector& permutation) { - for (size_t i = 1; i < permutation.size(); ++i) { - if (permutation[i - 1] + 1 != permutation[i]) { - return false; - } - } - return true; - }; - const bool permutation_equivalent = - ((permutation_ == other_ext.permutation()) || - (permutation_.empty() && is_permutation_trivial(other_ext.permutation())) || - (is_permutation_trivial(permutation_) && other_ext.permutation().empty())); - - return (storage_type()->Equals(other_ext.storage_type())) && - (dim_names_ == other_ext.dim_names()) && - (uniform_dimensions_ == other_ext.uniform_dimensions_) && - (uniform_shape_ == other_ext.uniform_shape()) && permutation_equivalent; -} - -std::string VariableShapeTensorType::Serialize() const { - rj::Document document; - document.SetObject(); - rj::Document::AllocatorType& allocator = document.GetAllocator(); - - if (!permutation_.empty()) { - rj::Value permutation(rj::kArrayType); - for (auto v : permutation_) { - permutation.PushBack(v, allocator); - } - document.AddMember(rj::Value("permutation", allocator), permutation, allocator); - } - - if (!dim_names_.empty()) { - rj::Value dim_names(rj::kArrayType); - for (std::string v : dim_names_) { - dim_names.PushBack(rj::Value{}.SetString(v.c_str(), allocator), allocator); - 
} - document.AddMember(rj::Value("dim_names", allocator), dim_names, allocator); - } - - if (!uniform_dimensions_.empty()) { - rj::Value uniform_dimensions(rj::kArrayType); - for (auto v : uniform_dimensions_) { - uniform_dimensions.PushBack(v, allocator); - } - document.AddMember(rj::Value("uniform_dimensions", allocator), uniform_dimensions, - allocator); - } - - if (!uniform_shape_.empty()) { - rj::Value uniform_shape(rj::kArrayType); - for (auto v : uniform_shape_) { - uniform_shape.PushBack(v, allocator); - } - document.AddMember(rj::Value("uniform_shape", allocator), uniform_shape, allocator); - } - - rj::StringBuffer buffer; - rj::Writer writer(buffer); - document.Accept(writer); - return buffer.GetString(); -} - -Result> VariableShapeTensorType::Deserialize( - std::shared_ptr storage_type, const std::string& serialized_data) const { - if (storage_type->id() != Type::STRUCT) { - return Status::Invalid("Expected Struct storage type, got ", - storage_type->ToString()); - } - auto value_type = storage_type->field(1)->type()->field(0)->type(); - const size_t ndim = - std::static_pointer_cast(storage_type->field(0)->type()) - ->list_size(); - - rj::Document document; - if (document.Parse(serialized_data.data(), serialized_data.length()).HasParseError()) { - return Status::Invalid("Invalid serialized JSON data: ", serialized_data); - } - - std::vector permutation; - if (document.HasMember("permutation")) { - for (auto& x : document["permutation"].GetArray()) { - permutation.emplace_back(x.GetInt64()); - } - if (permutation.size() != ndim) { - return Status::Invalid("Invalid permutation"); - } - } - std::vector dim_names; - if (document.HasMember("dim_names")) { - for (auto& x : document["dim_names"].GetArray()) { - dim_names.emplace_back(x.GetString()); - } - if (dim_names.size() != ndim) { - return Status::Invalid("Invalid dim_names"); - } - } - - std::vector uniform_dimensions; - if (document.HasMember("uniform_dimensions")) { - for (auto& x : 
document["uniform_dimensions"].GetArray()) { - uniform_dimensions.emplace_back(x.GetInt64()); - } - if (uniform_dimensions.size() > ndim) { - return Status::Invalid("Invalid uniform_dimensions"); - } - } - - std::vector uniform_shape; - if (document.HasMember("uniform_shape")) { - for (auto& x : document["uniform_shape"].GetArray()) { - uniform_shape.emplace_back(x.GetInt64()); - } - if (uniform_shape.size() > ndim) { - return Status::Invalid("Invalid uniform_shape"); - } - } - - return variable_shape_tensor(value_type, static_cast(ndim), permutation, - dim_names, uniform_dimensions, uniform_shape); -} - -std::shared_ptr VariableShapeTensorType::MakeArray( - std::shared_ptr data) const { - DCHECK_EQ(data->type->id(), Type::EXTENSION); - DCHECK_EQ("arrow.variable_shape_tensor", - static_cast(*data->type).extension_name()); - return std::make_shared(data); -} - -Result> VariableShapeTensorType::Make( - const std::shared_ptr& value_type, const uint32_t& ndim, - const std::vector& permutation, const std::vector& dim_names, - const std::vector& uniform_dimensions, - const std::vector& uniform_shape) { - if (!permutation.empty() && permutation.size() != ndim) { - return Status::Invalid("permutation size must match ndim. Expected: ", ndim, - " Got: ", permutation.size()); - } - if (!dim_names.empty() && dim_names.size() != ndim) { - return Status::Invalid("dim_names size must match ndim. 
Expected: ", ndim, - " Got: ", dim_names.size()); - } - if (uniform_dimensions.size() > ndim) { - return Status::Invalid("uniform_dimensions size must be less or equal ndim."); - } - if (uniform_shape.size() > ndim) { - return Status::Invalid("uniform_shape size must be less or equal ndim."); - } - return std::make_shared( - value_type, ndim, permutation, dim_names, uniform_dimensions, uniform_shape); -} - -std::shared_ptr variable_shape_tensor( - const std::shared_ptr& value_type, const uint32_t& ndim, - const std::vector& permutation, const std::vector& dim_names, - const std::vector& uniform_dimensions, - const std::vector& uniform_shape) { - auto maybe_type = VariableShapeTensorType::Make( - value_type, ndim, permutation, dim_names, uniform_dimensions, uniform_shape); - ARROW_DCHECK_OK(maybe_type.status()); - return maybe_type.MoveValueUnsafe(); -} - -} // namespace extension -} // namespace arrow diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h deleted file mode 100644 index c2c40b364f8..00000000000 --- a/cpp/src/arrow/extension/variable_shape_tensor.h +++ /dev/null @@ -1,117 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include "arrow/extension_type.h" - -namespace arrow { -namespace extension { - -class ARROW_EXPORT VariableShapeTensorArray : public ExtensionArray { - public: - using ExtensionArray::ExtensionArray; - - /// \brief Get a Tensor of VariableShapeTensorArray at i - /// - /// This method will return a Tensor from VariableShapeTensorArray with strides - /// derived from shape and permutation of VariableShapeTensorType. Shape and - /// dim_names will be permuted according to permutation stored in the - /// VariableShapeTensorType metadata. - const Result> GetTensor(const int64_t i) const; -}; - -/// \brief Concrete type class for variable-shape Tensor data. -/// This is a canonical arrow extension type. -/// See: https://arrow.apache.org/docs/format/CanonicalExtensions.html -class ARROW_EXPORT VariableShapeTensorType : public ExtensionType { - public: - VariableShapeTensorType(const std::shared_ptr& value_type, - const uint32_t& ndim, - const std::vector& permutation = {}, - const std::vector& dim_names = {}, - const std::vector& uniform_dimensions = {}, - const std::vector& uniform_shape = {}) - : ExtensionType(struct_({::arrow::field("shape", fixed_size_list(uint32(), ndim)), - ::arrow::field("data", list(value_type))})), - value_type_(value_type), - permutation_(permutation), - dim_names_(dim_names), - uniform_dimensions_(uniform_dimensions), - uniform_shape_(uniform_shape) {} - - std::string extension_name() const override { return "arrow.variable_shape_tensor"; } - - /// Number of dimensions of tensor elements - uint32_t ndim() const { - std::shared_ptr storage_type = this->storage_type()->field(0)->type(); - return std::static_pointer_cast(storage_type)->list_size(); - } - - /// Value type of tensor elements - const std::shared_ptr value_type() const { return value_type_; } - - /// Permutation mapping from logical to physical memory layout of tensor elements - const std::vector& permutation() const { return permutation_; } - - /// Dimension 
names of tensor elements. Dimensions are ordered physically. - const std::vector& dim_names() const { return dim_names_; } - - /// Indexes of ragged dimensions. - const std::vector& uniform_dimensions() const { return uniform_dimensions_; } - - /// Shape of uniform dimensions. - const std::vector& uniform_shape() const { return uniform_shape_; } - - bool ExtensionEquals(const ExtensionType& other) const override; - - std::string Serialize() const override; - - Result> Deserialize( - std::shared_ptr storage_type, - const std::string& serialized_data) const override; - - /// Create a VariableShapeTensorArray from ArrayData - std::shared_ptr MakeArray(std::shared_ptr data) const override; - - /// \brief Create a VariableShapeTensorType instance - static Result> Make( - const std::shared_ptr& value_type, const uint32_t& ndim, - const std::vector& permutation = {}, - const std::vector& dim_names = {}, - const std::vector& uniform_dimensions = {}, - const std::vector& uniform_shape = {}); - - private: - std::shared_ptr storage_type_; - std::shared_ptr value_type_; - std::vector permutation_; - std::vector dim_names_; - std::vector uniform_dimensions_; - std::vector uniform_shape_; -}; - -/// \brief Return a VariableShapeTensorType instance. 
-ARROW_EXPORT std::shared_ptr variable_shape_tensor( - const std::shared_ptr& value_type, const uint32_t& ndim, - const std::vector& permutation = {}, - const std::vector& dim_names = {}, - const std::vector& uniform_dimensions = {}, - const std::vector& uniform_shape = {}); - -} // namespace extension -} // namespace arrow diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index b3a3dfc6ef6..1199336763d 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -29,7 +29,6 @@ #include "arrow/config.h" #ifdef ARROW_JSON #include "arrow/extension/fixed_shape_tensor.h" -#include "arrow/extension/variable_shape_tensor.h" #endif #include "arrow/status.h" #include "arrow/type.h" @@ -147,12 +146,10 @@ static void CreateGlobalRegistry() { #ifdef ARROW_JSON // Register canonical extension types - auto ext_types = {extension::fixed_shape_tensor(int64(), {}), - extension::variable_shape_tensor(int64(), 0)}; - for (const auto& ext_type : ext_types) { - ARROW_CHECK_OK( - g_registry->RegisterType(checked_pointer_cast(ext_type))); - } + auto ext_type = + checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); + + ARROW_CHECK_OK(g_registry->RegisterType(ext_type)); #endif } diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index 007a931c652..73b5e063ff1 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -81,7 +81,6 @@ may expose data type-specific methods or properties. UnionArray ExtensionArray FixedShapeTensorArray - VariableShapeTensorArray .. 
_api.scalar: diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index f57ca74f91a..ee0d07bb2c8 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -171,7 +171,6 @@ def print_entry(label, value): dictionary, run_end_encoded, fixed_shape_tensor, - variable_shape_tensor, field, type_for_alias, DataType, DictionaryType, StructType, @@ -181,8 +180,7 @@ def print_entry(label, value): FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, RunEndEncodedType, FixedShapeTensorType, - VariableShapeTensorType, PyExtensionType, - UnknownExtensionType, + PyExtensionType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, KeyValueMetadata, @@ -214,7 +212,6 @@ def print_entry(label, value): MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, RunEndEncodedArray, FixedShapeTensorArray, - VariableShapeTensorArray, scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 6f5d0eb4e67..e36d8b2f043 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3507,142 +3507,6 @@ class FixedShapeTensorArray(ExtensionArray): ) -cdef class VariableShapeTensorArray(ExtensionArray): - """ - Concrete class for variable shape tensor extension arrays. 
- - Examples - -------- - Define the extension type for tensor array - - >>> import pyarrow as pa - >>> tensor_type = pa.variable_shape_tensor(pa.int32(), 2) - - Create an extension array - - >>> shapes = pa.array([[2, 3], [1, 2]], pa.list_(pa.uint32(), 2)) - >>> values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(pa.int32())) - >>> arr = pa.StructArray.from_arrays([shapes, values], names=["shape", "data"]) - >>> pa.ExtensionArray.from_storage(tensor_type, arr) - - -- is_valid: all not null - -- child 0 type: fixed_size_list[2] - [ - [ - 2, - 3 - ], - [ - 1, - 2 - ] - ] - -- child 1 type: list - [ - [ - 1, - 2, - 3, - 4, - 5, - 6 - ], - [ - 7, - 8 - ] - ] - """ - - def to_numpy_ndarray(self): - """ - Convert variable shape tensor extension array to list of numpy arrays. - """ - cdef: - CVariableShapeTensorArray * ext_array = (self.ap) - CResult[shared_ptr[CTensor]] ctensor - - tensors = [] - for i in range(len(self.storage)): - with nogil: - ctensor = ext_array.GetTensor(i) - tensors.append(pyarrow_wrap_tensor(GetResultValue(ctensor)).to_numpy()) - - return tensors - - @staticmethod - def from_numpy_ndarray(obj): - """ - Convert a list of numpy arrays ndarrays to a variable shape tensor extension array. - The length of the list will become the length of the variable shape tensor array. - - Numpy arrays needs to be C-contiguous in memory (``obj.flags["C_CONTIGUOUS"]==True``). - - Parameters - ---------- - obj : list(numpy.ndarray) - - Examples - -------- - >>> import pyarrow as pa - >>> import numpy as np - >>> ndarray_list = [ - ... np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32), - ... np.array([[7, 8]], dtype=np.float32), - ... 
] - >>> pa.VariableShapeTensorArray.from_numpy_ndarray(ndarray_list) - - -- is_valid: all not null - -- child 0 type: fixed_size_list[2] - [ - [ - 2, - 3 - ], - [ - 1, - 2 - ] - ] - -- child 1 type: list - [ - [ - 1, - 2, - 3, - 4, - 5, - 6 - ], - [ - 7, - 8 - ] - ] - """ - if not all([o.flags["C_CONTIGUOUS"] for o in obj]): - raise ValueError('The data in the numpy arrays need to be in a single, ' - 'C-style contiguous segment.') - numpy_type = obj[0].dtype - ndim = obj[0].ndim - - if not all([o.dtype == numpy_type for o in obj]): - raise ValueError('All numpy arrays need to have the same dtype.') - - if not all([o.ndim == ndim for o in obj]): - raise ValueError('All numpy arrays need to have the same ndim.') - - arrow_type = from_numpy_dtype(numpy_type) - values = array([np.ravel(o, order='C') for o in obj], list_(arrow_type)) - shapes = array([o.shape for o in obj], list_(uint32(), list_size=ndim)) - struct_arr = StructArray.from_arrays([shapes, values], names=["shape", "data"]) - - return ExtensionArray.from_storage( - variable_shape_tensor(arrow_type, ndim), - struct_arr - ) - - cdef dict _array_classes = { _Type_NA: NullArray, _Type_BOOL: BooleanArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9487372eb8c..f4d6541fa72 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2634,35 +2634,6 @@ cdef extern from "arrow/extension_type.h" namespace "arrow": shared_ptr[CArray] storage() -cdef extern from "arrow/extension/variable_shape_tensor.h" namespace "arrow::extension": - cdef cppclass CVariableShapeTensorType \ - " arrow::extension::VariableShapeTensorType"(CExtensionType): - - @staticmethod - CResult[shared_ptr[CDataType]] Make(const shared_ptr[CDataType]& value_type, - const uint32_t ndim, - const vector[int64_t]& permutation, - const vector[c_string]& dim_names, - const vector[int64_t]& uniform_dimensions, - const vector[int64_t]& uniform_shape) - - 
CResult[shared_ptr[CDataType]] Deserialize(const shared_ptr[CDataType] storage_type, - const c_string& serialized_data) const - - c_string Serialize() const - - const shared_ptr[CDataType] value_type() - const uint32_t ndim() - const vector[int64_t] permutation() - const vector[c_string] dim_names() - const vector[int64_t] uniform_dimensions() - const vector[int64_t] uniform_shape() - - cdef cppclass CVariableShapeTensorArray \ - " arrow::extension::VariableShapeTensorArray"(CExtensionArray) nogil: - CResult[shared_ptr[CTensor]] GetTensor(const int64_t i) const - - cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension": cdef cppclass CFixedShapeTensorType \ " arrow::extension::FixedShapeTensorType"(CExtensionType): diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index c30ef95ebb7..63ebe6aea82 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -200,11 +200,6 @@ cdef class ExtensionType(BaseExtensionType): const CPyExtensionType* cpy_ext_type -cdef class VariableShapeTensorType(BaseExtensionType): - cdef: - const CVariableShapeTensorType* tensor_ext_type - - cdef class FixedShapeTensorType(BaseExtensionType): cdef: const CFixedShapeTensorType* tensor_ext_type @@ -458,8 +453,6 @@ cdef class DictionaryArray(Array): cdef class ExtensionArray(Array): pass -cdef class VariableShapeTensorArray(ExtensionArray): - pass cdef class MonthDayNanoIntervalArray(Array): pass diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 4a1a2958491..72e16f2cec3 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -118,8 +118,6 @@ cdef api object pyarrow_wrap_data_type( cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type) if cpy_ext_type != nullptr: return cpy_ext_type.GetInstance() - elif ext_type.extension_name() == b"arrow.variable_shape_tensor": - out = VariableShapeTensorType.__new__(VariableShapeTensorType) elif ext_type.extension_name() == 
b"arrow.fixed_shape_tensor": out = FixedShapeTensorType.__new__(FixedShapeTensorType) else: diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 77b6d469c99..1eb7d5fa761 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1212,39 +1212,6 @@ def test_tensor_type(): assert tensor_type.dim_names == ['C', 'H', 'W'] assert tensor_type.permutation is None - tensor_type = pa.variable_shape_tensor(pa.int8(), 2) - expected_storage_type = pa.struct([ - pa.field("shape", pa.list_(pa.uint32(), 2)), - pa.field("data", pa.list_(pa.int8())) - ]) - assert tensor_type.extension_name == "arrow.variable_shape_tensor" - assert tensor_type.storage_type == expected_storage_type - assert tensor_type.ndim == 2 - assert tensor_type.dim_names is None - assert tensor_type.permutation is None - - tensor_type = pa.variable_shape_tensor(pa.int64(), 3, dim_names=['C', 'H', 'W']) - expected_storage_type = pa.struct([ - pa.field("shape", pa.list_(pa.uint32(), 3)), - pa.field("data", pa.list_(pa.int64())) - ]) - assert tensor_type.extension_name == "arrow.variable_shape_tensor" - assert tensor_type.storage_type == expected_storage_type - assert tensor_type.ndim == 3 - assert tensor_type.dim_names == ['C', 'H', 'W'] - assert tensor_type.permutation is None - - tensor_type = pa.variable_shape_tensor(pa.bool_(), 2, permutation=[1, 0]) - expected_storage_type = pa.struct([ - pa.field("shape", pa.list_(pa.uint32(), 2)), - pa.field("data", pa.list_(pa.bool_())) - ]) - assert tensor_type.extension_name == "arrow.variable_shape_tensor" - assert tensor_type.storage_type == expected_storage_type - assert tensor_type.ndim == 2 - assert tensor_type.dim_names is None - assert tensor_type.permutation == [1, 0] - def test_tensor_class_methods(): tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3]) @@ -1281,55 +1248,6 @@ def test_tensor_class_methods(): arr.to_numpy_ndarray() 
-@pytest.mark.parametrize("value_type", (np.int8, np.int32, np.int64, np.float64)) -def test_variable_shape_tensor_class_method(value_type): - ndim = 2 - shape_type = pa.list_(pa.uint32(), ndim) - arrow_type = pa.from_numpy_dtype(value_type) - tensor_type = pa.variable_shape_tensor( - arrow_type, - ndim, - dim_names=["H", "W"], - permutation=[0, 1], - uniform_dimensions=[0], - uniform_shape=[2, 0], - ) - fields = [pa.field("shape", shape_type), pa.field("data", pa.list_(arrow_type))] - - shapes = pa.array([[2, 3], [2, 1]], shape_type) - values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(arrow_type)) - struct_arr = pa.StructArray.from_arrays([shapes, values], fields=fields) - arr = pa.ExtensionArray.from_storage(tensor_type, struct_arr) - basic_arr = pa.ExtensionArray.from_storage( - pa.variable_shape_tensor(arrow_type, ndim), struct_arr - ) - - storage = pa.array( - [([2, 3], [1, 2, 3, 4, 5, 6]), ([2, 1], [7, 8])], type=pa.struct(fields) - ) - assert pa.ExtensionArray.from_storage(tensor_type, storage).equals(arr) - - assert arr.type == tensor_type - - ndarray_list = [ - np.array([[1, 2, 3], [4, 5, 6]], dtype=value_type), - np.array([[7, 8]], dtype=value_type), - ] - assert all(zip(x == y for x, y in zip(arr.to_numpy_ndarray(), ndarray_list))) - - assert pa.VariableShapeTensorArray.from_numpy_ndarray(ndarray_list).equals( - basic_arr - ) - assert pa.VariableShapeTensorArray.from_numpy_ndarray( - arr.to_numpy_ndarray() - ).equals(basic_arr) - - assert arr.to_pylist() == [ - {"data": [1, 2, 3, 4, 5, 6], "shape": [2, 3]}, - {"data": [7, 8], "shape": [1, 2]}, - ] - - @pytest.mark.parametrize("tensor_type", ( pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]), pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 2, 1]), @@ -1360,47 +1278,6 @@ def test_tensor_type_ipc(tensor_type): assert result.type.shape == [2, 2, 3] -@pytest.mark.parametrize("tensor_type", ( - pa.variable_shape_tensor(pa.int8(), 2), - pa.variable_shape_tensor(pa.int8(), 2, permutation=[1, 0]), - 
pa.variable_shape_tensor(pa.int8(), 2, dim_names=['H', 'W']), - pa.variable_shape_tensor(pa.int8(), 2, uniform_dimensions=[0, 1]), -)) -def test_variable_shape_tensor_type_ipc(tensor_type): - shape_type = tensor_type.storage_type.field(0).type - values_type = tensor_type.storage_type.field(1).type - shapes = pa.array([[2, 3], [1, 2]], shape_type) - values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], values_type) - - struct_arr = pa.StructArray.from_arrays([shapes, values], names=["shape", "data"]) - arr = pa.ExtensionArray.from_storage(tensor_type, struct_arr) - batch = pa.RecordBatch.from_arrays([arr], ["ext"]) - - # check the built array has exactly the expected clss - tensor_class = tensor_type.__arrow_ext_class__() - assert isinstance(arr, tensor_class) - - buf = ipc_write_batch(batch) - del batch - batch = ipc_read_batch(buf) - - result = batch.column(0) - # check the deserialized array class is the expected one - assert isinstance(result, tensor_class) - assert result.type.extension_name == "arrow.variable_shape_tensor" - assert arr.storage.to_pylist() == [ - {"data": [1, 2, 3, 4, 5, 6], "shape": [2, 3]}, - {"data": [7, 8], "shape": [1, 2]}, - ] - - # we get back an actual TensorType - assert isinstance(result.type, pa.VariableShapeTensorType) - assert result.type.value_type == pa.int8() - assert result.type.ndim == 2 - assert result.type.permutation == tensor_type.permutation - assert result.type.dim_names == tensor_type.dim_names - - def test_tensor_type_equality(): tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]) assert tensor_type.extension_name == "arrow.fixed_shape_tensor" @@ -1410,14 +1287,6 @@ def test_tensor_type_equality(): assert tensor_type == tensor_type2 assert not tensor_type == tensor_type3 - tensor_type = pa.variable_shape_tensor(pa.int8(), 2) - assert tensor_type.extension_name == "arrow.variable_shape_tensor" - - tensor_type2 = pa.variable_shape_tensor(pa.int8(), 2) - tensor_type3 = pa.variable_shape_tensor(pa.uint8(), 2) - assert 
tensor_type == tensor_type2 - assert not tensor_type == tensor_type3 - @pytest.mark.pandas def test_extension_to_pandas_storage_type(registered_period_type): @@ -1482,17 +1351,3 @@ def test_tensor_type_is_picklable(pickle_module): result = pickle_module.loads(pickle_module.dumps(expected_arr)) assert result == expected_arr - - expected_type = pa.variable_shape_tensor(pa.int32(), 2) - result = pickle_module.loads(pickle_module.dumps(expected_type)) - - assert result == expected_type - - shapes = pa.array([[2, 3], [1, 2]], pa.list_(pa.uint32(), 2)) - values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(pa.int32())) - arr = pa.StructArray.from_arrays([shapes, values], names=["shape", "data"]) - expected_arr = pa.ExtensionArray.from_storage(expected_type, arr) - - result = pickle_module.loads(pickle_module.dumps(expected_arr)) - - assert result == expected_arr diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index c98a5536f8d..9f8b347d562 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1548,111 +1548,6 @@ cdef class ExtensionType(BaseExtensionType): return ExtensionScalar -cdef class VariableShapeTensorType(BaseExtensionType): - """ - Concrete class for variable shape tensor extension type. - - Examples - -------- - Create an instance of variable shape tensor extension type: - - >>> import pyarrow as pa - >>> pa.variable_shape_tensor(pa.int32(), 2) - VariableShapeTensorType(extension) - - Create an instance of variable shape tensor extension type with - permutation: - - >>> tensor_type = pa.variable_shape_tensor(pa.int8(), 3, - ... permutation=[0, 2, 1]) - >>> tensor_type.permutation - [0, 2, 1] - """ - - cdef void init(self, const shared_ptr[CDataType]& type) except *: - BaseExtensionType.init(self, type) - self.tensor_ext_type = type.get() - - @property - def value_type(self): - """ - Data type of an individual tensor. 
- """ - return pyarrow_wrap_data_type(self.tensor_ext_type.value_type()) - - @property - def ndim(self): - """ - Number of dimensions of the tensors. - """ - return self.tensor_ext_type.ndim() - - @property - def dim_names(self): - """ - Explicit names of the dimensions. - """ - list_of_bytes = self.tensor_ext_type.dim_names() - if len(list_of_bytes) != 0: - return [frombytes(x) for x in list_of_bytes] - else: - return None - - @property - def permutation(self): - """ - Indices of the dimensions ordering. - """ - indices = self.tensor_ext_type.permutation() - if len(indices) != 0: - return indices - else: - return None - - @property - def uniform_dimensions(self): - """ - Indices of uniform dimensions. - """ - uniform_dimensions = self.tensor_ext_type.uniform_dimensions() - if len(uniform_dimensions) != 0: - return uniform_dimensions - else: - return None - - @property - def uniform_shape(self): - """ - Shape over dimensions that are guaranteed to be constant. - """ - uniform_shape = self.tensor_ext_type.uniform_shape() - if len(uniform_shape) != 0: - return uniform_shape - else: - return None - - def __arrow_ext_serialize__(self): - """ - Serialized representation of metadata to reconstruct the type object. - """ - return self.tensor_ext_type.Serialize() - - @classmethod - def __arrow_ext_deserialize__(self, storage_type, serialized): - """ - Return an VariableShapeTensor type instance from the storage type and serialized - metadata. - """ - return self.tensor_ext_type.Deserialize(storage_type, serialized) - - def __arrow_ext_class__(self): - return VariableShapeTensorArray - - def __reduce__(self): - return variable_shape_tensor, (self.value_type, self.ndim, - self.dim_names, self.permutation, - self.uniform_dimensions) - cdef class FixedShapeTensorType(BaseExtensionType): """ Concrete class for fixed shape tensor extension type. 
@@ -4922,129 +4817,6 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N return out -def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation=None, - uniform_dimensions=None, uniform_shape=None): - """ - Create instance of variable shape tensor extension type with number of - dimensions and optional names of tensor dimensions and indices of the - desired logical ordering of dimensions. - - Parameters - ---------- - value_type : DataType - Data type of individual tensor elements. - ndim : integer - The number of dimensions of the contained tensors. - dim_names : tuple or list of strings, default None - Explicit names to tensor dimensions. - permutation : tuple or list integers, default None - Indices of the desired ordering of the original dimensions. - The indices contain a permutation of the values ``[0, 1, .., N-1]`` where - N is the number of dimensions. The permutation indicates which dimension - of the logical layout corresponds to which dimension of the physical tensor. - For more information on this parameter see - :ref:`fixed_shape_tensor_extension`. - uniform_dimensions : tuple or list of integers, default None - Indices of the dimensions that are guaranteed to remain constant over the - whole array. The indices contain a subset of the values ``[0, 1, .., N-1]`` - where N is the number of dimensions. - uniform_shape : tuple or list of integers, default None - Shape over dimensions that are guaranteed to stay constant over all tensors - in the array if all their ragged dimensions sizes were replaced by 0. - An array containing tensor with shape (2, 3, 4) and uniform dimensions - (0, 2) would have uniform shape (2, 0, 4). 
- - Examples - -------- - Create an instance of variable shape tensor extension type: - - >>> import pyarrow as pa - >>> tensor_type = pa.variable_shape_tensor(pa.int32(), 2) - >>> tensor_type - VariableShapeTensorType(extension) - - Inspect the data type: - - >>> tensor_type.value_type - DataType(int32) - >>> tensor_type.ndim - 2 - - Create a table with variable shape tensor extension array: - - >>> fields = [pa.field("shape", pa.list_(pa.uint32(), 2)), pa.field("data", pa.list_(pa.int32()))] - >>> storage = pa.array([([2, 3], [1, 2, 3, 4, 5, 6]), ([1, 2], [7, 8])], type=pa.struct(fields)) - >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage) - >>> pa.table([tensor], names=["tensor_array"]) - pyarrow.Table - tensor_array: extension - ---- - tensor_array: [ -- is_valid: all not null - -- child 0 type: fixed_size_list[2] - [[2,3],[1,2]] - -- child 1 type: list - [[1,2,3,4,5,6],[7,8]]] - - Create an instance of variable shape tensor extension type with names - of tensor dimensions: - - >>> tensor_type = pa.variable_shape_tensor(pa.int8(), 3, - ... dim_names=['C', 'H', 'W']) - >>> tensor_type.dim_names - ['C', 'H', 'W'] - - Create an instance of variable shape tensor extension type with - permutation: - - >>> tensor_type = pa.variable_shape_tensor(pa.int8(), 3, - ... 
permutation=[0, 2, 1]) - >>> tensor_type.permutation - [0, 2, 1] - - Returns - ------- - type : VariableShapeTensorType - """ - - cdef: - uint32_t c_ndim - vector[int64_t] c_permutation - vector[c_string] c_dim_names - vector[int64_t] c_uniform_dimensions - vector[int64_t] c_uniform_shape - shared_ptr[CDataType] c_tensor_ext_type - - assert value_type is not None - assert ndim is not None - - c_ndim = ndim - - if permutation is not None: - for i in permutation: - c_permutation.push_back(i) - - if dim_names is not None: - for x in dim_names: - c_dim_names.push_back(tobytes(x)) - - if uniform_dimensions is not None: - for i in uniform_dimensions: - c_uniform_dimensions.push_back(i) - - if uniform_shape is not None: - for i in uniform_shape: - c_uniform_shape.push_back(i) - - cdef VariableShapeTensorType out = VariableShapeTensorType.__new__(VariableShapeTensorType) - - c_tensor_ext_type = GetResultValue(CVariableShapeTensorType.Make( - value_type.sp_type, c_ndim, c_permutation, c_dim_names, c_uniform_dimensions, c_uniform_shape)) - - out.init(c_tensor_ext_type) - - return out - - cdef dict _type_aliases = { 'null': null, 'bool': bool_, From ff603493320750effd8bde18925482c99998c8c9 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 5 Oct 2023 14:06:50 +0200 Subject: [PATCH 13/18] Simplify uniform_dimensions/uniform_shape --- docs/source/format/CanonicalExtensions.rst | 39 ++++++++++------------ 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 127943a0e68..9f3c605b743 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -193,29 +193,26 @@ Variable shape tensor When logical and physical layout are equal, the permutation will always be ([0, 1, .., N-1]) and can therefore be left out. - * **uniform_dimensions** = indices of dimensions whose sizes are - guaranteed to remain constant. 
Indices are a subset of all possible - dimension indices ([0, 1, .., N-1]). - The uniform dimensions must still be represented in the `shape` field, - and must always be the same value for all tensors in the array -- this - allows code to interpret the tensor correctly without accounting for + * **uniform_shape** = sizes of individual tensors dimensions are + guaranteed to stay constant in uniform dimensions and can vary in + non-uniform dimensions. This holds over all tensors in the array. + Sizes in uniform dimensions are represented with int32 values, while + sizes of the non-uniform dimensions are not known in advance and are + represented with 0s. If ``uniform_shape`` is not provided it is assumed + that all dimensions are non-uniform. + An array containing a tensor with shape (2, 3, 4) and whose first and + last dimensions are uniform would have ``uniform_shape`` (2, 0, 4). + This allows for interpreting the tensor correctly without accounting for uniform dimensions while still permitting optional optimizations that - take advantage of the uniformity. uniform_dimensions can be left out, - in which case it is assumed that all dimensions might be variable. - - * **uniform_shape** = shape of the dimensions that are guaranteed to stay - constant over all tensors in the array, with the shape of the ragged dimensions - set to 0. - An array containing tensor with shape (2, 3, 4) and uniform dimensions - (0, 2) would have uniform shape (2, 0, 4). + take advantage of the uniformity. * Description of the serialization: - The metadata must be a valid JSON object, that optionally includes - dimension names with keys **"dim_names"**, ordering of - dimensions with key **"permutation"**, indices of dimensions whose sizes - are guaranteed to remain constant with key **"uniform_dimensions"** and - shape of those dimensions with key **"uniform_shape"**. 
+ The metadata must be a valid JSON object that optionally includes + dimension names with keys **"dim_names"** and ordering of dimensions + with key **"permutation"**. + Shapes of tensors can be defined in a subset of dimensions by providing + key **"uniform_shape"**. Minimal metadata is an empty JSON object. - Example of minimal metadata is: @@ -226,10 +223,10 @@ Variable shape tensor ``{ "dim_names": ["C", "H", "W"] }`` - - Example with ``uniform_dimensions`` metadata for a set of color images + - Example with ``uniform_shape`` metadata for a set of color images with variable width: - ``{ "dim_names": ["H", "W", "C"], "uniform_dimensions": [1] }`` + ``{ "dim_names": ["H", "W", "C"], "uniform_shape": [400, 0, 3] }`` - Example of permuted 3-dimensional tensor: From 78c6bd4b27f6992eb6d9bf66dfb7f19e3f254e8f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 5 Oct 2023 15:45:57 +0200 Subject: [PATCH 14/18] Apply suggestions from code review Co-authored-by: Antoine Pitrou --- docs/source/format/CanonicalExtensions.rst | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 9f3c605b743..bc029efcdc4 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -159,8 +159,9 @@ Variable shape tensor is composed of **data** and **shape** fields describing a single tensor per row: - * **data** is a ``List`` holding tensor elements of a single tensor. - Data type of the list elements is uniform across the entire column. + * **data** is a ``List`` holding tensor elements (each list element is + a single tensor). The List's value type is the value type of the tensor, + such as an integer or floating-point type. * **shape** is a ``FixedSizeList[ndim]`` of the tensor shape where the size of the list ``ndim`` is equal to the number of dimensions of the tensor. 
@@ -219,12 +220,14 @@ Variable shape tensor ``{}`` - - Example with ``dim_names`` metadata for NCHW ordered data: + - Example with ``dim_names`` metadata for NCHW ordered data (note that the first + logical dimension, ``N``, is mapped to the **data** List array: each element in the List + is a CHW tensor and the List of tensors implicitly constitutes a single NCHW tensor): ``{ "dim_names": ["C", "H", "W"] }`` - Example with ``uniform_shape`` metadata for a set of color images - with variable width: + with fixed height, variable width and three color channels: ``{ "dim_names": ["H", "W", "C"], "uniform_shape": [400, 0, 3] }`` @@ -232,9 +235,9 @@ Variable shape tensor ``{ "permutation": [2, 0, 1] }`` - This is the physical layout shape and the shape of the logical - layout would given an individual tensor of shape [100, 200, 500] - be ``[500, 100, 200]``. + For example, if the physical **shape** of an individual tensor + is ``[100, 200, 500]``, this permutation would denote a logical shape + of ``[500, 100, 200]``. .. note:: @@ -248,8 +251,9 @@ Variable shape tensor This means the logical tensor has names [z, x, y] and shape [30, 10, 20]. - Elements in a variable shape tensor extension array are stored - in row-major/C-contiguous order. +.. note:: + Values inside each **data** tensor element are stored in row-major/C-contiguous + order according to the corresponding **shape**. 
========================= Community Extension Types From 90668e06efcc41c911010f43c2b43b4c8ca18fa8 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 5 Oct 2023 15:47:02 +0200 Subject: [PATCH 15/18] Update docs/source/format/CanonicalExtensions.rst Co-authored-by: Antoine Pitrou --- docs/source/format/CanonicalExtensions.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index bc029efcdc4..85e08a23591 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -244,7 +244,7 @@ Variable shape tensor With the exception of ``permutation``, the parameters and storage of VariableShapeTensor relate to the *physical* storage of the tensor. - For example, consider a tensor with: + For example, consider a tensor with:: shape = [10, 20, 30] dim_names = [x, y, z] permutations = [2, 0, 1] From 09fd14f55bc334913e8605c0bd4ee34eacae41ef Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 5 Oct 2023 17:09:16 +0200 Subject: [PATCH 16/18] Update docs/source/format/CanonicalExtensions.rst Co-authored-by: Joris Van den Bossche --- docs/source/format/CanonicalExtensions.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 85e08a23591..2e1dc85d2a3 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -194,7 +194,7 @@ Variable shape tensor When logical and physical layout are equal, the permutation will always be ([0, 1, .., N-1]) and can therefore be left out. - * **uniform_shape** = sizes of individual tensors dimensions are + * **uniform_shape** = sizes of individual tensor's dimensions which are guaranteed to stay constant in uniform dimensions and can vary in non-uniform dimensions. This holds over all tensors in the array. 
Sizes in uniform dimensions are represented with int32 values, while From 8b80ced526980a8b678161d178c9cc7b7208f726 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 5 Oct 2023 17:10:40 +0200 Subject: [PATCH 17/18] Review feedback --- docs/source/format/CanonicalExtensions.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 2e1dc85d2a3..71faa74f424 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -199,10 +199,10 @@ Variable shape tensor non-uniform dimensions. This holds over all tensors in the array. Sizes in uniform dimensions are represented with int32 values, while sizes of the non-uniform dimensions are not known in advance and are - represented with 0s. If ``uniform_shape`` is not provided it is assumed + represented with null. If ``uniform_shape`` is not provided it is assumed that all dimensions are non-uniform. An array containing a tensor with shape (2, 3, 4) and whose first and - last dimensions are uniform would have ``uniform_shape`` (2, 0, 4). + last dimensions are uniform would have ``uniform_shape`` (2, null, 4). This allows for interpreting the tensor correctly without accounting for uniform dimensions while still permitting optional optimizations that take advantage of the uniformity. 
@@ -229,7 +229,7 @@ Variable shape tensor - Example with ``uniform_shape`` metadata for a set of color images with fixed height, variable width and three color channels: - ``{ "dim_names": ["H", "W", "C"], "uniform_shape": [400, 0, 3] }`` + ``{ "dim_names": ["H", "W", "C"], "uniform_shape": [400, null, 3] }`` - Example of permuted 3-dimensional tensor: From 23270854f923e1833457d8f889d011d78ef69aa1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 Oct 2023 15:44:04 +0200 Subject: [PATCH 18/18] Update docs/source/format/CanonicalExtensions.rst Co-authored-by: Rok Mihevc --- docs/source/format/CanonicalExtensions.rst | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 71faa74f424..084b6e62895 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -214,11 +214,7 @@ Variable shape tensor with key **"permutation"**. Shapes of tensors can be defined in a subset of dimensions by providing key **"uniform_shape"**. - Minimal metadata is an empty JSON object. - - - Example of minimal metadata is: - - ``{}`` + Minimal metadata is an empty string. - Example with ``dim_names`` metadata for NCHW ordered data (note that the first logical dimension, ``N``, is mapped to the **data** List array: each element in the List