diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 6b0ac8c23c7..a90e20872cf 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -910,6 +910,8 @@ if(ARROW_JSON)
   arrow_add_object_library(ARROW_JSON
                            extension/fixed_shape_tensor.cc
                            extension/opaque.cc
+                           extension/tensor_internal.cc
+                           extension/variable_shape_tensor.cc
                            json/options.cc
                            json/chunked_builder.cc
                            json/chunker.cc
diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt
index 065ea3f1ddb..5787ad9c38c 100644
--- a/cpp/src/arrow/extension/CMakeLists.txt
+++ b/cpp/src/arrow/extension/CMakeLists.txt
@@ -18,7 +18,7 @@
 set(CANONICAL_EXTENSION_TESTS bool8_test.cc uuid_test.cc)

 if(ARROW_JSON)
-  list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc opaque_test.cc)
+  list(APPEND CANONICAL_EXTENSION_TESTS tensor_extension_array_test.cc opaque_test.cc)
 endif()

 add_arrow_test(test
diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc
index 944a134a707..8e03525d1e1 100644
--- a/cpp/src/arrow/extension/fixed_shape_tensor.cc
+++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc
@@ -37,52 +37,7 @@

 namespace rj = arrow::rapidjson;

-namespace arrow {
-
-namespace extension {
-
-namespace {
-
-Status ComputeStrides(const FixedWidthType& type, const std::vector<int64_t>& shape,
-                      const std::vector<int64_t>& permutation,
-                      std::vector<int64_t>* strides) {
-  if (permutation.empty()) {
-    return internal::ComputeRowMajorStrides(type, shape, strides);
-  }
-
-  const int byte_width = type.byte_width();
-
-  int64_t remaining = 0;
-  if (!shape.empty() && shape.front() > 0) {
-    remaining = byte_width;
-    for (auto i : permutation) {
-      if (i > 0) {
-        if (internal::MultiplyWithOverflow(remaining, shape[i], &remaining)) {
-          return Status::Invalid(
-              "Strides computed from shape would not fit in 64-bit integer");
-        }
-      }
-    }
-  }
-
-  if (remaining == 0) {
-    strides->assign(shape.size(), byte_width);
-    return Status::OK();
-  }
-
-  strides->push_back(remaining);
-  for (auto i : permutation) {
-    if (i > 0) {
-      remaining /= shape[i];
-      strides->push_back(remaining);
-    }
-  }
-  internal::Permute(permutation, strides);
-
-  return Status::OK();
-}
-
-}  // namespace
+namespace arrow::extension {

 bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const {
   if (extension_name() != other.extension_name()) {
@@ -237,7 +192,8 @@ Result<std::shared_ptr<Tensor>> FixedShapeTensorType::MakeTensor(
   }

   std::vector<int64_t> strides;
-  RETURN_NOT_OK(ComputeStrides(value_type, shape, permutation, &strides));
+  RETURN_NOT_OK(
+      internal::ComputeStrides(ext_type.value_type(), shape, permutation, &strides));
   const auto start_position = array->offset() * byte_width;
   const auto size = std::accumulate(shape.begin(), shape.end(), static_cast<int64_t>(1),
                                     std::multiplies<>());
@@ -376,9 +332,8 @@ const Result<std::shared_ptr<Tensor>> FixedShapeTensorArray::ToTensor() const {
   internal::Permute(permutation, &shape);

   std::vector<int64_t> tensor_strides;
-  const auto* fw_value_type = internal::checked_cast<const FixedWidthType*>(value_type.get());
   ARROW_RETURN_NOT_OK(
-      ComputeStrides(*fw_value_type, shape, permutation, &tensor_strides));
+      internal::ComputeStrides(value_type, shape, permutation, &tensor_strides));

   const auto& raw_buffer = this->storage()->data()->child_data[0]->buffers[1];
   ARROW_ASSIGN_OR_RAISE(
@@ -412,10 +367,9 @@ Result<std::shared_ptr<DataType>> FixedShapeTensorType::Make(

 const std::vector<int64_t>& FixedShapeTensorType::strides() {
   if (strides_.empty()) {
-    auto value_type = internal::checked_cast<FixedWidthType*>(this->value_type_.get());
     std::vector<int64_t> tensor_strides;
-    ARROW_CHECK_OK(
-        ComputeStrides(*value_type, this->shape(), this->permutation(), &tensor_strides));
+    ARROW_CHECK_OK(internal::ComputeStrides(this->value_type_, this->shape(),
+                                            this->permutation(), &tensor_strides));
     strides_ = tensor_strides;
   }
   return strides_;
@@ -430,5 +384,4 @@ std::shared_ptr<DataType> fixed_shape_tensor(const std::shared_ptr<DataType>& va
   return maybe_type.MoveValueUnsafe();
 }

-}  // namespace extension
-}  // namespace arrow
+}  // namespace arrow::extension
diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h
index 80a602021c6..5098da0405f 100644
--- a/cpp/src/arrow/extension/fixed_shape_tensor.h
+++ b/cpp/src/arrow/extension/fixed_shape_tensor.h
@@ -19,8 +19,7 @@

 #include "arrow/extension_type.h"

-namespace arrow {
-namespace extension {
+namespace arrow::extension {

 class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray {
  public:
@@ -126,5 +125,4 @@ ARROW_EXPORT std::shared_ptr<DataType> fixed_shape_tensor(
     const std::vector<int64_t>& permutation = {},
     const std::vector<std::string>& dim_names = {});

-}  // namespace extension
-}  // namespace arrow
+}  // namespace arrow::extension
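Review note (illustration, not part of the patch): the refactor above replaces the file-local ComputeStrides helper, which took a const FixedWidthType&, with a shared arrow::internal::ComputeStrides that takes a std::shared_ptr<DataType>, so both tensor extension types can call it without a checked_cast at each call site. A minimal sketch of the new call pattern, assuming this patch is applied (Example is a hypothetical function):

    #include <cstdint>
    #include <vector>

    #include "arrow/extension/tensor_internal.h"
    #include "arrow/status.h"
    #include "arrow/type.h"

    // Row-major strides for an int64 tensor of shape {3, 4}: the byte width (8)
    // is multiplied through the trailing dimensions, giving {32, 8}.
    arrow::Status Example() {
      std::vector<int64_t> strides;
      ARROW_RETURN_NOT_OK(arrow::internal::ComputeStrides(
          arrow::int64(), /*shape=*/{3, 4}, /*permutation=*/{}, &strides));
      // strides == {32, 8}
      return arrow::Status::OK();
    }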
diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc
similarity index 66%
rename from cpp/src/arrow/extension/fixed_shape_tensor_test.cc
rename to cpp/src/arrow/extension/tensor_extension_array_test.cc
index 842a78e1a4f..d8e446ea61d 100644
--- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc
+++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc
@@ -16,6 +16,7 @@
 // under the License.

 #include "arrow/extension/fixed_shape_tensor.h"
+#include "arrow/extension/variable_shape_tensor.h"

 #include "arrow/testing/matchers.h"
@@ -33,11 +34,14 @@ namespace arrow {

 using FixedShapeTensorType = extension::FixedShapeTensorType;
-using arrow::ipc::test::RoundtripBatch;
 using extension::fixed_shape_tensor;
 using extension::FixedShapeTensorArray;

-class TestExtensionType : public ::testing::Test {
+using VariableShapeTensorType = extension::VariableShapeTensorType;
+using extension::variable_shape_tensor;
+using extension::VariableShapeTensorArray;
+
+class TestFixedShapeTensorType : public ::testing::Test {
  public:
   void SetUp() override {
     shape_ = {3, 3, 4};
@@ -72,13 +76,13 @@ class TestExtensionType : public ::testing::Test {
   std::string serialized_;
 };

-TEST_F(TestExtensionType, CheckDummyRegistration) {
+TEST_F(TestFixedShapeTensorType, CheckDummyRegistration) {
   // We need a registered dummy type at runtime to allow for IPC deserialization
   auto registered_type = GetExtensionType("arrow.fixed_shape_tensor");
-  ASSERT_TRUE(registered_type->type_id == Type::EXTENSION);
+  ASSERT_EQ(registered_type->id(), Type::EXTENSION);
 }

-TEST_F(TestExtensionType, CreateExtensionType) {
+TEST_F(TestFixedShapeTensorType, CreateExtensionType) {
   auto exact_ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(ext_type_);

   // Test ExtensionType methods
@@ -118,7 +122,7 @@ TEST_F(TestExtensionType, CreateExtensionType) {
                       FixedShapeTensorType::Make(value_type_, {1, 2, 3}, {0, 1, 1}));
 }

-TEST_F(TestExtensionType, EqualsCases) {
+TEST_F(TestFixedShapeTensorType, EqualsCases) {
   auto ext_type_permutation_1 = fixed_shape_tensor(int64(), {3, 4}, {0, 1}, {"x", "y"});
   auto ext_type_permutation_2 = fixed_shape_tensor(int64(), {3, 4}, {1, 0}, {"x", "y"});
   auto ext_type_no_permutation = fixed_shape_tensor(int64(), {3, 4}, {}, {"x", "y"});
@@ -140,7 +144,7 @@ TEST_F(TestExtensionType, EqualsCases) {
   ASSERT_FALSE(ext_type_permutation_2->Equals(ext_type_permutation_1));
 }

-TEST_F(TestExtensionType, CreateFromArray) {
+TEST_F(TestFixedShapeTensorType, CreateFromArray) {
   auto exact_ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(ext_type_);

   std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, Buffer::Wrap(values_)};
@@ -153,23 +157,23 @@ TEST_F(TestExtensionType, CreateFromArray) {
 }

 void CheckSerializationRoundtrip(const std::shared_ptr<DataType>& ext_type) {
-  auto fst_type = internal::checked_pointer_cast<FixedShapeTensorType>(ext_type);
-  auto serialized = fst_type->Serialize();
+  auto type = internal::checked_pointer_cast<ExtensionType>(ext_type);
+  auto serialized = type->Serialize();
   ASSERT_OK_AND_ASSIGN(auto deserialized,
-                       fst_type->Deserialize(fst_type->storage_type(), serialized));
-  ASSERT_TRUE(fst_type->Equals(*deserialized));
+                       type->Deserialize(type->storage_type(), serialized));
+  ASSERT_TRUE(type->Equals(*deserialized));
 }

-void CheckDeserializationRaises(const std::shared_ptr<DataType>& storage_type,
+void CheckDeserializationRaises(const std::shared_ptr<DataType>& extension_type,
+                                const std::shared_ptr<DataType>& storage_type,
                                 const std::string& serialized,
                                 const std::string& expected_message) {
-  auto fst_type = internal::checked_pointer_cast<FixedShapeTensorType>(
-      fixed_shape_tensor(int64(), {3, 4}));
+  auto ext_type = internal::checked_pointer_cast<ExtensionType>(extension_type);
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr(expected_message),
-                                  fst_type->Deserialize(storage_type, serialized));
+                                  ext_type->Deserialize(storage_type, serialized));
 }

-TEST_F(TestExtensionType, MetadataSerializationRoundtrip) {
+TEST_F(TestFixedShapeTensorType, MetadataSerializationRoundtrip) {
   CheckSerializationRoundtrip(ext_type_);
   CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {}, {}, {}));
   CheckSerializationRoundtrip(fixed_shape_tensor(value_type_, {0}, {}, {}));
@@ -180,19 +184,21 @@ TEST_F(TestExtensionType, MetadataSerializationRoundtrip) {
       fixed_shape_tensor(value_type_, {256, 256, 3}, {2, 0, 1}, {"C", "H", "W"}));

   auto storage_type = fixed_size_list(int64(), 12);
-  CheckDeserializationRaises(boolean(), R"({"shape":[3,4]})",
+  CheckDeserializationRaises(ext_type_, boolean(), R"({"shape":[3,4]})",
                              "Expected FixedSizeList storage type, got bool");
-  CheckDeserializationRaises(storage_type, R"({"dim_names":["x","y"]})",
+  CheckDeserializationRaises(ext_type_, storage_type, R"({"dim_names":["x","y"]})",
                              "Invalid serialized JSON data");
-  CheckDeserializationRaises(storage_type, R"({"shape":(3,4)})",
+  CheckDeserializationRaises(ext_type_, storage_type, R"({"shape":(3,4)})",
                              "Invalid serialized JSON data");
-  CheckDeserializationRaises(storage_type, R"({"shape":[3,4],"permutation":[1,0,2]})",
+  CheckDeserializationRaises(ext_type_, storage_type,
+                             R"({"shape":[3,4],"permutation":[1,0,2]})",
                              "Invalid permutation");
-  CheckDeserializationRaises(storage_type, R"({"shape":[3],"dim_names":["x","y"]})",
+  CheckDeserializationRaises(ext_type_, storage_type,
+                             R"({"shape":[3],"dim_names":["x","y"]})",
                              "Invalid dim_names");
 }

-TEST_F(TestExtensionType, RoundtripBatch) {
+TEST_F(TestFixedShapeTensorType, RoundtripBatch) {
   auto exact_ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(ext_type_);

   std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, Buffer::Wrap(values_)};
@@ -220,7 +226,7 @@ TEST_F(TestExtensionType, RoundtripBatch) {
   CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true);
 }

-TEST_F(TestExtensionType, CreateFromTensor) {
+TEST_F(TestFixedShapeTensorType, CreateFromTensor) {
   std::vector<int64_t> column_major_strides = {8, 24, 72};
   std::vector<int64_t> neither_major_strides = {96, 8, 32};

@@ -298,7 +304,7 @@ void CheckFromTensorType(const std::shared_ptr<Tensor>& tensor,
   ASSERT_TRUE(generated_ext_type->Equals(ext_type));
 }

-TEST_F(TestExtensionType, TestFromTensorType) {
+TEST_F(TestFixedShapeTensorType, TestFromTensorType) {
   auto values = Buffer::Wrap(values_);
   auto shapes =
       std::vector<std::vector<int64_t>>{{3, 3, 4}, {3, 3, 4}, {3, 4, 3}, {3, 4, 3}};
@@ -357,7 +363,7 @@ void CheckToTensor(const std::vector<T>& values, const std::shared_ptr<DataType>
   ASSERT_TRUE(actual_tensor->Equals(*expected_tensor));
 }

-TEST_F(TestExtensionType, ToTensor) {
+TEST_F(TestFixedShapeTensorType, ToTensor) {
   std::vector<float_t> float_values = {0,  1,  2,  3,  4,  5,  6,  7,  8,
                                        9,  10, 11, 12, 13, 14, 15, 16, 17,
                                        18, 19, 20, 21, 22, 23, 24, 25, 26,
                                        27, 28, 29, 30, 31, 32, 33, 34, 35};
@@ -408,7 +414,7 @@ void CheckTensorRoundtrip(const std::shared_ptr<Tensor>& tensor) {
   ASSERT_TRUE(tensor->Equals(*tensor_from_array));
 }

-TEST_F(TestExtensionType, RoundtripTensor) {
+TEST_F(TestFixedShapeTensorType, RoundtripTensor) {
   auto values = Buffer::Wrap(values_);

   auto shapes = std::vector<std::vector<int64_t>>{
@@ -429,7 +435,7 @@ TEST_F(TestExtensionType, RoundtripTensor) {
   }
 }

-TEST_F(TestExtensionType, SliceTensor) {
+TEST_F(TestFixedShapeTensorType, SliceTensor) {
   ASSERT_OK_AND_ASSIGN(auto tensor,
                        Tensor::Make(value_type_, Buffer::Wrap(values_), shape_));
   ASSERT_OK_AND_ASSIGN(
@@ -456,7 +462,7 @@ TEST_F(TestExtensionType, SliceTensor) {
   ASSERT_EQ(sliced->length(), partial->length());
 }

-TEST_F(TestExtensionType, RoundtripBatchFromTensor) {
+TEST_F(TestFixedShapeTensorType, RoundtripBatchFromTensor) {
   auto exact_ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(ext_type_);
   ASSERT_OK_AND_ASSIGN(auto tensor, Tensor::Make(value_type_, Buffer::Wrap(values_),
                                                  shape_, {}, {"n", "x", "y"}));
@@ -473,7 +479,7 @@ TEST_F(TestExtensionType, RoundtripBatchFromTensor) {
   CompareBatch(*batch, *read_batch, /*compare_metadata=*/true);
 }

-TEST_F(TestExtensionType, ComputeStrides) {
+TEST_F(TestFixedShapeTensorType, ComputeStrides) {
   auto exact_ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(ext_type_);

   auto ext_type_1 = internal::checked_pointer_cast<FixedShapeTensorType>(
@@ -507,7 +513,7 @@ TEST_F(TestExtensionType, ComputeStrides) {
   ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})");
 }

-TEST_F(TestExtensionType, ToString) {
+TEST_F(TestFixedShapeTensorType, FixedShapeTensorToString) {
   auto exact_ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(ext_type_);

   auto ext_type_1 = internal::checked_pointer_cast<FixedShapeTensorType>(
@@ -535,7 +541,7 @@ TEST_F(TestExtensionType, ToString) {
   ASSERT_EQ(expected_3, result_3);
 }

-TEST_F(TestExtensionType, GetTensor) {
+TEST_F(TestFixedShapeTensorType, GetTensor) {
   auto arr = ArrayFromJSON(element_type_,
                            "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],"
                            "[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]");
@@ -627,4 +633,284 @@ TEST_F(TestExtensionType, GetTensor) {
                        exact_ext_type->MakeTensor(ext_scalar));
 }

+class TestVariableShapeTensorType : public ::testing::Test {
+ public:
+  void SetUp() override {
+    ndim_ = 3;
+    value_type_ = int64();
+    data_type_ = list(value_type_);
+    shape_type_ = fixed_size_list(int32(), ndim_);
+    permutation_ = {0, 1, 2};
+    dim_names_ = {"x", "y", "z"};
+    uniform_shape_ = {std::nullopt, std::optional<int64_t>(1), std::nullopt};
+    ext_type_ = internal::checked_pointer_cast<ExtensionType>(variable_shape_tensor(
+        value_type_, ndim_, permutation_, dim_names_, uniform_shape_));
+    values_ = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17,
+               18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35};
+    shapes_ = ArrayFromJSON(fixed_size_list(int32(), ndim_), "[[2,1,3],[2,1,2],[3,1,3]]");
+    data_ = ArrayFromJSON(list(value_type_),
+                          "[[0,1,2,3,4,5],[6,7,8,9],[10,11,12,13,14,15,16,17,18]]");
+    serialized_ =
+        R"({"permutation":[0,1,2],"dim_names":["x","y","z"],"uniform_shape":[null,1,null]})";
+    storage_arr_ = ArrayFromJSON(
+        ext_type_->storage_type(),
+        R"([[[0,1,2,3,4,5],[2,3,1]],[[6,7,8,9],[1,2,2]],[[10,11,12,13,14,15,16,17,18],[3,1,3]]])");
+    ext_arr_ = internal::checked_pointer_cast<ExtensionArray>(
+        ExtensionType::WrapArray(ext_type_, storage_arr_));
+  }
+
+ protected:
+  int32_t ndim_;
+  std::shared_ptr<DataType> value_type_;
+  std::shared_ptr<DataType> data_type_;
+  std::shared_ptr<DataType> shape_type_;
+  std::vector<int64_t> permutation_;
+  std::vector<std::optional<int64_t>> uniform_shape_;
+  std::vector<std::string> dim_names_;
+  std::shared_ptr<ExtensionType> ext_type_;
+  std::vector<int64_t> values_;
+  std::shared_ptr<Array> shapes_;
+  std::shared_ptr<Array> data_;
+  std::string serialized_;
+  std::shared_ptr<Array> storage_arr_;
+  std::shared_ptr<ExtensionArray> ext_arr_;
+};
+
+TEST_F(TestVariableShapeTensorType, CheckDummyRegistration) {
+  // We need a registered dummy type at runtime to allow for IPC deserialization
+  auto registered_type = GetExtensionType("arrow.variable_shape_tensor");
+  ASSERT_EQ(registered_type->id(), Type::EXTENSION);
+}
+
+TEST_F(TestVariableShapeTensorType, CreateExtensionType) {
+  auto exact_ext_type =
+      internal::checked_pointer_cast<VariableShapeTensorType>(ext_type_);
+
+  // Test ExtensionType methods
+  ASSERT_EQ(ext_type_->extension_name(), "arrow.variable_shape_tensor");
+  ASSERT_TRUE(ext_type_->Equals(*exact_ext_type));
+  auto expected_type =
+      struct_({::arrow::field("data", list(value_type_)),
+               ::arrow::field("shape", fixed_size_list(int32(), ndim_))});
+
+  ASSERT_TRUE(ext_type_->storage_type()->Equals(*expected_type));
+  ASSERT_EQ(ext_type_->Serialize(), serialized_);
+  ASSERT_OK_AND_ASSIGN(auto ds,
+                       ext_type_->Deserialize(ext_type_->storage_type(), serialized_));
+  auto deserialized = internal::checked_pointer_cast<VariableShapeTensorType>(ds);
+  ASSERT_TRUE(deserialized->Equals(*exact_ext_type));
+  ASSERT_TRUE(deserialized->Equals(*ext_type_));
+
+  // Test VariableShapeTensorType methods
+  ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION);
+  ASSERT_EQ(exact_ext_type->ndim(), ndim_);
+  ASSERT_EQ(exact_ext_type->value_type(), value_type_);
+  ASSERT_EQ(exact_ext_type->permutation(), permutation_);
+  ASSERT_EQ(exact_ext_type->dim_names(), dim_names_);
+
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      testing::HasSubstr("Invalid: permutation size must match ndim. Expected: 3 Got: 1"),
+      VariableShapeTensorType::Make(value_type_, ndim_, {0}));
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid, testing::HasSubstr("Invalid: dim_names size must match ndim."),
+      VariableShapeTensorType::Make(value_type_, ndim_, {}, {"x"}));
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      testing::HasSubstr("Invalid: Permutation indices for 3 dimensional tensors must be "
+                         "unique and within [0, 2] range. Got: [2,0,0]"),
+      VariableShapeTensorType::Make(value_type_, 3, {2, 0, 0}, {"C", "H", "W"}));
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      testing::HasSubstr("Invalid: Permutation indices for 3 dimensional tensors must be "
+                         "unique and within [0, 2] range. Got: [1,2,3]"),
+      VariableShapeTensorType::Make(value_type_, 3, {1, 2, 3}, {"C", "H", "W"}));
+}
+
+TEST_F(TestVariableShapeTensorType, EqualsCases) {
+  auto ext_type_permutation_1 = variable_shape_tensor(int64(), 2, {0, 1}, {"x", "y"});
+  auto ext_type_permutation_2 = variable_shape_tensor(int64(), 2, {1, 0}, {"x", "y"});
+  auto ext_type_no_permutation = variable_shape_tensor(int64(), 2, {}, {"x", "y"});
+
+  ASSERT_TRUE(ext_type_permutation_1->Equals(ext_type_permutation_1));
+
+  ASSERT_FALSE(
+      variable_shape_tensor(int32(), 2, {}, {"x", "y"})->Equals(ext_type_no_permutation));
+  ASSERT_FALSE(variable_shape_tensor(int64(), 2, {}, {})
+                   ->Equals(variable_shape_tensor(int64(), 3, {}, {})));
+  ASSERT_FALSE(
+      variable_shape_tensor(int64(), 2, {}, {"H", "W"})->Equals(ext_type_no_permutation));
+
+  ASSERT_TRUE(ext_type_no_permutation->Equals(ext_type_permutation_1));
+  ASSERT_TRUE(ext_type_permutation_1->Equals(ext_type_no_permutation));
+  ASSERT_FALSE(ext_type_no_permutation->Equals(ext_type_permutation_2));
+  ASSERT_FALSE(ext_type_permutation_2->Equals(ext_type_no_permutation));
+  ASSERT_FALSE(ext_type_permutation_1->Equals(ext_type_permutation_2));
+  ASSERT_FALSE(ext_type_permutation_2->Equals(ext_type_permutation_1));
+}
+
+TEST_F(TestVariableShapeTensorType, MetadataSerializationRoundtrip) {
+  CheckSerializationRoundtrip(ext_type_);
+  CheckSerializationRoundtrip(
+      variable_shape_tensor(value_type_, 3, {1, 2, 0}, {"x", "y", "z"}));
+  CheckSerializationRoundtrip(variable_shape_tensor(value_type_, 0, {}, {}));
+  CheckSerializationRoundtrip(variable_shape_tensor(value_type_, 1, {0}, {"x"}));
+  CheckSerializationRoundtrip(
+      variable_shape_tensor(value_type_, 3, {0, 1, 2}, {"H", "W", "C"}));
+  CheckSerializationRoundtrip(
+      variable_shape_tensor(value_type_, 3, {2, 0, 1}, {"C", "H", "W"}));
+  CheckSerializationRoundtrip(
+      variable_shape_tensor(value_type_, 3, {2, 0, 1}, {"C", "H", "W"}, {0, 1, 2}));
+
+  auto storage_type = ext_type_->storage_type();
+  CheckDeserializationRaises(ext_type_, boolean(), R"({"shape":[3,4]})",
+                             "Expected Struct storage type, got bool");
+  CheckDeserializationRaises(ext_type_, storage_type, R"({"shape":(3,4)})",
+                             "Invalid serialized JSON data");
+  CheckDeserializationRaises(ext_type_, storage_type, R"({"permutation":[1,0]})",
+                             "Invalid: permutation");
+  CheckDeserializationRaises(ext_type_, storage_type, R"({"dim_names":["x","y"]})",
+                             "Invalid: dim_names");
+}
+
+TEST_F(TestVariableShapeTensorType, RoundtripBatch) {
+  auto exact_ext_type =
+      internal::checked_pointer_cast<VariableShapeTensorType>(ext_type_);
+
+  // Pass extension array, expect getting back extension array
+  std::shared_ptr<RecordBatch> read_batch;
+  auto ext_field = field(/*name=*/"f0", /*type=*/ext_type_);
+  auto batch = RecordBatch::Make(schema({ext_field}), ext_arr_->length(), {ext_arr_});
+  RoundtripBatch(batch, &read_batch);
+  CompareBatch(*batch, *read_batch, /*compare_metadata=*/true);
+
+  // Pass extension metadata and storage array, expect getting back extension array
+  std::shared_ptr<RecordBatch> read_batch2;
+  auto ext_metadata =
+      key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()},
+                          {"ARROW:extension:metadata", serialized_}});
+  ext_field = field(/*name=*/"f0", /*type=*/ext_type_->storage_type(), /*nullable=*/true,
+                    /*metadata=*/ext_metadata);
+  auto batch2 = RecordBatch::Make(schema({ext_field}), ext_arr_->length(), {ext_arr_});
+  RoundtripBatch(batch2, &read_batch2);
+  CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true);
+}
+
+TEST_F(TestVariableShapeTensorType, ComputeStrides) {
+  auto shapes = ArrayFromJSON(shape_type_, "[[2,3,1],[2,1,2],[3,1,3],null]");
+  auto data = ArrayFromJSON(
+      data_type_, "[[1,1,2,3,4,5],[2,7,8,9],[10,11,12,13,14,15,16,17,18],null]");
+  std::vector<std::shared_ptr<Field>> fields = {field("data", data_type_),
+                                                field("shapes", shape_type_)};
+  ASSERT_OK_AND_ASSIGN(auto storage_arr, StructArray::Make({data, shapes}, fields));
+  auto ext_arr = ExtensionType::WrapArray(ext_type_, storage_arr);
+  auto exact_ext_type =
+      internal::checked_pointer_cast<VariableShapeTensorType>(ext_type_);
+  auto ext_array = std::static_pointer_cast<ExtensionArray>(ext_arr);
+
+  std::shared_ptr<Tensor> t, tensor;
+
+  ASSERT_OK_AND_ASSIGN(auto scalar, ext_array->GetScalar(0));
+  auto ext_scalar = internal::checked_pointer_cast<ExtensionScalar>(scalar);
+  ASSERT_OK_AND_ASSIGN(t, exact_ext_type->MakeTensor(ext_scalar));
+  ASSERT_EQ(t->shape(), (std::vector<int64_t>{2, 3, 1}));
+  ASSERT_EQ(t->strides(), (std::vector<int64_t>{24, 8, 8}));
+
+  std::vector<int64_t> strides = {sizeof(int64_t) * 3, sizeof(int64_t) * 1,
+                                  sizeof(int64_t) * 1};
+  tensor = TensorFromJSON(int64(), R"([1,1,2,3,4,5])", {2, 3, 1}, strides, dim_names_);
+
+  ASSERT_TRUE(tensor->Equals(*t));
+
+  ASSERT_OK_AND_ASSIGN(scalar, ext_array->GetScalar(1));
+  ext_scalar = internal::checked_pointer_cast<ExtensionScalar>(scalar);
+  ASSERT_OK_AND_ASSIGN(t, exact_ext_type->MakeTensor(ext_scalar));
+  ASSERT_EQ(t->shape(), (std::vector<int64_t>{2, 1, 2}));
+  ASSERT_EQ(t->strides(), (std::vector<int64_t>{16, 16, 8}));
+
+  ASSERT_OK_AND_ASSIGN(scalar, ext_array->GetScalar(2));
+  ext_scalar = internal::checked_pointer_cast<ExtensionScalar>(scalar);
+  ASSERT_OK_AND_ASSIGN(t, exact_ext_type->MakeTensor(ext_scalar));
+  ASSERT_EQ(t->shape(), (std::vector<int64_t>{3, 1, 3}));
+  ASSERT_EQ(t->strides(), (std::vector<int64_t>{24, 24, 8}));
+
+  strides = {sizeof(int64_t) * 3, sizeof(int64_t) * 3, sizeof(int64_t) * 1};
+  tensor = TensorFromJSON(int64(), R"([10,11,12,13,14,15,16,17,18])", {3, 1, 3}, strides,
+                          dim_names_);
+
+  ASSERT_EQ(tensor->strides(), t->strides());
+  ASSERT_EQ(tensor->shape(), t->shape());
+  ASSERT_EQ(tensor->dim_names(), t->dim_names());
+  ASSERT_EQ(tensor->type(), t->type());
+  ASSERT_EQ(tensor->is_contiguous(), t->is_contiguous());
+  ASSERT_EQ(tensor->is_column_major(), t->is_column_major());
+  ASSERT_TRUE(tensor->Equals(*t));
+
+  ASSERT_OK_AND_ASSIGN(auto sc, ext_arr->GetScalar(2));
+  auto s = internal::checked_pointer_cast<ExtensionScalar>(sc);
+  ASSERT_OK_AND_ASSIGN(t, exact_ext_type->MakeTensor(s));
+  ASSERT_EQ(tensor->strides(), t->strides());
+  ASSERT_EQ(tensor->shape(), t->shape());
+  ASSERT_EQ(tensor->dim_names(), t->dim_names());
+  ASSERT_EQ(tensor->type(), t->type());
+  ASSERT_EQ(tensor->is_contiguous(), t->is_contiguous());
+  ASSERT_EQ(tensor->is_column_major(), t->is_column_major());
+  ASSERT_TRUE(tensor->Equals(*t));
+
+  // Null value in VariableShapeTensorArray produces a tensor with shape {0, 0, 0}
+  strides = {sizeof(int64_t), sizeof(int64_t), sizeof(int64_t)};
+  tensor = TensorFromJSON(int64(), R"([10,11,12,13,14,15,16,17,18])", {0, 0, 0}, strides,
+                          dim_names_);
+
+  ASSERT_OK_AND_ASSIGN(sc, ext_arr->GetScalar(3));
+  ASSERT_OK_AND_ASSIGN(
+      t,
+      exact_ext_type->MakeTensor(internal::checked_pointer_cast<ExtensionScalar>(sc)));
+  ASSERT_EQ(tensor->strides(), t->strides());
+  ASSERT_EQ(tensor->shape(), t->shape());
+  ASSERT_EQ(tensor->dim_names(), t->dim_names());
+  ASSERT_EQ(tensor->type(), t->type());
+  ASSERT_EQ(tensor->is_contiguous(), t->is_contiguous());
+  ASSERT_EQ(tensor->is_column_major(), t->is_column_major());
+  ASSERT_TRUE(tensor->Equals(*t));
+}
+
+TEST_F(TestVariableShapeTensorType, ToString) {
+  auto exact_ext_type =
+      internal::checked_pointer_cast<VariableShapeTensorType>(ext_type_);
+
+  auto uniform_shape = std::vector<std::optional<int64_t>>{
+      std::nullopt, std::optional<int64_t>(1), std::nullopt};
+  auto ext_type_1 = internal::checked_pointer_cast<VariableShapeTensorType>(
+      variable_shape_tensor(int16(), 3));
+  auto ext_type_2 = internal::checked_pointer_cast<VariableShapeTensorType>(
+      variable_shape_tensor(int32(), 3, {1, 0, 2}));
+  auto ext_type_3 = internal::checked_pointer_cast<VariableShapeTensorType>(
+      variable_shape_tensor(int64(), 3, {}, {"C", "H", "W"}));
+  auto ext_type_4 = internal::checked_pointer_cast<VariableShapeTensorType>(
+      variable_shape_tensor(int64(), 3, {}, {}, uniform_shape));
+
+  std::string result_1 = ext_type_1->ToString();
+  std::string expected_1 =
+      "extension<arrow.variable_shape_tensor[value_type=int16, ndim=3]>";
+  ASSERT_EQ(expected_1, result_1);
+
+  std::string result_2 = ext_type_2->ToString();
+  std::string expected_2 =
+      "extension<arrow.variable_shape_tensor[value_type=int32, ndim=3, "
+      "permutation=[1,0,2]]>";
+  ASSERT_EQ(expected_2, result_2);
+
+  std::string result_3 = ext_type_3->ToString();
+  std::string expected_3 =
+      "extension<arrow.variable_shape_tensor[value_type=int64, ndim=3, "
+      "dim_names=[C,H,W]]>";
+  ASSERT_EQ(expected_3, result_3);
+
+  std::string result_4 = ext_type_4->ToString();
+  std::string expected_4 =
+      "extension<arrow.variable_shape_tensor[value_type=int64, ndim=3, "
+      "uniform_shape=[null,1,null]]>";
+  ASSERT_EQ(expected_4, result_4);
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/extension/tensor_internal.cc b/cpp/src/arrow/extension/tensor_internal.cc
new file mode 100644
index 00000000000..9fb8d983969
--- /dev/null
+++ b/cpp/src/arrow/extension/tensor_internal.cc
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/extension/tensor_internal.h"
+
+#include "arrow/tensor.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/int_util_overflow.h"
+#include "arrow/util/sort.h"
+
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/print.h"
+
+namespace arrow::internal {
+
+Status IsPermutationValid(const std::vector<int64_t>& permutation) {
+  const auto size = static_cast<int64_t>(permutation.size());
+  std::vector<uint8_t> dim_seen(size, 0);
+
+  for (const auto p : permutation) {
+    if (p < 0 || p >= size || dim_seen[p] != 0) {
+      return Status::Invalid(
+          "Permutation indices for ", size,
+          " dimensional tensors must be unique and within [0, ", size - 1,
+          "] range. Got: ", ::arrow::internal::PrintVector{permutation, ","});
+    }
+    dim_seen[p] = 1;
+  }
+  return Status::OK();
+}
+
+Status ComputeStrides(const std::shared_ptr<DataType>& value_type,
+                      const std::vector<int64_t>& shape,
+                      const std::vector<int64_t>& permutation,
+                      std::vector<int64_t>* strides) {
+  auto fixed_width_type = internal::checked_pointer_cast<FixedWidthType>(value_type);
+  if (permutation.empty()) {
+    return internal::ComputeRowMajorStrides(*fixed_width_type.get(), shape, strides);
+  }
+
+  const int byte_width = value_type->byte_width();
+
+  int64_t remaining = 0;
+  if (!shape.empty() && shape.front() > 0) {
+    remaining = byte_width;
+    for (auto i : permutation) {
+      if (i > 0) {
+        if (internal::MultiplyWithOverflow(remaining, shape[i], &remaining)) {
+          return Status::Invalid(
+              "Strides computed from shape would not fit in 64-bit integer");
+        }
+      }
+    }
+  }
+
+  if (remaining == 0) {
+    strides->assign(shape.size(), byte_width);
+    return Status::OK();
+  }
+
+  strides->push_back(remaining);
+  for (auto i : permutation) {
+    if (i > 0) {
+      remaining /= shape[i];
+      strides->push_back(remaining);
+    }
+  }
+  internal::Permute(permutation, strides);
+
+  return Status::OK();
+}
+
+}  // namespace arrow::internal
diff --git a/cpp/src/arrow/extension/tensor_internal.h b/cpp/src/arrow/extension/tensor_internal.h
index 069880cb17c..1a0bd0b29c2 100644
--- a/cpp/src/arrow/extension/tensor_internal.h
+++ b/cpp/src/arrow/extension/tensor_internal.h
@@ -20,26 +20,17 @@
 #include <cstdint>
 #include <vector>

-#include "arrow/status.h"
-#include "arrow/util/print.h"
+#include "arrow/array/array_nested.h"

 namespace arrow::internal {

 ARROW_EXPORT
-Status IsPermutationValid(const std::vector<int64_t>& permutation) {
-  const auto size = static_cast<int64_t>(permutation.size());
-  std::vector<uint8_t> dim_seen(size, 0);
-
-  for (const auto p : permutation) {
-    if (p < 0 || p >= size || dim_seen[p] != 0) {
-      return Status::Invalid(
-          "Permutation indices for ", size,
-          " dimensional tensors must be unique and within [0, ", size - 1,
-          "] range. Got: ", ::arrow::internal::PrintVector{permutation, ","});
-    }
-    dim_seen[p] = 1;
-  }
-  return Status::OK();
-}
+Status IsPermutationValid(const std::vector<int64_t>& permutation);
+
+ARROW_EXPORT
+Status ComputeStrides(const std::shared_ptr<DataType>& value_type,
+                      const std::vector<int64_t>& shape,
+                      const std::vector<int64_t>& permutation,
+                      std::vector<int64_t>* strides);

 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/extension/uuid_test.cc b/cpp/src/arrow/extension/uuid_test.cc
index 3bbb6eeb4ae..697d3f9bf2a 100644
--- a/cpp/src/arrow/extension/uuid_test.cc
+++ b/cpp/src/arrow/extension/uuid_test.cc
@@ -29,8 +29,6 @@

 namespace arrow {

-using arrow::ipc::test::RoundtripBatch;
-
 TEST(TestUuuidExtensionType, ExtensionTypeTest) {
   auto type = uuid();
   ASSERT_EQ(type->id(), Type::EXTENSION);
diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc
new file mode 100644
index 00000000000..96e82689183
--- /dev/null
+++ b/cpp/src/arrow/extension/variable_shape_tensor.cc
@@ -0,0 +1,314 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <numeric>
+#include <sstream>
+
+#include "arrow/extension/tensor_internal.h"
+#include "arrow/extension/variable_shape_tensor.h"
+
+#include "arrow/array/array_primitive.h"
+#include "arrow/json/rapidjson_defs.h"  // IWYU pragma: keep
+#include "arrow/scalar.h"
+#include "arrow/tensor.h"
+#include "arrow/util/int_util_overflow.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/print.h"
+#include "arrow/util/sort.h"
+#include "arrow/util/string.h"
+
+#include <rapidjson/document.h>
+#include <rapidjson/writer.h>
+
+namespace rj = arrow::rapidjson;
+
+namespace arrow::extension {
+
+bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const {
+  if (extension_name() != other.extension_name()) {
+    return false;
+  }
+  const auto& other_ext = internal::checked_cast<const VariableShapeTensorType&>(other);
+  if (this->ndim() != other_ext.ndim()) {
+    return false;
+  }
+
+  auto is_permutation_trivial = [](const std::vector<int64_t>& permutation) {
+    for (size_t i = 1; i < permutation.size(); ++i) {
+      if (permutation[i - 1] + 1 != permutation[i]) {
+        return false;
+      }
+    }
+    return true;
+  };
+  const bool permutation_equivalent =
+      ((permutation_ == other_ext.permutation()) ||
+       (permutation_.empty() && is_permutation_trivial(other_ext.permutation())) ||
+       (is_permutation_trivial(permutation_) && other_ext.permutation().empty()));
+
+  return (storage_type()->Equals(other_ext.storage_type())) &&
+         (dim_names_ == other_ext.dim_names()) &&
+         (uniform_shape_ == other_ext.uniform_shape()) && permutation_equivalent;
+}
+
+std::string VariableShapeTensorType::ToString(bool show_metadata) const {
+  std::stringstream ss;
+  ss << "extension<" << this->extension_name()
+     << "[value_type=" << value_type_->ToString() << ", ndim=" << ndim_;
+
+  if (!permutation_.empty()) {
+    ss << ", permutation=" << ::arrow::internal::PrintVector{permutation_, ","};
+  }
+  if (!dim_names_.empty()) {
+    ss << ", dim_names=[" << internal::JoinStrings(dim_names_, ",") << "]";
+  }
+  if (!uniform_shape_.empty()) {
+    std::vector<std::string> uniform_shape;
+    for (const auto& v : uniform_shape_) {
+      if (v.has_value()) {
+        uniform_shape.emplace_back(std::to_string(v.value()));
+      } else {
+        uniform_shape.emplace_back("null");
+      }
+    }
+    ss << ", uniform_shape=[" << internal::JoinStrings(uniform_shape, ",") << "]";
+  }
+  ss << "]>";
+  return ss.str();
+}
+
+std::string VariableShapeTensorType::Serialize() const {
+  rj::Document document;
+  document.SetObject();
+  rj::Document::AllocatorType& allocator = document.GetAllocator();
+
+  if (!permutation_.empty()) {
+    rj::Value permutation(rj::kArrayType);
+    for (auto v : permutation_) {
+      permutation.PushBack(v, allocator);
+    }
+    document.AddMember(rj::Value("permutation", allocator), permutation, allocator);
+  }
+
+  if (!dim_names_.empty()) {
+    rj::Value dim_names(rj::kArrayType);
+    for (std::string v : dim_names_) {
+      dim_names.PushBack(rj::Value{}.SetString(v.c_str(), allocator), allocator);
+    }
+    document.AddMember(rj::Value("dim_names", allocator), dim_names, allocator);
+  }
+
+  if (!uniform_shape_.empty()) {
+    rj::Value uniform_shape(rj::kArrayType);
+    for (auto v : uniform_shape_) {
+      if (v.has_value()) {
+        uniform_shape.PushBack(v.value(), allocator);
+      } else {
+        uniform_shape.PushBack(rj::Value{}.SetNull(), allocator);
+      }
+    }
+    document.AddMember(rj::Value("uniform_shape", allocator), uniform_shape, allocator);
+  }
+
+  rj::StringBuffer buffer;
+  rj::Writer<rj::StringBuffer> writer(buffer);
+  document.Accept(writer);
+  return buffer.GetString();
+}
+
+Result<std::shared_ptr<DataType>> VariableShapeTensorType::Deserialize(
+    std::shared_ptr<DataType> storage_type, const std::string& serialized_data) const {
+  if (storage_type->id() != Type::STRUCT) {
+    return Status::Invalid("Expected Struct storage type, got ",
+                           storage_type->ToString());
+  }
+  if (storage_type->num_fields() != 2) {
+    return Status::Invalid("Expected Struct storage type with 2 fields, got ",
+                           storage_type->num_fields());
+  }
+  if (storage_type->field(0)->type()->id() != Type::LIST) {
+    return Status::Invalid("Expected List storage type, got ",
+                           storage_type->field(0)->type()->ToString());
+  }
+  if (storage_type->field(1)->type()->id() != Type::FIXED_SIZE_LIST) {
+    return Status::Invalid("Expected FixedSizeList storage type, got ",
+                           storage_type->field(1)->type()->ToString());
+  }
+  if (internal::checked_cast<const FixedSizeListType&>(*storage_type->field(1)->type())
+          .value_type() != int32()) {
+    return Status::Invalid("Expected FixedSizeList value type int32, got ",
+                           storage_type->field(1)->type()->ToString());
+  }
+
+  const auto value_type = storage_type->field(0)->type()->field(0)->type();
+  const uint32_t ndim =
+      internal::checked_cast<const FixedSizeListType&>(*storage_type->field(1)->type())
+          .list_size();
+
+  rj::Document document;
+  if (document.Parse(serialized_data.data(), serialized_data.length()).HasParseError()) {
+    return Status::Invalid("Invalid serialized JSON data: ", serialized_data);
+  }
+
+  std::vector<int64_t> permutation;
+  if (document.HasMember("permutation")) {
+    permutation.reserve(ndim);
+    for (const auto& x : document["permutation"].GetArray()) {
+      permutation.emplace_back(x.GetInt64());
+    }
+  }
+  std::vector<std::string> dim_names;
+  if (document.HasMember("dim_names")) {
+    dim_names.reserve(ndim);
+    for (const auto& x : document["dim_names"].GetArray()) {
+      dim_names.emplace_back(x.GetString());
+    }
+  }
+
+  std::vector<std::optional<int64_t>> uniform_shape;
+  if (document.HasMember("uniform_shape")) {
+    uniform_shape.reserve(ndim);
+    for (const auto& x : document["uniform_shape"].GetArray()) {
+      if (x.IsNull()) {
+        uniform_shape.emplace_back(std::nullopt);
+      } else {
+        uniform_shape.emplace_back(x.GetInt64());
+      }
+    }
+  }
+
+  return VariableShapeTensorType::Make(value_type, ndim, permutation, dim_names,
+                                       uniform_shape);
+}
+
+std::shared_ptr<Array> VariableShapeTensorType::MakeArray(
+    std::shared_ptr<ArrayData> data) const {
+  DCHECK_EQ(data->type->id(), Type::EXTENSION);
+  DCHECK_EQ("arrow.variable_shape_tensor",
+            internal::checked_cast<const ExtensionType&>(*data->type).extension_name());
+  return std::make_shared<VariableShapeTensorArray>(data);
+}
+
+Result<std::shared_ptr<Tensor>> VariableShapeTensorType::MakeTensor(
+    const std::shared_ptr<ExtensionScalar>& scalar) {
+  const auto& tensor_scalar = internal::checked_cast<const StructScalar&>(*scalar->value);
+  const auto& ext_type =
+      internal::checked_cast<const VariableShapeTensorType&>(*scalar->type);
+
+  ARROW_ASSIGN_OR_RAISE(const auto data_scalar, tensor_scalar.field(0));
+  ARROW_ASSIGN_OR_RAISE(const auto shape_scalar, tensor_scalar.field(1));
+  ARROW_CHECK(tensor_scalar.is_valid);
+  const auto data_array =
+      internal::checked_pointer_cast<ListScalar>(data_scalar)->value;
+  const auto shape_array = internal::checked_pointer_cast<Int32Array>(
+      internal::checked_pointer_cast<FixedSizeListScalar>(shape_scalar)->value);
+
+  const auto& value_type =
+      internal::checked_cast<const FixedWidthType&>(*ext_type.value_type());
+
+  if (data_array->null_count() > 0) {
+    return Status::Invalid("Cannot convert data with nulls to Tensor.");
+  }
+
+  auto permutation = ext_type.permutation();
+  if (permutation.empty()) {
+    permutation.resize(ext_type.ndim());
+    std::iota(permutation.begin(), permutation.end(), 0);
+  }
+
+  ARROW_CHECK_EQ(shape_array->length(), ext_type.ndim());
+  std::vector<int64_t> shape;
+  shape.reserve(ext_type.ndim());
+  for (int64_t j = 0; j < static_cast<int64_t>(ext_type.ndim()); ++j) {
+    const auto size_value = shape_array->Value(j);
+    if (size_value < 0) {
+      return Status::Invalid("shape must have non-negative values");
+    }
+    shape.push_back(size_value);
+  }
+
+  std::vector<std::string> dim_names = ext_type.dim_names();
+  if (!dim_names.empty()) {
+    internal::Permute(permutation, &dim_names);
+  }
+
+  std::vector<int64_t> strides;
+  ARROW_RETURN_NOT_OK(
+      internal::ComputeStrides(ext_type.value_type(), shape, permutation, &strides));
+  internal::Permute(permutation, &shape);
+
+  const auto byte_width = value_type.byte_width();
+  const auto start_position = data_array->offset() * byte_width;
+  const auto size = std::accumulate(shape.begin(), shape.end(), static_cast<int64_t>(1),
+                                    std::multiplies<>());
+  ARROW_CHECK_EQ(size * byte_width, data_array->length() * byte_width);
+  ARROW_ASSIGN_OR_RAISE(
+      const auto buffer,
+      SliceBufferSafe(data_array->data()->buffers[1], start_position, size * byte_width));
+
+  return Tensor::Make(ext_type.value_type(), std::move(buffer), std::move(shape),
+                      std::move(strides), std::move(dim_names));
+}
+
+Result<std::shared_ptr<DataType>> VariableShapeTensorType::Make(
+    const std::shared_ptr<DataType>& value_type, const int32_t ndim,
+    const std::vector<int64_t>& permutation, const std::vector<std::string>& dim_names,
+    const std::vector<std::optional<int64_t>>& uniform_shape) {
+  if (!is_fixed_width(*value_type)) {
+    return Status::Invalid("Cannot convert non-fixed-width values to Tensor.");
+  }
+
+  if (!dim_names.empty() && dim_names.size() != static_cast<size_t>(ndim)) {
+    return Status::Invalid("dim_names size must match ndim. Expected: ", ndim,
+                           " Got: ", dim_names.size());
+  }
+  if (!uniform_shape.empty() && uniform_shape.size() != static_cast<size_t>(ndim)) {
+    return Status::Invalid("uniform_shape size must match ndim. Expected: ", ndim,
+                           " Got: ", uniform_shape.size());
+  }
+  if (!uniform_shape.empty()) {
+    for (const auto& v : uniform_shape) {
+      if (v.has_value() && v.value() < 0) {
+        return Status::Invalid("uniform_shape must have non-negative values");
+      }
+    }
+  }
+  if (!permutation.empty()) {
+    if (permutation.size() != static_cast<size_t>(ndim)) {
+      return Status::Invalid("permutation size must match ndim. Expected: ", ndim,
+                             " Got: ", permutation.size());
+    }
+    RETURN_NOT_OK(internal::IsPermutationValid(permutation));
+  }
+
+  return std::make_shared<VariableShapeTensorType>(
+      value_type, std::move(ndim), std::move(permutation), std::move(dim_names),
+      std::move(uniform_shape));
+}
+
+std::shared_ptr<DataType> variable_shape_tensor(
+    const std::shared_ptr<DataType>& value_type, const int32_t ndim,
+    const std::vector<int64_t> permutation, const std::vector<std::string> dim_names,
+    const std::vector<std::optional<int64_t>> uniform_shape) {
+  auto maybe_type =
+      VariableShapeTensorType::Make(value_type, std::move(ndim), std::move(permutation),
+                                    std::move(dim_names), std::move(uniform_shape));
+  ARROW_CHECK_OK(maybe_type.status());
+  return maybe_type.MoveValueUnsafe();
+}
+
+}  // namespace arrow::extension
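Review note (illustration, not part of the patch): a sketch of how the MakeTensor path above is meant to be driven from an array, mirroring the tests; ElementToTensor is a hypothetical helper, assuming this patch is applied:

    #include "arrow/array.h"
    #include "arrow/extension/variable_shape_tensor.h"
    #include "arrow/scalar.h"
    #include "arrow/tensor.h"
    #include "arrow/util/checked_cast.h"

    // Materialize element `i` of a variable-shape tensor extension array as an
    // arrow::Tensor whose strides are derived from the stored shape/permutation.
    arrow::Result<std::shared_ptr<arrow::Tensor>> ElementToTensor(
        const arrow::extension::VariableShapeTensorArray& arr, int64_t i) {
      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Scalar> scalar, arr.GetScalar(i));
      auto ext_scalar =
          arrow::internal::checked_pointer_cast<arrow::ExtensionScalar>(scalar);
      return arrow::extension::VariableShapeTensorType::MakeTensor(ext_scalar);
    }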
diff --git a/cpp/src/arrow/extension/variable_shape_tensor.h b/cpp/src/arrow/extension/variable_shape_tensor.h
new file mode 100644
index 00000000000..7b3e14fbc7e
--- /dev/null
+++ b/cpp/src/arrow/extension/variable_shape_tensor.h
@@ -0,0 +1,116 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "arrow/extension_type.h"
+
+namespace arrow {
+namespace extension {
+
+class ARROW_EXPORT VariableShapeTensorArray : public ExtensionArray {
+ public:
+  using ExtensionArray::ExtensionArray;
+};
+
+/// \brief Concrete type class for variable-shape Tensor data.
+/// This is a canonical arrow extension type.
+/// See: https://arrow.apache.org/docs/format/CanonicalExtensions.html
+class ARROW_EXPORT VariableShapeTensorType : public ExtensionType {
+ public:
+  VariableShapeTensorType(const std::shared_ptr<DataType>& value_type,
+                          const int32_t ndim,
+                          const std::vector<int64_t> permutation = {},
+                          const std::vector<std::string> dim_names = {},
+                          const std::vector<std::optional<int64_t>> uniform_shape = {})
+      : ExtensionType(struct_({::arrow::field("data", list(value_type)),
+                               ::arrow::field("shape", fixed_size_list(int32(), ndim))})),
+        value_type_(value_type),
+        ndim_(std::move(ndim)),
+        permutation_(std::move(permutation)),
+        dim_names_(std::move(dim_names)),
+        uniform_shape_(std::move(uniform_shape)) {}
+
+  std::string extension_name() const override { return "arrow.variable_shape_tensor"; }
+  std::string ToString(bool show_metadata = false) const override;
+
+  /// Number of dimensions of tensor elements
+  int32_t ndim() const { return ndim_; }
+
+  /// Value type of tensor elements
+  const std::shared_ptr<DataType>& value_type() const { return value_type_; }
+
+  /// Permutation mapping from logical to physical memory layout of tensor elements
+  const std::vector<int64_t>& permutation() const { return permutation_; }
+
+  /// Dimension names of tensor elements. Dimensions are ordered physically.
+  const std::vector<std::string>& dim_names() const { return dim_names_; }
+
+  /// Shape of uniform dimensions.
+  const std::vector<std::optional<int64_t>>& uniform_shape() const {
+    return uniform_shape_;
+  }
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  std::string Serialize() const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized_data) const override;
+
+  /// Create a VariableShapeTensorArray from ArrayData
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  /// \brief Convert an ExtensionScalar to a Tensor
+  ///
+  /// This method will return a Tensor from ExtensionScalar with strides derived
+  /// from shape and permutation stored. Shape and dim_names will be permuted
+  /// according to permutation stored in the VariableShapeTensorType.
+  static Result<std::shared_ptr<Tensor>> MakeTensor(
+      const std::shared_ptr<ExtensionScalar>& scalar);
+
+  /// \brief Create a VariableShapeTensorType instance
+  static Result<std::shared_ptr<DataType>> Make(
+      const std::shared_ptr<DataType>& value_type, const int32_t ndim,
+      const std::vector<int64_t>& permutation = {},
+      const std::vector<std::string>& dim_names = {},
+      const std::vector<std::optional<int64_t>>& uniform_shape = {});
+
+ private:
+  std::shared_ptr<DataType> storage_type_;
+  std::shared_ptr<DataType> value_type_;
+  int32_t ndim_;
+  std::vector<int64_t> permutation_;
+  std::vector<std::string> dim_names_;
+  std::vector<std::optional<int64_t>> uniform_shape_;
+};
+
+/// \brief Return a VariableShapeTensorType instance.
+ARROW_EXPORT std::shared_ptr<DataType> variable_shape_tensor(
+    const std::shared_ptr<DataType>& value_type, const int32_t ndim,
+    const std::vector<int64_t> permutation = {},
+    const std::vector<std::string> dim_names = {},
+    const std::vector<std::optional<int64_t>> uniform_shape = {});
+
+}  // namespace extension
+}  // namespace arrow
diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc
index fc220f73a6b..0ffdc79ef0e 100644
--- a/cpp/src/arrow/extension_type.cc
+++ b/cpp/src/arrow/extension_type.cc
@@ -31,6 +31,7 @@
 #ifdef ARROW_JSON
 #include "arrow/extension/fixed_shape_tensor.h"
 #include "arrow/extension/opaque.h"
+#include "arrow/extension/variable_shape_tensor.h"
 #endif
 #include "arrow/extension/uuid.h"
 #include "arrow/status.h"
@@ -153,6 +154,7 @@ static void CreateGlobalRegistry() {
 #ifdef ARROW_JSON
   ext_types.push_back(extension::fixed_shape_tensor(int64(), {}));
   ext_types.push_back(extension::opaque(null(), "", ""));
+  ext_types.push_back(extension::variable_shape_tensor(int64(), 0));
 #endif

   for (const auto& ext_type : ext_types) {
diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc
index f49ffc5cba5..333b699b8f0 100644
--- a/cpp/src/arrow/extension_type_test.cc
+++ b/cpp/src/arrow/extension_type_test.cc
@@ -42,8 +42,6 @@

 namespace arrow {

-using arrow::ipc::test::RoundtripBatch;
-
 class Parametric1Array : public ExtensionArray {
  public:
   using ExtensionArray::ExtensionArray;
diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc
index ae2e53b30a3..f896f52c15e 100644
--- a/cpp/src/arrow/testing/gtest_util.cc
+++ b/cpp/src/arrow/testing/gtest_util.cc
@@ -590,6 +590,20 @@ void ApproxCompareBatch(const RecordBatch& left, const RecordBatch& right,
   });
 }

+void RoundtripBatch(const std::shared_ptr<RecordBatch>& batch,
+                    std::shared_ptr<RecordBatch>* out) {
+  ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create());
+  ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(),
+                                        out_stream.get()));
+
+  ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish());
+
+  io::BufferReader reader(complete_ipc_stream);
+  std::shared_ptr<RecordBatchReader> batch_reader;
+  ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader));
+  ASSERT_OK(batch_reader->ReadNext(out));
+}
+
 std::shared_ptr<Array> TweakValidityBit(const std::shared_ptr<Array>& array,
                                         int64_t index, bool validity) {
   auto data = array->data()->Copy();
diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h
index 85b4c1f1f01..e77694d12a0 100644
--- a/cpp/src/arrow/testing/gtest_util.h
+++ b/cpp/src/arrow/testing/gtest_util.h
@@ -309,6 +309,9 @@ ARROW_TESTING_EXPORT void ApproxCompareBatch(
     const RecordBatch& left, const RecordBatch& right, bool compare_metadata = true,
     const EqualOptions& options = TestingEqualOptions());

+ARROW_TESTING_EXPORT void RoundtripBatch(const std::shared_ptr<RecordBatch>& batch,
+                                         std::shared_ptr<RecordBatch>* out);
+
 // Check if the padding of the buffers of the array is zero.
 // Also cause valgrind warnings if the padding bytes are uninitialized.
 ARROW_TESTING_EXPORT void AssertZeroPadded(const Array& array);
diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst
index 1106f8aaffd..ff7e0cd4b58 100644
--- a/docs/source/format/CanonicalExtensions.rst
+++ b/docs/source/format/CanonicalExtensions.rst
@@ -248,8 +248,8 @@ Variable shape tensor
    This means the logical tensor has names [z, x, y] and shape [30, 10, 20].

 .. note::
-  Values inside each **data** tensor element are stored in row-major/C-contiguous
-  order according to the corresponding **shape**.
+  Elements in a variable shape tensor extension array are stored
+  in row-major/C-contiguous order.
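Review note (illustration, not part of the patch): to make the storage convention described in that note concrete, a sketch assuming this patch is applied, using the ArrayFromJSON testing helper seen in the tests above; values mirror the pyarrow docstring example below:

    #include "arrow/extension/variable_shape_tensor.h"
    #include "arrow/extension_type.h"
    #include "arrow/testing/gtest_util.h"
    #include "arrow/util/checked_cast.h"

    std::shared_ptr<arrow::Array> ExampleArray() {
      // Storage type is struct<data: list<int64>, shape: fixed_size_list<int32, 2>>.
      auto type = arrow::internal::checked_pointer_cast<arrow::ExtensionType>(
          arrow::extension::variable_shape_tensor(arrow::int64(), /*ndim=*/2));
      // Element 0 is a 2x3 tensor, element 1 a 1x2 tensor; data is row-major.
      auto storage = arrow::ArrayFromJSON(type->storage_type(),
                                          R"([[[1,2,3,4,5,6],[2,3]], [[7,8],[1,2]]])");
      return arrow::ExtensionType::WrapArray(type, storage);
    }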
+ + Examples + -------- + Define the extension type for tensor array + + >>> import pyarrow as pa + >>> tensor_type = pa.variable_shape_tensor(pa.float64(), 2) + + Create an extension array + + >>> shapes = pa.array([[2, 3], [1, 2]], pa.list_(pa.int32(), 2)) + >>> values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(pa.float64())) + >>> arr = pa.StructArray.from_arrays([values, shapes], names=["data", "shape"]) + >>> pa.ExtensionArray.from_storage(tensor_type, arr) + + -- is_valid: all not null + -- child 0 type: list + [ + [ + 1, + 2, + 3, + 4, + 5, + 6 + ], + [ + 7, + 8 + ] + ] + -- child 1 type: fixed_size_list[2] + [ + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + """ + + @staticmethod + def from_numpy_ndarray(obj): + """ + Convert a list of numpy.ndarrays to a variable shape tensor extension array. + The length of the input list will become the length of the variable shape tensor array. + + Parameters + ---------- + obj : list of numpy.ndarray + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + + >>> ndarray_list = [ + ... np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32), + ... np.array([[7, 8]], dtype=np.float32), + ... ] + >>> arr = pa.VariableShapeTensorArray.from_numpy_ndarray(ndarray_list) + >>> assert len(ndarray_list) == len(arr) + >>> arr.type + VariableShapeTensorType(extension) + >>> arr + + -- is_valid: all not null + -- child 0 type: list + [ + [ + 1, + 2, + 3, + 4, + 5, + 6 + ], + [ + 7, + 8 + ] + ] + -- child 1 type: fixed_size_list[2] + [ + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + """ + assert isinstance(obj, list), 'obj must be a list of numpy arrays' + numpy_type = obj[0].dtype + arrow_type = from_numpy_dtype(numpy_type) + ndim = obj[0].ndim + permutations = [(-np.array(o.strides)).argsort(kind="stable") for o in obj] + permutation = permutations[0] + shapes = [np.take(o.shape, permutation) for o in obj] + + if not all([o.dtype == numpy_type for o in obj]): + raise TypeError('All numpy arrays must have matching dtype.') + + if not all([o.ndim == ndim for o in obj]): + raise ValueError('All numpy arrays must have matching ndim.') + + if not all([np.array_equal(p, permutation) for p in permutations]): + raise ValueError('All numpy arrays must have matching permutation.') + + for shape in shapes: + if len(shape) < 2: + raise ValueError( + "Cannot convert 1D array or scalar to fixed shape tensor array") + if np.prod(shape) == 0: + raise ValueError("Expected a non-empty ndarray") + + values = array([np.ravel(o, order="K") for o in obj], list_(arrow_type)) + shapes = array(shapes, list_(int32(), list_size=ndim)) + struct_arr = StructArray.from_arrays([values, shapes], names=["data", "shape"]) + + return ExtensionArray.from_storage(variable_shape_tensor(arrow_type, ndim, permutation=permutation), struct_arr) + + cdef dict _array_classes = { _Type_NA: NullArray, _Type_BOOL: BooleanArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index c2346750a19..5a8771ca1e9 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -840,6 +840,14 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: const shared_ptr[CBuffer] null_bitmap, ) + @staticmethod + CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"( + shared_ptr[CDataType], + const shared_ptr[CArray]& offsets, + const shared_ptr[CArray]& keys, + const shared_ptr[CArray]& items, + CMemoryPool* pool) + shared_ptr[CArray] keys() shared_ptr[CArray] items() CMapType* map_type() @@ -1114,6 +1122,11 @@ cdef extern from 
"arrow/api.h" namespace "arrow" nogil: void set_chunksize(int64_t chunksize) cdef cppclass CTensor" arrow::Tensor": + CTensor(const shared_ptr[CDataType]& type, + const shared_ptr[CBuffer]& data, + const vector[int64_t]& shape, + const vector[int64_t]& strides, + const vector[c_string]& dim_names) shared_ptr[CDataType] type() shared_ptr[CBuffer] data() @@ -2864,6 +2877,24 @@ cdef extern from "arrow/extension_type.h" namespace "arrow": shared_ptr[CArray] storage() +cdef extern from "arrow/extension/variable_shape_tensor.h" namespace "arrow::extension" nogil: + cdef cppclass CVariableShapeTensorType \ + " arrow::extension::VariableShapeTensorType"(CExtensionType): + + CResult[shared_ptr[CTensor]] MakeTensor(const shared_ptr[CExtensionScalar]& scalar) const + + @staticmethod + CResult[shared_ptr[CDataType]] Make(const shared_ptr[CDataType]& value_type, + const int32_t ndim, + const vector[int64_t] permutation, + const vector[c_string] dim_names, + const vector[optional[int64_t]] uniform_shape) + + const shared_ptr[CDataType] value_type() + const int32_t ndim() + const vector[int64_t] permutation() + const vector[c_string] dim_names() + const vector[optional[int64_t]] uniform_shape() cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil: cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType): @@ -2877,7 +2908,7 @@ cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil: cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension" nogil: cdef cppclass CFixedShapeTensorType \ - " arrow::extension::FixedShapeTensorType"(CExtensionType): + " arrow::extension::FixedShapeTensorType"(CExtensionType) nogil: CResult[shared_ptr[CTensor]] MakeTensor(const shared_ptr[CExtensionScalar]& scalar) const diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 5c3d981c3ad..903c48981fd 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -210,6 +210,11 @@ cdef class ExtensionType(BaseExtensionType): const CPyExtensionType* cpy_ext_type +cdef class VariableShapeTensorType(BaseExtensionType): + cdef: + const CVariableShapeTensorType* tensor_ext_type + + cdef class FixedShapeTensorType(BaseExtensionType): cdef: const CFixedShapeTensorType* tensor_ext_type diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index d3e2ff2e99d..24f2b14b2ba 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -127,6 +127,8 @@ cdef api object pyarrow_wrap_data_type( out = Bool8Type.__new__(Bool8Type) elif extension_name == b"arrow.fixed_shape_tensor": out = FixedShapeTensorType.__new__(FixedShapeTensorType) + elif extension_name == b"arrow.variable_shape_tensor": + out = VariableShapeTensorType.__new__(VariableShapeTensorType) elif extension_name == b"arrow.opaque": out = OpaqueType.__new__(OpaqueType) elif extension_name == b"arrow.uuid": diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 68f77832c43..f9a486743d9 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1064,7 +1064,7 @@ cdef class FixedShapeTensorScalar(ExtensionScalar): The resulting ndarray's shape matches the permuted shape of the fixed shape tensor scalar. - The conversion is zero-copy. + The conversion is zero-copy if data is primitive numeric and without nulls. 
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 5c3d981c3ad..903c48981fd 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -210,6 +210,11 @@ cdef class ExtensionType(BaseExtensionType):
         const CPyExtensionType* cpy_ext_type
 
 
+cdef class VariableShapeTensorType(BaseExtensionType):
+    cdef:
+        const CVariableShapeTensorType* tensor_ext_type
+
+
 cdef class FixedShapeTensorType(BaseExtensionType):
     cdef:
         const CFixedShapeTensorType* tensor_ext_type
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index d3e2ff2e99d..24f2b14b2ba 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -127,6 +127,8 @@ cdef api object pyarrow_wrap_data_type(
         out = Bool8Type.__new__(Bool8Type)
     elif extension_name == b"arrow.fixed_shape_tensor":
         out = FixedShapeTensorType.__new__(FixedShapeTensorType)
+    elif extension_name == b"arrow.variable_shape_tensor":
+        out = VariableShapeTensorType.__new__(VariableShapeTensorType)
     elif extension_name == b"arrow.opaque":
         out = OpaqueType.__new__(OpaqueType)
     elif extension_name == b"arrow.uuid":
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 68f77832c43..f9a486743d9 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -1064,7 +1064,7 @@ cdef class FixedShapeTensorScalar(ExtensionScalar):
         The resulting ndarray's shape matches the permuted shape of the
         fixed shape tensor scalar.
 
-        The conversion is zero-copy.
+        The conversion is zero-copy if the data is primitive numeric and contains no nulls.
 
         Returns
         -------
@@ -1113,6 +1143,43 @@ cdef class Bool8Scalar(ExtensionScalar):
         py_val = super().as_py()
         return None if py_val is None else py_val != 0
 
+
+cdef class VariableShapeTensorScalar(ExtensionScalar):
+    """
+    Concrete class for variable shape tensor extension scalar.
+    """
+
+    def to_numpy_ndarray(self):
+        """
+        Convert variable shape tensor extension scalar to a numpy array.
+
+        The conversion is zero-copy if the data is primitive numeric and contains no nulls.
+
+        Returns
+        -------
+        numpy.ndarray
+        """
+        return self.to_tensor().to_numpy()
+
+    def to_tensor(self):
+        """
+        Convert variable shape tensor extension scalar to a pyarrow.Tensor.
+
+        Returns
+        -------
+        tensor : pyarrow.Tensor
+        """
+        cdef:
+            CVariableShapeTensorType* c_type = static_pointer_cast[CVariableShapeTensorType, CDataType](
+                self.wrapped.get().type).get()
+            shared_ptr[CExtensionScalar] scalar = static_pointer_cast[CExtensionScalar, CScalar](self.wrapped)
+            shared_ptr[CTensor] ctensor
+
+        with nogil:
+            ctensor = GetResultValue(c_type.MakeTensor(scalar))
+        return pyarrow_wrap_tensor(ctensor)
+
+
 cdef dict _scalar_classes = {
     _Type_BOOL: BooleanScalar,
     _Type_UINT8: UInt8Scalar,
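With the scalar class above wired in, element access composes with the tensor conversion. A quick usage sketch, assuming the API added in this patch is built:

import numpy as np
import pyarrow as pa

arrays = [np.arange(6, dtype=np.float32).reshape(2, 3),
          np.zeros((1, 2), dtype=np.float32)]
arr = pa.VariableShapeTensorArray.from_numpy_ndarray(arrays)

scalar = arr[0]  # a VariableShapeTensorScalar
np.testing.assert_array_equal(scalar.to_tensor().to_numpy(), arrays[0])
np.testing.assert_array_equal(scalar.to_numpy_ndarray(), arrays[0])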
diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py
index aacbd2cb6e7..7468197a796 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -24,6 +24,7 @@
 import sys
 
 import numpy as np
+from numpy.lib.stride_tricks import as_strided
 
 import pyarrow as pa
 from pyarrow.vendored.version import Version
@@ -1405,7 +1406,7 @@ def test_uuid_extension():
     assert isinstance(array[0], pa.UuidScalar)
 
 
-def test_tensor_type():
+def test_fixed_shape_tensor_type():
     tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3])
     assert tensor_type.extension_name == "arrow.fixed_shape_tensor"
     assert tensor_type.storage_type == pa.list_(pa.int8(), 6)
@@ -1430,9 +1431,58 @@ def test_tensor_type():
     assert tensor_type.permutation is None
 
 
+def test_variable_shape_tensor_type():
+    tensor_type = pa.variable_shape_tensor(pa.int8(), 2)
+    expected_storage_type = pa.struct([
+        pa.field("data", pa.list_(pa.int8())),
+        pa.field("shape", pa.list_(pa.int32(), 2))
+    ])
+    assert tensor_type.extension_name == "arrow.variable_shape_tensor"
+    assert tensor_type.storage_type == expected_storage_type
+    assert tensor_type.ndim == 2
+    assert tensor_type.dim_names is None
+    assert tensor_type.permutation is None
+    assert tensor_type.uniform_shape is None
+
+    tensor_type = pa.variable_shape_tensor(pa.int64(), 3, dim_names=['C', 'H', 'W'])
+    expected_storage_type = pa.struct([
+        pa.field("data", pa.list_(pa.int64())),
+        pa.field("shape", pa.list_(pa.int32(), 3))
+    ])
+    assert tensor_type.extension_name == "arrow.variable_shape_tensor"
+    assert tensor_type.storage_type == expected_storage_type
+    assert tensor_type.ndim == 3
+    assert tensor_type.dim_names == ['C', 'H', 'W']
+    assert tensor_type.permutation is None
+    assert tensor_type.uniform_shape is None
+
+    tensor_type = pa.variable_shape_tensor(pa.bool_(), 2, permutation=[1, 0])
+    expected_storage_type = pa.struct([
+        pa.field("data", pa.list_(pa.bool_())),
+        pa.field("shape", pa.list_(pa.int32(), 2))
+    ])
+    assert tensor_type.extension_name == "arrow.variable_shape_tensor"
+    assert tensor_type.storage_type == expected_storage_type
+    assert tensor_type.ndim == 2
+    assert tensor_type.dim_names is None
+    assert tensor_type.permutation == [1, 0]
+    assert tensor_type.uniform_shape is None
+
+    tensor_type = pa.variable_shape_tensor(pa.float64(), 2, uniform_shape=[1, None])
+    expected_storage_type = pa.struct([
+        pa.field("data", pa.list_(pa.float64())),
+        pa.field("shape", pa.list_(pa.int32(), 2))
+    ])
+    assert tensor_type.extension_name == "arrow.variable_shape_tensor"
+    assert tensor_type.storage_type == expected_storage_type
+    assert tensor_type.ndim == 2
+    assert tensor_type.dim_names is None
+    assert tensor_type.permutation is None
+    assert tensor_type.uniform_shape == [1, None]
+
+
 @pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32()))
-def test_tensor_class_methods(value_type):
-    from numpy.lib.stride_tricks import as_strided
+def test_fixed_shape_tensor_class_methods(value_type):
     arrow_type = pa.from_numpy_dtype(value_type)
 
     tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 3])
@@ -1481,10 +1531,20 @@ def test_tensor_class_methods(value_type):
     assert result.to_tensor().shape == (1, 3, 2, 2)
     assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, 2 * bw)
 
+    tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[2, 1, 0])
+    result = pa.ExtensionArray.from_storage(tensor_type, storage)
+    expected = as_strided(flat_arr, shape=(1, 3, 2, 2),
+                          strides=(bw * 12, bw, bw * 3, bw * 6))
+    np.testing.assert_array_equal(result.to_numpy_ndarray(), expected)
+
+    assert result.type.permutation == [2, 1, 0]
+    assert result.type.shape == [2, 2, 3]
+    assert result.to_tensor().shape == (1, 3, 2, 2)
+    assert result.to_tensor().strides == (12 * bw, 1 * bw, 3 * bw, 6 * bw)
+
 
 @pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32()))
-def test_tensor_array_from_numpy(value_type):
-    from numpy.lib.stride_tricks import as_strided
+def test_fixed_shape_tensor_array_from_numpy(value_type):
     arrow_type = pa.from_numpy_dtype(value_type)
 
     arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
@@ -1546,6 +1606,137 @@ def test_tensor_array_from_numpy(value_type):
         pa.FixedShapeTensorArray.from_numpy_ndarray(arr.reshape((3, 0, 2)))
 
 
+@pytest.mark.parametrize("value_type", (np.int8, np.int32, np.int64, np.float64))
+def test_variable_shape_tensor_class_methods(value_type):
+    ndim = 2
+    shape_type = pa.list_(pa.int32(), ndim)
+    arrow_type = pa.from_numpy_dtype(value_type)
+    tensor_type = pa.variable_shape_tensor(
+        arrow_type,
+        ndim,
+        dim_names=["H", "W"],
+        permutation=[0, 1],
+        uniform_shape=[None, None],
+    )
+    fields = [pa.field("data", pa.list_(arrow_type)), pa.field("shape", shape_type)]
+
+    shapes = pa.array([[2, 3], [2, 1]], shape_type)
+    values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(arrow_type))
+    struct_arr = pa.StructArray.from_arrays([values, shapes], fields=fields)
+    arr = pa.ExtensionArray.from_storage(tensor_type, struct_arr)
+    basic_arr = pa.ExtensionArray.from_storage(
+        pa.variable_shape_tensor(arrow_type, ndim), struct_arr
+    )
+
+    storage = pa.array(
+        [([1, 2, 3, 4, 5, 6], [2, 3]), ([7, 8], [2, 1])], type=pa.struct(fields)
+    )
+    assert pa.ExtensionArray.from_storage(tensor_type, storage).equals(arr)
+
+    assert arr.type == tensor_type
+
+    ndarray_list = [
+        np.array([[1, 2, 3], [4, 5, 6]], dtype=value_type),
+        np.array([[7], [8]], dtype=value_type),
+    ]
+    for x, y in zip(arr, ndarray_list):
+        np.testing.assert_array_equal(x.to_numpy_ndarray(), y)
+
+    assert pa.VariableShapeTensorArray.from_numpy_ndarray(ndarray_list).equals(
+        basic_arr
+    )
+
+    assert arr.to_pylist() == [
+        {"data": [1, 2, 3, 4, 5, 6], "shape": [2, 3]},
+        {"data": [7, 8], "shape": [2, 1]},
+    ]
+
+    expected_0 = np.array([[1, 2, 3], [4, 5, 6]], dtype=value_type)
+    expected_1 = np.array([[7], [8]], dtype=value_type)
+
+    np.testing.assert_array_equal(arr[0].to_tensor().to_numpy(), expected_0)
+    np.testing.assert_array_equal(arr[1].to_tensor().to_numpy(), expected_1)
+
+    np.testing.assert_array_equal(arr[0].to_numpy_ndarray(), expected_0)
+    np.testing.assert_array_equal(arr[1].to_numpy_ndarray(), expected_1)
+
+    assert arr[0].to_tensor().equals(
+        pa.Tensor.from_numpy(expected_0, dim_names=["H", "W"]))
+
+    assert arr[1].to_tensor().equals(
+        pa.Tensor.from_numpy(expected_1, dim_names=["H", "W"]))
+
+    shapes = pa.array([[2, 3], [0, 0]], shape_type)
+    values = pa.array([[1, 2, 3, 4, 5, 6], []], pa.list_(arrow_type))
+    struct_arr = pa.StructArray.from_arrays([values, shapes], fields=fields)
+    arr = pa.ExtensionArray.from_storage(tensor_type, struct_arr)
+    np.testing.assert_array_equal(arr[1].to_tensor().to_numpy(), np.array(
+        [], dtype=value_type).reshape(shapes[1].as_py()))
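The zero-sized case at the end of the test above leans on the storage invariant that every element's flat "data" list holds exactly prod(shape) values. A small checker sketch (illustrative, not part of the patch) that works on any storage struct array built as above:

import numpy as np

def check_storage(storage):
    # prod over a shape containing zeros is 0, so empty tensors
    # are handled uniformly with non-empty ones.
    for element in storage.to_pylist():
        assert len(element["data"]) == np.prod(element["shape"], dtype=int)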
+
+
+@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32()))
+def test_variable_shape_tensor_array_from_numpy(value_type):
+    arrow_type = pa.from_numpy_dtype(value_type)
+
+    arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+                   dtype=value_type, order="C")
+    tensor_array_from_numpy = pa.VariableShapeTensorArray.from_numpy_ndarray([arr])
+    assert isinstance(tensor_array_from_numpy.type, pa.VariableShapeTensorType)
+    assert tensor_array_from_numpy.type.value_type == arrow_type
+    assert tensor_array_from_numpy.type.ndim == 3
+    assert tensor_array_from_numpy.type.permutation == [0, 1, 2]
+
+    f_arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+                     dtype=value_type, order="F")
+    with pytest.raises(ValueError, match="numpy arrays must have matching permutation"):
+        pa.VariableShapeTensorArray.from_numpy_ndarray([f_arr, arr])
+    with pytest.raises(ValueError, match="numpy arrays must have matching ndim"):
+        pa.VariableShapeTensorArray.from_numpy_ndarray([arr.reshape((12, 1)), arr])
+    with pytest.raises(TypeError, match="numpy arrays must have matching dtype"):
+        pa.VariableShapeTensorArray.from_numpy_ndarray([arr.astype(np.int32()), arr])
+
+    flat_arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=value_type)
+    bw = value_type.itemsize
+
+    arr = flat_arr.reshape(1, 3, 4)
+    tensor_array_from_numpy = pa.VariableShapeTensorArray.from_numpy_ndarray([arr])
+    assert tensor_array_from_numpy.type.ndim == 3
+    assert tensor_array_from_numpy.type.permutation == [0, 1, 2]
+    assert tensor_array_from_numpy[0].to_tensor() == pa.Tensor.from_numpy(arr)
+
+    arr = as_strided(flat_arr, shape=(1, 2, 3, 2),
+                     strides=(bw * 12, bw * 6, bw, bw * 3))
+    tensor_array_from_numpy = pa.VariableShapeTensorArray.from_numpy_ndarray([arr])
+    assert tensor_array_from_numpy.type.ndim == 4
+    assert tensor_array_from_numpy.type.permutation == [0, 1, 3, 2]
+    assert tensor_array_from_numpy[0].to_tensor() == pa.Tensor.from_numpy(arr)
+
+    arr = flat_arr.reshape(1, 2, 3, 2)
+    result = pa.VariableShapeTensorArray.from_numpy_ndarray([arr])
+    expected = np.array(
+        [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=value_type)
+    np.testing.assert_array_equal(result[0].to_numpy_ndarray(), expected)
+
+    arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=value_type)
+    with pytest.raises(ValueError,
+                       match="Cannot convert 1D array or scalar to variable"):
+        pa.VariableShapeTensorArray.from_numpy_ndarray([arr])
+
+    arr = np.array(1, dtype=value_type)
+    with pytest.raises(ValueError,
+                       match="Cannot convert 1D array or scalar to variable"):
+        pa.VariableShapeTensorArray.from_numpy_ndarray([arr])
+
+    arr = np.array([], dtype=value_type)
+
+    with pytest.raises(ValueError,
+                       match="Cannot convert 1D array or scalar to variable"):
+        pa.VariableShapeTensorArray.from_numpy_ndarray([arr.reshape((0))])
+
+    with pytest.raises(ValueError, match="Expected a non-empty ndarray"):
+        pa.VariableShapeTensorArray.from_numpy_ndarray([arr.reshape((0, 3, 2))])
+
+    with pytest.raises(ValueError, match="Expected a non-empty ndarray"):
+        pa.VariableShapeTensorArray.from_numpy_ndarray([arr.reshape((3, 0, 2))])
+
+
 @pytest.mark.parametrize("tensor_type", (
     pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]),
     pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 2, 1]),
@@ -1576,6 +1767,47 @@ def test_tensor_type_ipc(tensor_type):
     assert result.type.shape == [2, 2, 3]
 
 
+@pytest.mark.parametrize("tensor_type", (
+    pa.variable_shape_tensor(pa.int8(), 2),
+    pa.variable_shape_tensor(pa.int8(), 2, permutation=[1, 0]),
+    pa.variable_shape_tensor(pa.int8(), 2, dim_names=['H', 'W']),
+    pa.variable_shape_tensor(pa.int8(), 2, uniform_shape=[None, None]),
+))
+def test_variable_shape_tensor_type_ipc(tensor_type):
+    values_type = tensor_type.storage_type.field(0).type
+    shape_type = tensor_type.storage_type.field(1).type
+    values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], values_type)
+    shapes = pa.array([[2, 3], [1, 2]], shape_type)
+
+    struct_arr = pa.StructArray.from_arrays([values, shapes], names=["data", "shape"])
+    arr = pa.ExtensionArray.from_storage(tensor_type, struct_arr)
+    batch = pa.RecordBatch.from_arrays([arr], ["ext"])
+
+    # check the built array has exactly the expected class
+    tensor_class = tensor_type.__arrow_ext_class__()
+    assert isinstance(arr, tensor_class)
+
+    buf = ipc_write_batch(batch)
+    del batch
+    batch = ipc_read_batch(buf)
+
+    result = batch.column(0)
+    # check the deserialized array class is the expected one
+    assert isinstance(result, tensor_class)
+    assert result.type.extension_name == "arrow.variable_shape_tensor"
+    assert arr.storage.to_pylist() == [
+        {"data": [1, 2, 3, 4, 5, 6], "shape": [2, 3]},
+        {"data": [7, 8], "shape": [1, 2]},
+    ]
+
+    # we get back an actual VariableShapeTensorType
+    assert isinstance(result.type, pa.VariableShapeTensorType)
+    assert result.type.value_type == pa.int8()
+    assert result.type.ndim == 2
+    assert result.type.permutation == tensor_type.permutation
+    assert result.type.dim_names == tensor_type.dim_names
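The IPC test above uses the module's `ipc_write_batch`/`ipc_read_batch` helpers; with the public API the same roundtrip looks like the following sketch. The extension type travels in the field metadata, so no extra registration step is needed for canonical extensions.

import pyarrow as pa

def roundtrip(batch):
    # Serialize one record batch to an in-memory IPC stream, read it back.
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, batch.schema) as writer:
        writer.write_batch(batch)
    return pa.ipc.open_stream(sink.getvalue()).read_next_batch()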
+
+
 def test_tensor_type_equality():
     tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3])
     assert tensor_type.extension_name == "arrow.fixed_shape_tensor"
@@ -1585,6 +1817,14 @@ def test_tensor_type_equality():
     assert tensor_type == tensor_type2
     assert not tensor_type == tensor_type3
 
+    tensor_type = pa.variable_shape_tensor(pa.int8(), 2)
+    assert tensor_type.extension_name == "arrow.variable_shape_tensor"
+
+    tensor_type2 = pa.variable_shape_tensor(pa.int8(), 2)
+    tensor_type3 = pa.variable_shape_tensor(pa.uint8(), 2)
+    assert tensor_type == tensor_type2
+    assert not tensor_type == tensor_type3
+
 
 def test_tensor_type_cast():
     tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3])
@@ -1650,7 +1890,7 @@ def test_extension_to_pandas_storage_type(registered_period_type):
     assert isinstance(result["ext"].dtype, pd.ArrowDtype)
 
 
-def test_tensor_type_is_picklable(pickle_module):
+def test_fixed_shape_tensor_type_is_picklable(pickle_module):
     # GH-35599
     expected_type = pa.fixed_shape_tensor(pa.int32(), (2, 2))
@@ -1666,6 +1906,22 @@ def test_tensor_type_is_picklable(pickle_module):
     assert result == expected_arr
 
 
+def test_variable_shape_tensor_type_is_picklable(pickle_module):
+    expected_type = pa.variable_shape_tensor(pa.int32(), 2)
+    result = pickle_module.loads(pickle_module.dumps(expected_type))
+
+    assert result == expected_type
+
+    shapes = pa.array([[2, 3], [1, 2]], pa.list_(pa.int32(), 2))
+    values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(pa.int32()))
+    arr = pa.StructArray.from_arrays([values, shapes], names=["data", "shape"])
+    expected_arr = pa.ExtensionArray.from_storage(expected_type, arr)
+
+    result = pickle_module.loads(pickle_module.dumps(expected_arr))
+
+    assert result == expected_arr
+
+
 @pytest.mark.parametrize(("tensor_type", "text"), [
     (
         pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]),
@@ -1680,7 +1936,7 @@ def test_tensor_type_is_picklable(pickle_module):
         'fixed_shape_tensor[value_type=int64, shape=[2,2,3], dim_names=[C,H,W]]'
    )
 ])
 def test_tensor_type_str(tensor_type, text):
     tensor_type_str = tensor_type.__str__()
     assert text in tensor_type_str
 
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index f83ecc3aa43..9130915afc0 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -1784,6 +1784,99 @@ cdef class UuidType(BaseExtensionType):
         return UuidScalar
 
 
+cdef class VariableShapeTensorType(BaseExtensionType):
+    """
+    Concrete class for variable shape tensor extension type.
+
+    Examples
+    --------
+    Create an instance of variable shape tensor extension type:
+
+    >>> import pyarrow as pa
+    >>> pa.variable_shape_tensor(pa.int32(), 2)
+    VariableShapeTensorType(extension)
+
+    Create an instance of variable shape tensor extension type with
+    permutation:
+
+    >>> tensor_type = pa.variable_shape_tensor(pa.int8(), 3,
+    ...                                        permutation=[0, 2, 1])
+    >>> tensor_type.permutation
+    [0, 2, 1]
+    """
+
+    cdef void init(self, const shared_ptr[CDataType]& type) except *:
+        BaseExtensionType.init(self, type)
+        self.tensor_ext_type = <const CVariableShapeTensorType*> type.get()
+
+    @property
+    def value_type(self):
+        """
+        Data type of an individual tensor.
+        """
+        return pyarrow_wrap_data_type(self.tensor_ext_type.value_type())
+
+    @property
+    def ndim(self):
+        """
+        Number of dimensions of the tensors.
+        """
+        return self.tensor_ext_type.ndim()
+
+    @property
+    def dim_names(self):
+        """
+        Explicit names of the dimensions.
+        """
+        list_of_bytes = self.tensor_ext_type.dim_names()
+        if len(list_of_bytes) != 0:
+            return [frombytes(x) for x in list_of_bytes]
+        else:
+            return None
+
+    @property
+    def permutation(self):
+        """
+        Indices of the dimensions ordering.
+        """
+        indices = self.tensor_ext_type.permutation()
+        if len(indices) != 0:
+            return indices
+        else:
+            return None
+
+    @property
+    def uniform_shape(self):
+        """
+        Shape over dimensions that are guaranteed to be constant.
+        """
+        cdef:
+            vector[optional[int64_t]] c_uniform_shape = self.tensor_ext_type.uniform_shape()
+            size_t length = c_uniform_shape.size()
+
+        if length == 0:
+            return None
+
+        uniform_shape = []
+        for i in range(length):
+            if c_uniform_shape[i].has_value():
+                uniform_shape.append(c_uniform_shape[i].value())
+            else:
+                uniform_shape.append(None)
+
+        return uniform_shape
+
+    def __arrow_ext_class__(self):
+        return VariableShapeTensorArray
+
+    def __reduce__(self):
+        return variable_shape_tensor, (self.value_type, self.ndim,
+                                       self.dim_names, self.permutation,
+                                       self.uniform_shape)
+
+    def __arrow_ext_scalar_class__(self):
+        return VariableShapeTensorScalar
+
+
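`__reduce__` above must pass its arguments in the factory's positional order, `variable_shape_tensor(value_type, ndim, dim_names, permutation, uniform_shape)`; pickling then simply re-invokes the factory. A minimal check, assuming this patch is built:

import pickle
import pyarrow as pa

t = pa.variable_shape_tensor(pa.int8(), 3, dim_names=["C", "H", "W"],
                             permutation=[2, 0, 1])
# pickle calls the factory again with the same arguments, so the
# reconstructed type compares equal to the original.
assert pickle.loads(pickle.dumps(t)) == t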
 cdef class FixedShapeTensorType(BaseExtensionType):
     """
     Concrete class for fixed shape tensor extension type.
@@ -5444,6 +5537,121 @@ def opaque(DataType storage_type, str type_name not None, str vendor_name not No
     return out
 
 
+def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation=None,
+                          uniform_shape=None):
+    """
+    Create an instance of the variable shape tensor extension type, given the
+    number of dimensions and, optionally, dimension names, a permutation
+    describing the desired logical ordering of dimensions, and a uniform shape.
+
+    Parameters
+    ----------
+    value_type : DataType
+        Data type of individual tensor elements.
+    ndim : integer
+        The number of dimensions of the contained tensors.
+    dim_names : tuple or list of strings, default None
+        Explicit names for the tensor dimensions.
+    permutation : tuple or list of integers, default None
+        Indices of the desired ordering of the original dimensions.
+        The indices contain a permutation of the values ``[0, 1, ..., N-1]`` where
+        N is the number of dimensions. The permutation indicates which dimension
+        of the logical layout corresponds to which dimension of the physical tensor.
+        For more information on this parameter see
+        :ref:`variable_shape_tensor_extension`.
+    uniform_shape : tuple or list of integers, default None
+        Sizes of the dimensions that are guaranteed to be the same for all
+        tensors in the array, with ``None`` for each dimension whose size
+        may vary.
+
+    Examples
+    --------
+    Create an instance of variable shape tensor extension type:
+
+    >>> import pyarrow as pa
+    >>> tensor_type = pa.variable_shape_tensor(pa.int32(), 2)
+    >>> tensor_type
+    VariableShapeTensorType(extension)
+
+    Inspect the data type:
+
+    >>> tensor_type.value_type
+    DataType(int32)
+    >>> tensor_type.ndim
+    2
+
+    Create a table with variable shape tensor extension array:
+
+    >>> fields = [pa.field("data", pa.list_(pa.int32())),
+    ...           pa.field("shape", pa.list_(pa.int32(), 2))]
+    >>> storage = pa.array([([1, 2, 3, 4, 5, 6], [2, 3]), ([7, 8], [1, 2])],
+    ...                    type=pa.struct(fields))
+    >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage)
+    >>> pa.table([tensor], names=["tensor_array"])
+    pyarrow.Table
+    tensor_array: extension
+    ----
+    tensor_array: [  -- is_valid: all not null
+    -- child 0 type: list<item: int32>
+    [[1,2,3,4,5,6],[7,8]]
+    -- child 1 type: fixed_size_list<item: int32>[2]
+    [[2,3],[1,2]]]
+
+    Create an instance of variable shape tensor extension type with names
+    of tensor dimensions:
+
+    >>> tensor_type = pa.variable_shape_tensor(pa.int8(), 3,
+    ...                                        dim_names=['C', 'H', 'W'])
+    >>> tensor_type.dim_names
+    ['C', 'H', 'W']
+
+    Create an instance of variable shape tensor extension type with
+    permutation:
+
+    >>> tensor_type = pa.variable_shape_tensor(pa.int8(), 3,
+    ...                                        permutation=[0, 2, 1])
+    >>> tensor_type.permutation
+    [0, 2, 1]
+
+    Returns
+    -------
+    type : VariableShapeTensorType
+    """
+
+    cdef:
+        int32_t c_ndim
+        vector[int64_t] c_permutation
+        vector[c_string] c_dim_names
+        vector[optional[int64_t]] c_uniform_shape
+        shared_ptr[CDataType] c_tensor_ext_type
+
+    assert value_type is not None
+    assert ndim is not None
+
+    c_ndim = ndim
+
+    if permutation is not None:
+        for i in permutation:
+            c_permutation.push_back(i)
+
+    if dim_names is not None:
+        for x in dim_names:
+            c_dim_names.push_back(tobytes(x))
+
+    if uniform_shape is not None:
+        for x in uniform_shape:
+            if x is None:
+                c_uniform_shape.push_back(nullopt)
+            else:
+                c_uniform_shape.push_back(<int64_t> x)
+
+    cdef VariableShapeTensorType out = VariableShapeTensorType.__new__(VariableShapeTensorType)
+
+    with nogil:
+        c_tensor_ext_type = GetResultValue(CVariableShapeTensorType.Make(
+            value_type.sp_type, c_ndim, c_permutation, c_dim_names, c_uniform_shape))
+
+    out.init(c_tensor_ext_type)
+    return out
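One case where `uniform_shape` earns its keep (values illustrative, assuming this patch is built): images whose height and width vary while the channel count is pinned, so downstream readers can rely on the last dimension always being 3.

import pyarrow as pa

image_type = pa.variable_shape_tensor(
    pa.uint8(), 3,
    dim_names=["H", "W", "C"],
    uniform_shape=[None, None, 3],  # H and W vary; C is always 3
)
assert image_type.uniform_shape == [None, None, 3]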
+
+
 cdef dict _type_aliases = {
     'null': null,
     'bool': bool_,