From 3618369ff28855b6f4a19d7d009f20119b815f16 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 31 Jul 2025 10:13:58 +0800 Subject: [PATCH 1/3] GH-47241: [C++][Parquet] Fix VariantExtentionType conversion --- cpp/src/parquet/arrow/arrow_schema_test.cc | 62 +++++++++++++--------- cpp/src/parquet/arrow/schema.cc | 4 ++ cpp/src/parquet/schema_test.cc | 2 +- cpp/src/parquet/types.cc | 5 +- 4 files changed, 45 insertions(+), 28 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 390d8bc77ef..70d1b1110db 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -25,6 +25,7 @@ #include "parquet/arrow/reader.h" #include "parquet/arrow/reader_internal.h" #include "parquet/arrow/schema.h" +#include "parquet/arrow/variant_internal.h" #include "parquet/file_reader.h" #include "parquet/schema.h" #include "parquet/schema_internal.h" @@ -941,46 +942,57 @@ TEST_F(TestConvertParquetSchema, ParquetVariant) { PrimitiveNode::Make("metadata", Repetition::REQUIRED, ParquetType::BYTE_ARRAY); auto value = PrimitiveNode::Make("value", Repetition::REQUIRED, ParquetType::BYTE_ARRAY); - - auto variant = - GroupNode::Make("variant_unshredded", Repetition::OPTIONAL, {metadata, value}); + auto variant = GroupNode::Make("variant_unshredded", Repetition::OPTIONAL, + {metadata, value}, LogicalType::Variant()); parquet_fields.push_back(variant); - { - // Test converting from parquet schema to arrow schema. - std::vector> arrow_fields; - auto arrow_metadata = - ::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false); - auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false); - auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value}); - arrow_fields.push_back( - ::arrow::field("variant_unshredded", arrow_variant, /*nullable=*/true)); - auto arrow_schema = ::arrow::schema(arrow_fields); + // Arrow schema for unshredded variant struct. + auto arrow_metadata = ::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false); + auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false); + auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value}); + + // Register the variant extension type since it is not registered by default. + auto variant_extension = std::make_shared(arrow_variant); + ASSERT_OK(::arrow::ExtensionTypeRegistry::GetGlobalRegistry()->RegisterType( + variant_extension)); + { + // Parquet file does not contain Arrow schema. + // By default, field should be treated as a normal struct in Arrow. + auto arrow_schema = + ::arrow::schema({::arrow::field("variant_unshredded", arrow_variant)}); ASSERT_OK(ConvertSchema(parquet_fields)); - ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema, /*check_metadata=*/true)); + } + + { + // Parquet file does not contain Arrow schema. + // If Arrow extensions are enabled, field should be interpreted as Parquet Variant + // extension type. + ArrowReaderProperties props; + props.set_arrow_extensions_enabled(true); + + auto arrow_schema = + ::arrow::schema({::arrow::field("variant_unshredded", variant_extension)}); + + ASSERT_OK(ConvertSchema(parquet_fields, /*metadata=*/nullptr, props)); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema, /*check_metadata=*/true)); } { - // Test converting from parquet schema to arrow schema even though + // Parquet file does contain Arrow schema. + // Field should be interpreted as Parquet Variant extension even though // extensions are not enabled. ArrowReaderProperties props; props.set_arrow_extensions_enabled(false); - // Test converting from parquet schema to arrow schema. - std::vector> arrow_fields; - auto arrow_metadata = - ::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false); - auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false); - auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value}); - arrow_fields.push_back( - ::arrow::field("variant_unshredded", arrow_variant, /*nullable=*/true)); - auto arrow_schema = ::arrow::schema(arrow_fields); + auto arrow_schema = + ::arrow::schema({::arrow::field("variant_unshredded", variant_extension)}); std::shared_ptr metadata; ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata)); ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); - CheckFlatSchema(arrow_schema, true /* check_metadata */); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema, /*check_metadata=*/true)); } } diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index cd026b3bea8..ab810aa0d9e 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -591,6 +591,10 @@ Status GroupToStruct(const GroupNode& node, LevelInfo current_levels, arrow_fields.push_back(out->children[i].field); } auto struct_type = ::arrow::struct_(arrow_fields); + if (ctx->properties.get_arrow_extensions_enabled() && + node.logical_type()->is_variant()) { + ARROW_ASSIGN_OR_RAISE(struct_type, VariantExtensionType::Make(struct_type)); + } out->field = ::arrow::field(node.name(), struct_type, node.is_optional(), FieldIdMetadata(node.field_id())); out->level_info = current_levels; diff --git a/cpp/src/parquet/schema_test.cc b/cpp/src/parquet/schema_test.cc index 9c155e1763d..c33e5ccf4a5 100644 --- a/cpp/src/parquet/schema_test.cc +++ b/cpp/src/parquet/schema_test.cc @@ -1251,7 +1251,7 @@ TEST(TestLogicalTypeOperation, LogicalTypeProperties) { {BSONLogicalType::Make(), false, true, true}, {UUIDLogicalType::Make(), false, true, true}, {Float16LogicalType::Make(), false, true, true}, - {VariantLogicalType::Make(), false, true, true}, + {VariantLogicalType::Make(), true, true, true}, {NoLogicalType::Make(), false, false, true}, }; diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index e54683084cf..4d6691c0aea 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -835,8 +835,9 @@ bool LogicalType::is_valid() const { } bool LogicalType::is_invalid() const { return !is_valid(); } bool LogicalType::is_nested() const { - return (impl_->type() == LogicalType::Type::LIST) || - (impl_->type() == LogicalType::Type::MAP); + return impl_->type() == LogicalType::Type::LIST || + impl_->type() == LogicalType::Type::MAP || + impl_->type() == LogicalType::Type::VARIANT; } bool LogicalType::is_nonnested() const { return !is_nested(); } bool LogicalType::is_serialized() const { return impl_->is_serialized(); } From 00c754e367a5443f90d4dd5b418440ce4f8720e9 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 24 Aug 2025 13:49:20 +0800 Subject: [PATCH 2/3] use ExtensionTypeGuard and test unregistered --- cpp/src/parquet/arrow/arrow_schema_test.cc | 26 +++++++++++++--------- cpp/src/parquet/arrow/schema.cc | 7 +++++- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 70d1b1110db..24b8a5be78c 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -950,11 +950,7 @@ TEST_F(TestConvertParquetSchema, ParquetVariant) { auto arrow_metadata = ::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false); auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false); auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value}); - - // Register the variant extension type since it is not registered by default. auto variant_extension = std::make_shared(arrow_variant); - ASSERT_OK(::arrow::ExtensionTypeRegistry::GetGlobalRegistry()->RegisterType( - variant_extension)); { // Parquet file does not contain Arrow schema. @@ -965,29 +961,37 @@ TEST_F(TestConvertParquetSchema, ParquetVariant) { ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema, /*check_metadata=*/true)); } - { + for (bool register_extension : {true, false}) { + ::arrow::ExtensionTypeGuard guard(register_extension + ? ::arrow::DataTypeVector{variant_extension} + : ::arrow::DataTypeVector{}); + // Parquet file does not contain Arrow schema. // If Arrow extensions are enabled, field should be interpreted as Parquet Variant - // extension type. + // extension type if registered. ArrowReaderProperties props; props.set_arrow_extensions_enabled(true); - auto arrow_schema = - ::arrow::schema({::arrow::field("variant_unshredded", variant_extension)}); + auto arrow_schema = ::arrow::schema({::arrow::field( + "variant_unshredded", register_extension ? variant_extension : arrow_variant)}); ASSERT_OK(ConvertSchema(parquet_fields, /*metadata=*/nullptr, props)); ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema, /*check_metadata=*/true)); } - { + for (bool register_extension : {true, false}) { + ::arrow::ExtensionTypeGuard guard(register_extension + ? ::arrow::DataTypeVector{variant_extension} + : ::arrow::DataTypeVector{}); + // Parquet file does contain Arrow schema. // Field should be interpreted as Parquet Variant extension even though // extensions are not enabled. ArrowReaderProperties props; props.set_arrow_extensions_enabled(false); - auto arrow_schema = - ::arrow::schema({::arrow::field("variant_unshredded", variant_extension)}); + auto arrow_schema = ::arrow::schema({::arrow::field( + "variant_unshredded", register_extension ? variant_extension : arrow_variant)}); std::shared_ptr metadata; ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata)); diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index ab810aa0d9e..f0dc45b7851 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -593,7 +593,12 @@ Status GroupToStruct(const GroupNode& node, LevelInfo current_levels, auto struct_type = ::arrow::struct_(arrow_fields); if (ctx->properties.get_arrow_extensions_enabled() && node.logical_type()->is_variant()) { - ARROW_ASSIGN_OR_RAISE(struct_type, VariantExtensionType::Make(struct_type)); + auto extension_type = ::arrow::GetExtensionType("parquet.variant"); + if (extension_type) { + ARROW_ASSIGN_OR_RAISE( + struct_type, + extension_type->Deserialize(std::move(struct_type), /*serialized_data=*/"")); + } } out->field = ::arrow::field(node.name(), struct_type, node.is_optional(), FieldIdMetadata(node.field_id())); From 9940c6405d6229e6b2cff666fea7235dcc8c188d Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Mon, 25 Aug 2025 17:43:50 +0800 Subject: [PATCH 3/3] Update cpp/src/parquet/arrow/arrow_schema_test.cc Co-authored-by: Antoine Pitrou --- cpp/src/parquet/arrow/arrow_schema_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 24b8a5be78c..e1a44251495 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -985,8 +985,8 @@ TEST_F(TestConvertParquetSchema, ParquetVariant) { : ::arrow::DataTypeVector{}); // Parquet file does contain Arrow schema. - // Field should be interpreted as Parquet Variant extension even though - // extensions are not enabled. + // Field should be interpreted as Parquet Variant extension, if registered, + // even though extensions are not enabled. ArrowReaderProperties props; props.set_arrow_extensions_enabled(false);