diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 390d8bc77ef..e1a44251495 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -25,6 +25,7 @@ #include "parquet/arrow/reader.h" #include "parquet/arrow/reader_internal.h" #include "parquet/arrow/schema.h" +#include "parquet/arrow/variant_internal.h" #include "parquet/file_reader.h" #include "parquet/schema.h" #include "parquet/schema_internal.h" @@ -941,46 +942,61 @@ TEST_F(TestConvertParquetSchema, ParquetVariant) { PrimitiveNode::Make("metadata", Repetition::REQUIRED, ParquetType::BYTE_ARRAY); auto value = PrimitiveNode::Make("value", Repetition::REQUIRED, ParquetType::BYTE_ARRAY); - - auto variant = - GroupNode::Make("variant_unshredded", Repetition::OPTIONAL, {metadata, value}); + auto variant = GroupNode::Make("variant_unshredded", Repetition::OPTIONAL, + {metadata, value}, LogicalType::Variant()); parquet_fields.push_back(variant); - { - // Test converting from parquet schema to arrow schema. - std::vector> arrow_fields; - auto arrow_metadata = - ::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false); - auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false); - auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value}); - arrow_fields.push_back( - ::arrow::field("variant_unshredded", arrow_variant, /*nullable=*/true)); - auto arrow_schema = ::arrow::schema(arrow_fields); + // Arrow schema for unshredded variant struct. + auto arrow_metadata = ::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false); + auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false); + auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value}); + auto variant_extension = std::make_shared(arrow_variant); + { + // Parquet file does not contain Arrow schema. + // By default, field should be treated as a normal struct in Arrow. + auto arrow_schema = + ::arrow::schema({::arrow::field("variant_unshredded", arrow_variant)}); ASSERT_OK(ConvertSchema(parquet_fields)); - ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema, /*check_metadata=*/true)); } - { - // Test converting from parquet schema to arrow schema even though - // extensions are not enabled. + for (bool register_extension : {true, false}) { + ::arrow::ExtensionTypeGuard guard(register_extension + ? ::arrow::DataTypeVector{variant_extension} + : ::arrow::DataTypeVector{}); + + // Parquet file does not contain Arrow schema. + // If Arrow extensions are enabled, field should be interpreted as Parquet Variant + // extension type if registered. + ArrowReaderProperties props; + props.set_arrow_extensions_enabled(true); + + auto arrow_schema = ::arrow::schema({::arrow::field( + "variant_unshredded", register_extension ? variant_extension : arrow_variant)}); + + ASSERT_OK(ConvertSchema(parquet_fields, /*metadata=*/nullptr, props)); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema, /*check_metadata=*/true)); + } + + for (bool register_extension : {true, false}) { + ::arrow::ExtensionTypeGuard guard(register_extension + ? ::arrow::DataTypeVector{variant_extension} + : ::arrow::DataTypeVector{}); + + // Parquet file does contain Arrow schema. + // Field should be interpreted as Parquet Variant extension, if registered, + // even though extensions are not enabled. ArrowReaderProperties props; props.set_arrow_extensions_enabled(false); - // Test converting from parquet schema to arrow schema. - std::vector> arrow_fields; - auto arrow_metadata = - ::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false); - auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false); - auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value}); - arrow_fields.push_back( - ::arrow::field("variant_unshredded", arrow_variant, /*nullable=*/true)); - auto arrow_schema = ::arrow::schema(arrow_fields); + auto arrow_schema = ::arrow::schema({::arrow::field( + "variant_unshredded", register_extension ? variant_extension : arrow_variant)}); std::shared_ptr metadata; ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata)); ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); - CheckFlatSchema(arrow_schema, true /* check_metadata */); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema, /*check_metadata=*/true)); } } diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index cd026b3bea8..f0dc45b7851 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -591,6 +591,15 @@ Status GroupToStruct(const GroupNode& node, LevelInfo current_levels, arrow_fields.push_back(out->children[i].field); } auto struct_type = ::arrow::struct_(arrow_fields); + if (ctx->properties.get_arrow_extensions_enabled() && + node.logical_type()->is_variant()) { + auto extension_type = ::arrow::GetExtensionType("parquet.variant"); + if (extension_type) { + ARROW_ASSIGN_OR_RAISE( + struct_type, + extension_type->Deserialize(std::move(struct_type), /*serialized_data=*/"")); + } + } out->field = ::arrow::field(node.name(), struct_type, node.is_optional(), FieldIdMetadata(node.field_id())); out->level_info = current_levels; diff --git a/cpp/src/parquet/schema_test.cc b/cpp/src/parquet/schema_test.cc index 9c155e1763d..c33e5ccf4a5 100644 --- a/cpp/src/parquet/schema_test.cc +++ b/cpp/src/parquet/schema_test.cc @@ -1251,7 +1251,7 @@ TEST(TestLogicalTypeOperation, LogicalTypeProperties) { {BSONLogicalType::Make(), false, true, true}, {UUIDLogicalType::Make(), false, true, true}, {Float16LogicalType::Make(), false, true, true}, - {VariantLogicalType::Make(), false, true, true}, + {VariantLogicalType::Make(), true, true, true}, {NoLogicalType::Make(), false, false, true}, }; diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index e54683084cf..4d6691c0aea 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -835,8 +835,9 @@ bool LogicalType::is_valid() const { } bool LogicalType::is_invalid() const { return !is_valid(); } bool LogicalType::is_nested() const { - return (impl_->type() == LogicalType::Type::LIST) || - (impl_->type() == LogicalType::Type::MAP); + return impl_->type() == LogicalType::Type::LIST || + impl_->type() == LogicalType::Type::MAP || + impl_->type() == LogicalType::Type::VARIANT; } bool LogicalType::is_nonnested() const { return !is_nested(); } bool LogicalType::is_serialized() const { return impl_->is_serialized(); }