-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Support parquet canonical extension type roundtrip #8409
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,6 +24,7 @@ | |
| //! with the key "ARROW:extension:name". | ||
|
|
||
| use crate::basic::LogicalType; | ||
| use crate::errors::ParquetError; | ||
| use crate::schema::types::Type; | ||
| use arrow_schema::extension::ExtensionType; | ||
| use arrow_schema::Field; | ||
|
|
@@ -34,23 +35,51 @@ use arrow_schema::Field; | |
| /// Some Parquet logical types, such as Variant, do not map directly to an | ||
| /// Arrow DataType, and instead are represented by an Arrow ExtensionType. | ||
| /// Extension types are attached to Arrow Fields via metadata. | ||
| pub(crate) fn add_extension_type(mut arrow_field: Field, parquet_type: &Type) -> Field { | ||
| match parquet_type.get_basic_info().logical_type() { | ||
| pub(crate) fn try_add_extension_type( | ||
| mut arrow_field: Field, | ||
| parquet_type: &Type, | ||
| ) -> Result<Field, ParquetError> { | ||
| let Some(parquet_logical_type) = parquet_type.get_basic_info().logical_type() else { | ||
| return Ok(arrow_field); | ||
| }; | ||
| match parquet_logical_type { | ||
| #[cfg(feature = "variant_experimental")] | ||
| Some(LogicalType::Variant) => { | ||
| // try to add the Variant extension type, but if that fails (e.g. because the | ||
| // storage type is not supported), just return the field as is | ||
| arrow_field | ||
| .try_with_extension_type(parquet_variant_compute::VariantType) | ||
| .ok(); | ||
| arrow_field | ||
| LogicalType::Variant => { | ||
| arrow_field.try_with_extension_type(parquet_variant_compute::VariantType)?; | ||
| } | ||
| // TODO add other LogicalTypes here | ||
| _ => arrow_field, | ||
| #[cfg(feature = "arrow_canonical_extension_types")] | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The core change of this PR is moving the code that handles extension types from the |
||
| LogicalType::Uuid => { | ||
| arrow_field.try_with_extension_type(arrow_schema::extension::Uuid)?; | ||
| } | ||
| #[cfg(feature = "arrow_canonical_extension_types")] | ||
| LogicalType::Json => { | ||
| arrow_field.try_with_extension_type(arrow_schema::extension::Json::default())?; | ||
| } | ||
| _ => {} | ||
| }; | ||
| Ok(arrow_field) | ||
| } | ||
|
|
||
| /// Returns true if [`try_add_extension_type`] would add an extension type | ||
| /// to the specified Parquet field. | ||
| /// | ||
| /// This is used to preallocate the metadata hashmap size | ||
| pub(crate) fn has_extension_type(parquet_type: &Type) -> bool { | ||
| let Some(parquet_logical_type) = parquet_type.get_basic_info().logical_type() else { | ||
| return false; | ||
| }; | ||
| match parquet_logical_type { | ||
| #[cfg(feature = "variant_experimental")] | ||
| LogicalType::Variant => true, | ||
| #[cfg(feature = "arrow_canonical_extension_types")] | ||
| LogicalType::Uuid => true, | ||
| #[cfg(feature = "arrow_canonical_extension_types")] | ||
| LogicalType::Json => true, | ||
| _ => false, | ||
| } | ||
| } | ||
|
|
||
| /// Return the Parquet logical type to use for the specified Arrow field, if any. | ||
| /// Return the Parquet logical type to use for the specified Arrow Struct field, if any. | ||
| #[cfg(feature = "variant_experimental")] | ||
| pub(crate) fn logical_type_for_struct(field: &Field) -> Option<LogicalType> { | ||
| use parquet_variant_compute::VariantType; | ||
|
|
@@ -67,6 +96,38 @@ pub(crate) fn logical_type_for_struct(field: &Field) -> Option<LogicalType> { | |
| } | ||
|
|
||
| #[cfg(not(feature = "variant_experimental"))] | ||
| pub(crate) fn logical_type_for_struct(field: &Field) -> Option<LogicalType> { | ||
| pub(crate) fn logical_type_for_struct(_field: &Field) -> Option<LogicalType> { | ||
| None | ||
| } | ||
|
|
||
| /// Return the Parquet logical type to use for the specified Arrow fixed size binary field, if any. | ||
| #[cfg(feature = "arrow_canonical_extension_types")] | ||
| pub(crate) fn logical_type_for_fixed_size_binary(field: &Field) -> Option<LogicalType> { | ||
| use arrow_schema::extension::Uuid; | ||
| // If set, map arrow uuid extension type to parquet uuid logical type. | ||
| field | ||
| .try_extension_type::<Uuid>() | ||
| .ok() | ||
| .map(|_| LogicalType::Uuid) | ||
| } | ||
|
|
||
| #[cfg(not(feature = "arrow_canonical_extension_types"))] | ||
| pub(crate) fn logical_type_for_fixed_size_binary(_field: &Field) -> Option<LogicalType> { | ||
| None | ||
| } | ||
|
|
||
| /// Return the Parquet logical type to use for the specified Arrow string field (Utf8, LargeUtf8) if any | ||
| #[cfg(feature = "arrow_canonical_extension_types")] | ||
| pub(crate) fn logical_type_for_string(field: &Field) -> Option<LogicalType> { | ||
| use arrow_schema::extension::Json; | ||
| // Use the Json logical type if the canonical Json | ||
| // extension type is set on this field. | ||
| field | ||
| .try_extension_type::<Json>() | ||
| .map_or(Some(LogicalType::String), |_| Some(LogicalType::Json)) | ||
| } | ||
|
|
||
| #[cfg(not(feature = "arrow_canonical_extension_types"))] | ||
| pub(crate) fn logical_type_for_string(_field: &Field) -> Option<LogicalType> { | ||
| Some(LogicalType::String) | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,8 +23,6 @@ use std::collections::HashMap; | |
| use std::sync::Arc; | ||
|
|
||
| use arrow_ipc::writer; | ||
| #[cfg(feature = "arrow_canonical_extension_types")] | ||
| use arrow_schema::extension::{Json, Uuid}; | ||
| use arrow_schema::{DataType, Field, Fields, Schema, TimeUnit}; | ||
|
|
||
| use crate::basic::{ | ||
|
|
@@ -39,7 +37,10 @@ mod extension; | |
| mod primitive; | ||
|
|
||
| use super::PARQUET_FIELD_ID_META_KEY; | ||
| use crate::arrow::schema::extension::logical_type_for_struct; | ||
| use crate::arrow::schema::extension::{ | ||
| has_extension_type, logical_type_for_fixed_size_binary, logical_type_for_string, | ||
| logical_type_for_struct, try_add_extension_type, | ||
| }; | ||
| use crate::arrow::ProjectionMask; | ||
| pub(crate) use complex::{ParquetField, ParquetFieldType}; | ||
|
|
||
|
|
@@ -390,31 +391,27 @@ pub fn parquet_to_arrow_field(parquet_column: &ColumnDescriptor) -> Result<Field | |
| let field = complex::convert_type(&parquet_column.self_type_ptr())?; | ||
| let mut ret = Field::new(parquet_column.name(), field.arrow_type, field.nullable); | ||
|
|
||
| let basic_info = parquet_column.self_type().get_basic_info(); | ||
| let mut meta = HashMap::with_capacity(if cfg!(feature = "arrow_canonical_extension_types") { | ||
| 2 | ||
| } else { | ||
| 1 | ||
| }); | ||
| let parquet_type = parquet_column.self_type(); | ||
| let basic_info = parquet_type.get_basic_info(); | ||
|
|
||
| let mut hash_map_size = 0; | ||
| if basic_info.has_id() { | ||
| hash_map_size += 1; | ||
| } | ||
| if has_extension_type(parquet_type) { | ||
| hash_map_size += 1; | ||
| } | ||
| if hash_map_size == 0 { | ||
| return Ok(ret); | ||
| } | ||
| ret.set_metadata(HashMap::with_capacity(hash_map_size)); | ||
| if basic_info.has_id() { | ||
| meta.insert( | ||
| ret.metadata_mut().insert( | ||
| PARQUET_FIELD_ID_META_KEY.to_string(), | ||
| basic_info.id().to_string(), | ||
| ); | ||
| } | ||
| #[cfg(feature = "arrow_canonical_extension_types")] | ||
| if let Some(logical_type) = basic_info.logical_type() { | ||
| match logical_type { | ||
| LogicalType::Uuid => ret.try_with_extension_type(Uuid)?, | ||
| LogicalType::Json => ret.try_with_extension_type(Json::default())?, | ||
| _ => {} | ||
| } | ||
| } | ||
| if !meta.is_empty() { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is the actual fix for #7063. The existing code has a very subtle bug -- by calling |
||
| ret.set_metadata(meta); | ||
| } | ||
|
|
||
| Ok(ret) | ||
| try_add_extension_type(ret, parquet_column.self_type()) | ||
| } | ||
|
|
||
| pub fn decimal_length_from_precision(precision: u8) -> usize { | ||
|
|
@@ -618,16 +615,7 @@ fn arrow_to_parquet_type(field: &Field, coerce_types: bool) -> Result<Type> { | |
| .with_repetition(repetition) | ||
| .with_id(id) | ||
| .with_length(*length) | ||
| .with_logical_type( | ||
| #[cfg(feature = "arrow_canonical_extension_types")] | ||
| // If set, map arrow uuid extension type to parquet uuid logical type. | ||
| field | ||
| .try_extension_type::<Uuid>() | ||
| .ok() | ||
| .map(|_| LogicalType::Uuid), | ||
| #[cfg(not(feature = "arrow_canonical_extension_types"))] | ||
| None, | ||
| ) | ||
| .with_logical_type(logical_type_for_fixed_size_binary(field)) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. this code is just moved into the extension module |
||
| .build() | ||
| } | ||
| DataType::BinaryView => Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY) | ||
|
|
@@ -664,35 +652,13 @@ fn arrow_to_parquet_type(field: &Field, coerce_types: bool) -> Result<Type> { | |
| } | ||
| DataType::Utf8 | DataType::LargeUtf8 => { | ||
| Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY) | ||
| .with_logical_type({ | ||
| #[cfg(feature = "arrow_canonical_extension_types")] | ||
| { | ||
| // Use the Json logical type if the canonical Json | ||
| // extension type is set on this field. | ||
| field | ||
| .try_extension_type::<Json>() | ||
| .map_or(Some(LogicalType::String), |_| Some(LogicalType::Json)) | ||
| } | ||
| #[cfg(not(feature = "arrow_canonical_extension_types"))] | ||
| Some(LogicalType::String) | ||
| }) | ||
| .with_logical_type(logical_type_for_string(field)) | ||
| .with_repetition(repetition) | ||
| .with_id(id) | ||
| .build() | ||
| } | ||
| DataType::Utf8View => Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY) | ||
| .with_logical_type({ | ||
| #[cfg(feature = "arrow_canonical_extension_types")] | ||
| { | ||
| // Use the Json logical type if the canonical Json | ||
| // extension type is set on this field. | ||
| field | ||
| .try_extension_type::<Json>() | ||
| .map_or(Some(LogicalType::String), |_| Some(LogicalType::Json)) | ||
| } | ||
| #[cfg(not(feature = "arrow_canonical_extension_types"))] | ||
| Some(LogicalType::String) | ||
| }) | ||
| .with_logical_type(logical_type_for_string(field)) | ||
| .with_repetition(repetition) | ||
| .with_id(id) | ||
| .build(), | ||
|
|
@@ -797,15 +763,14 @@ mod tests { | |
|
|
||
| use std::{collections::HashMap, sync::Arc}; | ||
|
|
||
| use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; | ||
|
|
||
| use crate::arrow::PARQUET_FIELD_ID_META_KEY; | ||
| use crate::file::metadata::KeyValue; | ||
| use crate::file::reader::FileReader; | ||
| use crate::{ | ||
| arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter}, | ||
| schema::{parser::parse_message_type, types::SchemaDescriptor}, | ||
| }; | ||
| use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; | ||
|
|
||
| #[test] | ||
| fn test_flat_primitives() { | ||
|
|
@@ -844,12 +809,26 @@ mod tests { | |
| Field::new("float16", DataType::Float16, true), | ||
| Field::new("string", DataType::Utf8, true), | ||
| Field::new("string_2", DataType::Utf8, true), | ||
| Field::new("json", DataType::Utf8, true), | ||
| json_field(), | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. this now actually fails when the canonical extension types are enabled, because a JSON parquet field is now (correctly) annotated with the extension type field |
||
| ]); | ||
|
|
||
| assert_eq!(&arrow_fields, converted_arrow_schema.fields()); | ||
| } | ||
|
|
||
| /// Return the expected Field for a Parquet column annotated with | ||
| /// the JSON logical type. | ||
| fn json_field() -> Field { | ||
| #[cfg(feature = "arrow_canonical_extension_types")] | ||
| { | ||
| Field::new("json", DataType::Utf8, true) | ||
| .with_extension_type(arrow_schema::extension::Json::default()) | ||
| } | ||
| #[cfg(not(feature = "arrow_canonical_extension_types"))] | ||
| { | ||
| Field::new("json", DataType::Utf8, true) | ||
| } | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_decimal_fields() { | ||
| let message_type = " | ||
|
|
@@ -2233,6 +2212,7 @@ mod tests { | |
| #[test] | ||
| #[cfg(feature = "arrow_canonical_extension_types")] | ||
| fn arrow_uuid_to_parquet_uuid() -> Result<()> { | ||
| use arrow_schema::extension::Uuid; | ||
| let arrow_schema = Schema::new(vec![Field::new( | ||
| "uuid", | ||
| DataType::FixedSizeBinary(16), | ||
|
|
@@ -2247,16 +2227,16 @@ mod tests { | |
| Some(LogicalType::Uuid) | ||
| ); | ||
|
|
||
| // TODO: roundtrip | ||
| // let arrow_schema = parquet_to_arrow_schema(&parquet_schema, None)?; | ||
| // assert_eq!(arrow_schema.field(0).try_extension_type::<Uuid>()?, Uuid); | ||
| let arrow_schema = parquet_to_arrow_schema(&parquet_schema, None)?; | ||
| assert_eq!(arrow_schema.field(0).try_extension_type::<Uuid>()?, Uuid); | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| #[cfg(feature = "arrow_canonical_extension_types")] | ||
| fn arrow_json_to_parquet_json() -> Result<()> { | ||
| use arrow_schema::extension::Json; | ||
| let arrow_schema = Schema::new(vec![ | ||
| Field::new("json", DataType::Utf8, false).with_extension_type(Json::default()) | ||
| ]); | ||
|
|
@@ -2268,13 +2248,11 @@ mod tests { | |
| Some(LogicalType::Json) | ||
| ); | ||
|
|
||
| // TODO: roundtrip | ||
| // https://github.com/apache/arrow-rs/issues/7063 | ||
| // let arrow_schema = parquet_to_arrow_schema(&parquet_schema, None)?; | ||
| // assert_eq!( | ||
| // arrow_schema.field(0).try_extension_type::<Json>()?, | ||
| // Json::default() | ||
| // ); | ||
| let arrow_schema = parquet_to_arrow_schema(&parquet_schema, None)?; | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. it works! |
||
| assert_eq!( | ||
| arrow_schema.field(0).try_extension_type::<Json>()?, | ||
| Json::default() | ||
| ); | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The build flags here are definitely the shortest path to the roundtrip you're aiming for...you could also consider an injection approach like:
...and maintain a registry of those in the reader/writer options. Then you don't need compile time flags to support the extensions (something like DataFusion or a derivative could wire it all together at runtime).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That is a good idea -- I think @scovich was discussing a registry-type approach as well recently. I'll file a ticket to discuss the idea further.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I filed a ticket to track this idea:
In my mind, while the build flag approach in this PR is not ideal, it is no worse than what is on `main` today, though other people may disagree.