From a365b612cf6bf91ee6ded03471f0fc9735cb8d69 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 19 Mar 2025 22:54:37 -0500 Subject: [PATCH 01/27] add uuid as opt in --- cpp/src/parquet/arrow/schema_internal.cc | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index 261a0094065..af18eeae81e 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -18,6 +18,7 @@ #include "parquet/arrow/schema_internal.h" #include "arrow/extension/json.h" +#include "arrow/extension/uuid.h" #include "arrow/type.h" #include "parquet/properties.h" @@ -134,8 +135,9 @@ Result> FromByteArray( } } -Result> FromFLBA(const LogicalType& logical_type, - int32_t physical_length) { +Result> FromFLBA( + const LogicalType& logical_type, int32_t physical_length, + const ArrowReaderProperties& reader_properties) { switch (logical_type.type()) { case LogicalType::Type::DECIMAL: return MakeArrowDecimal(logical_type); @@ -143,7 +145,12 @@ Result> FromFLBA(const LogicalType& logical_type, return ::arrow::float16(); case LogicalType::Type::NONE: case LogicalType::Type::INTERVAL: + return ::arrow::fixed_size_binary(physical_length); case LogicalType::Type::UUID: + if (reader_properties.get_arrow_extensions_enabled()) { + return ::arrow::extension::uuid(); + } + return ::arrow::fixed_size_binary(physical_length); default: return Status::NotImplemented("Unhandled logical logical_type ", @@ -211,7 +218,7 @@ Result> GetArrowType( case ParquetType::BYTE_ARRAY: return FromByteArray(logical_type, reader_properties); case ParquetType::FIXED_LEN_BYTE_ARRAY: - return FromFLBA(logical_type, type_length); + return FromFLBA(logical_type, type_length, reader_properties); default: { // PARQUET-1565: This can occur if the file is corrupt return Status::IOError("Invalid physical column type: ", From 4cee020f7a33b017202bcaa541085facff2e9647 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 19 Mar 2025 23:14:12 -0500 Subject: [PATCH 02/27] convert in other direction --- cpp/src/parquet/arrow/schema.cc | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 2f30bea61e4..bf264d6826c 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -22,6 +22,7 @@ #include #include "arrow/extension/json.h" +#include "arrow/extension/uuid.h" #include "arrow/extension_type.h" #include "arrow/io/memory.h" #include "arrow/ipc/api.h" @@ -1053,6 +1054,31 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer // inferred_type is arrow::extension::json(arrow::utf8()) auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); + // Apply metadata recursively to storage type + RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); + inferred->field = inferred->field->WithType(origin_type); + } else if (inferred_type->id() != ::arrow::Type::EXTENSION && + ex_type.extension_name() == std::string("arrow.uuid") && + // TODO(paleolimbot) byte size == 16 + inferred_type->id() == ::arrow::Type::FIXED_SIZE_BINARY) { + // Schema mismatch. + // + // Arrow extensions are DISABLED in Parquet. + // origin_type is ::arrow::extension::json() + // inferred_type is ::arrow::utf8() + // + // Origin type is restored as Arrow should be considered the source of truth. + inferred->field = inferred->field->WithType(origin_type); + RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred)); + } else if (inferred_type->id() == ::arrow::Type::EXTENSION && + ex_type.extension_name() == std::string("arrow.uuid")) { + // Potential schema mismatch. + // + // Arrow extensions are ENABLED in Parquet. + // origin_type is arrow::extension::json(...) + // inferred_type is arrow::extension::json(arrow::utf8()) + auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); + // Apply metadata recursively to storage type RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); inferred->field = inferred->field->WithType(origin_type); From 3d1a4ece68a97b59aa1396f492899da55fa74940 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 20 Mar 2025 09:38:07 -0500 Subject: [PATCH 03/27] one more convert change --- cpp/src/parquet/arrow/schema.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index bf264d6826c..1d967ec8106 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -435,7 +435,13 @@ Status FieldToNode(const std::string& name, const std::shared_ptr& field, type = ParquetType::BYTE_ARRAY; logical_type = LogicalType::JSON(); break; + } else if (ext_type->extension_name() == std::string("arrow.uuid")) { + type = ParquetType::FIXED_LEN_BYTE_ARRAY; + logical_type = LogicalType::UUID(); + length = 16; + break; } + std::shared_ptr<::arrow::Field> storage_field = ::arrow::field( name, ext_type->storage_type(), field->nullable(), field->metadata()); return FieldToNode(name, storage_field, properties, arrow_properties, out); From dc696cfcf5f95955b98c1bbb2a54250f032871ca Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 24 Mar 2025 10:08:48 -0500 Subject: [PATCH 04/27] enable arrow_extensions_enabled in pyarrow --- cpp/src/arrow/dataset/file_parquet.cc | 2 ++ python/pyarrow/_dataset_parquet.pyx | 25 ++++++++++++++++++++----- python/pyarrow/_parquet.pxd | 2 ++ python/pyarrow/_parquet.pyx | 6 +++++- python/pyarrow/parquet/core.py | 20 +++++++++++++++++--- 5 files changed, 46 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index cf51ea18d7a..6693aa4cca4 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -133,6 +133,8 @@ parquet::ArrowReaderProperties MakeArrowReaderProperties( arrow_properties.set_io_context( parquet_scan_options.arrow_reader_properties->io_context()); arrow_properties.set_use_threads(options.use_threads); + arrow_properties.set_arrow_extensions_enabled( + parquet_scan_options.arrow_reader_properties->get_arrow_extensions_enabled()); return arrow_properties; } diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 45db755d903..a2f248e6e9f 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -703,7 +703,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): cache_options : pyarrow.CacheOptions, default None Cache options used when pre_buffer is enabled. The default values should be good for most use cases. You may want to adjust these for example if - you have exceptionally high latency to the file system. + you have exceptionally high latency to the file system. thrift_string_size_limit : int, default None If not None, override the maximum total string size allocated when decoding Thrift structures. The default limit should be @@ -720,6 +720,9 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): Parquet file. page_checksum_verification : bool, default False If True, verify the page checksum for each page read from the file. + arrow_extensions_enabled : bool, default False + If True, read Parquet logical types as Arrow Extension Types where possible, + (e.g., JSON as arrow.json or UUID as arrow.uuid). """ # Avoid mistakingly creating attributes @@ -733,7 +736,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): thrift_container_size_limit=None, decryption_config=None, decryption_properties=None, - bint page_checksum_verification=False): + bint page_checksum_verification=False, + bint arrow_extensions_enabled=False): self.init(shared_ptr[CFragmentScanOptions]( new CParquetFragmentScanOptions())) self.use_buffered_stream = use_buffered_stream @@ -752,6 +756,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): if decryption_properties is not None: self.decryption_properties = decryption_properties self.page_checksum_verification = page_checksum_verification + self.arrow_extensions_enabled = arrow_extensions_enabled cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp): FragmentScanOptions.init(self, sp) @@ -868,6 +873,14 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): def page_checksum_verification(self, bint page_checksum_verification): self.reader_properties().set_page_checksum_verification(page_checksum_verification) + @property + def arrow_extensions_enabled(self): + return self.arrow_reader_properties().get_arrow_extensions_enabled() + + @arrow_extensions_enabled.setter + def arrow_extensions_enabled(self, bint arrow_extensions_enabled): + self.arrow_reader_properties().set_arrow_extensions_enabled(arrow_extensions_enabled) + def equals(self, ParquetFragmentScanOptions other): """ Parameters @@ -881,11 +894,12 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): attrs = ( self.use_buffered_stream, self.buffer_size, self.pre_buffer, self.cache_options, self.thrift_string_size_limit, self.thrift_container_size_limit, - self.page_checksum_verification) + self.page_checksum_verification, self.arrow_extensions_enabled) other_attrs = ( other.use_buffered_stream, other.buffer_size, other.pre_buffer, other.cache_options, other.thrift_string_size_limit, - other.thrift_container_size_limit, other.page_checksum_verification) + other.thrift_container_size_limit, other.page_checksum_verification, + other.arrow_extensions_enabled) return attrs == other_attrs @staticmethod @@ -902,7 +916,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): cache_options=self.cache_options, thrift_string_size_limit=self.thrift_string_size_limit, thrift_container_size_limit=self.thrift_container_size_limit, - page_checksum_verification=self.page_checksum_verification + page_checksum_verification=self.page_checksum_verification, + arrow_extensions_enabled=self.arrow_extensions_enabled ) return ParquetFragmentScanOptions._reconstruct, (kwargs,) diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 1e3c89e4e72..b8c2f2886e2 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -405,6 +405,8 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: CCacheOptions cache_options() const void set_coerce_int96_timestamp_unit(TimeUnit unit) TimeUnit coerce_int96_timestamp_unit() const + void set_arrow_extensions_enabled(c_bool extensions_enabled) + c_bool get_arrow_extensions_enabled() const ArrowReaderProperties default_arrow_reader_properties() diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 6bc77ed795f..9e5c0c87dc0 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1454,7 +1454,8 @@ cdef class ParquetReader(_Weakrefable): FileDecryptionProperties decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, - page_checksum_verification=False): + page_checksum_verification=False, + arrow_extensions_enabled=False): """ Open a parquet file for reading. @@ -1471,6 +1472,7 @@ cdef class ParquetReader(_Weakrefable): thrift_string_size_limit : int, optional thrift_container_size_limit : int, optional page_checksum_verification : bool, default False + arrow_extensions_enabled : bool, default False """ cdef: shared_ptr[CFileMetaData] c_metadata @@ -1520,6 +1522,8 @@ cdef class ParquetReader(_Weakrefable): arrow_props.set_coerce_int96_timestamp_unit( string_to_timeunit(coerce_int96_timestamp_unit)) + arrow_props.set_arrow_extensions_enabled(arrow_extensions_enabled) + self.source = source get_reader(source, use_memory_map, &self.rd_handle) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 8d3dec96a6b..3d78bce0b6a 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -254,6 +254,9 @@ class ParquetFile: it will be parsed as an URI to determine the filesystem. page_checksum_verification : bool, default False If True, verify the checksum for each page read from the file. + arrow_extensions_enabled : bool, default False + If True, read Parquet logical types as Arrow Extension Types where possible, + (e.g., JSON as arrow.json or UUID as arrow.uuid). Examples -------- @@ -302,7 +305,7 @@ def __init__(self, source, *, metadata=None, common_metadata=None, pre_buffer=False, coerce_int96_timestamp_unit=None, decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, filesystem=None, - page_checksum_verification=False): + page_checksum_verification=False, arrow_extensions_enabled=False): self._close_source = getattr(source, 'closed', True) @@ -322,6 +325,7 @@ def __init__(self, source, *, metadata=None, common_metadata=None, thrift_string_size_limit=thrift_string_size_limit, thrift_container_size_limit=thrift_container_size_limit, page_checksum_verification=page_checksum_verification, + arrow_extensions_enabled=arrow_extensions_enabled, ) self.common_metadata = common_metadata self._nested_paths_by_prefix = self._build_nested_paths() @@ -1264,6 +1268,9 @@ class ParquetDataset: sufficient for most Parquet files. page_checksum_verification : bool, default False If True, verify the page checksum for each page read from the file. +arrow_extensions_enabled : bool, default False + If True, read Parquet logical types as Arrow Extension Types where possible, + (e.g., JSON as arrow.json or UUID as arrow.uuid). Examples -------- @@ -1276,7 +1283,8 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None, coerce_int96_timestamp_unit=None, decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, - page_checksum_verification=False): + page_checksum_verification=False, + arrow_extensions_enabled=False): import pyarrow.dataset as ds @@ -1287,6 +1295,7 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None, "thrift_string_size_limit": thrift_string_size_limit, "thrift_container_size_limit": thrift_container_size_limit, "page_checksum_verification": page_checksum_verification, + "arrow_extensions_enabled": arrow_extensions_enabled, } if buffer_size: read_options.update(use_buffered_stream=True, @@ -1674,6 +1683,9 @@ def partitioning(self): sufficient for most Parquet files. page_checksum_verification : bool, default False If True, verify the checksum for each page read from the file. +arrow_extensions_enabled : bool, default False + If True, read Parquet logical types as Arrow Extension Types where possible, + (e.g., JSON as arrow.json or UUID as arrow.uuid). Returns ------- @@ -1768,7 +1780,8 @@ def read_table(source, *, columns=None, use_threads=True, pre_buffer=True, coerce_int96_timestamp_unit=None, decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, - page_checksum_verification=False): + page_checksum_verification=False, + arrow_extensions_enabled=False): try: dataset = ParquetDataset( @@ -1787,6 +1800,7 @@ def read_table(source, *, columns=None, use_threads=True, thrift_string_size_limit=thrift_string_size_limit, thrift_container_size_limit=thrift_container_size_limit, page_checksum_verification=page_checksum_verification, + arrow_extensions_enabled=arrow_extensions_enabled, ) except ImportError: # fall back on ParquetFile for simple cases when pyarrow.dataset From 300f5eb22052aaa70c01b44dadeda0cf33e78d47 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 24 Mar 2025 10:29:23 -0500 Subject: [PATCH 05/27] Add Python test --- .../pyarrow/tests/parquet/test_data_types.py | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index 1428f802397..5b4dd178fe5 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -520,4 +520,37 @@ def test_json_extension_type(storage_type): table = pa.table([arr], names=["ext"]) - _simple_table_roundtrip(table) + # With defaults, this should roundtrip (because store_schema=True) + _check_roundtrip(table, table) + + # When store_schema is False, we get a string back by default + _check_roundtrip( + table, + pa.table({"ext": pa.array(data, pa.string())}), + store_schema=False) + + # With arrow_extensions_enabled=True on read, we get a arrow.uuid back + # (but with string() storage) + _check_roundtrip( + table, + pa.table({"ext": pa.array(data, pa.json_(pa.string()))}), + read_table_kwargs={"arrow_extensions_enabled": True}, + store_schema=False) + + +def test_uuid_extension_type(): + data = [ + b'\xe4`\xf9p\x83QGN\xac\x7f\xa4g>K\xa8\xcb', + b'\x1et\x14\x95\xee\xd5C\xea\x9b\xd7s\xdc\x91BK\xaf', + None + ] + arr = pa.array(data, type=pa.uuid()) + + table = pa.table([arr], names=["ext"]) + + _check_roundtrip(table, table) + _check_roundtrip( + table, + pa.table({"ext": pa.array(data, pa.binary(16))}), + store_schema=False) + _check_roundtrip(table, table, {"arrow_extensions_enabled": True}, store_schema=False) From af8227cb7c9946cc7821fc2e6c2aea6bc1878a7f Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 24 Mar 2025 10:46:09 -0500 Subject: [PATCH 06/27] Add C++ test --- cpp/src/parquet/arrow/arrow_schema_test.cc | 65 +++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 535efa0c8e5..5ccbe6c6fb0 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -33,6 +33,7 @@ #include "arrow/array.h" #include "arrow/extension/json.h" +#include "arrow/extension/uuid.h" #include "arrow/ipc/writer.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" @@ -866,7 +867,7 @@ Status ArrowSchemaToParquetMetadata(std::shared_ptr<::arrow::Schema>& arrow_sche return Status::OK(); } -TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { +TEST_F(TestConvertParquetSchema, ParquetSchemaArrowJsonExtension) { std::vector parquet_fields; parquet_fields.push_back(PrimitiveNode::Make( "json_1", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, ConvertedType::JSON)); @@ -948,6 +949,68 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { } } +TEST_F(TestConvertParquetSchema, ParquetSchemaArrowUuidExtension) { + std::vector parquet_fields; + parquet_fields.push_back(PrimitiveNode::Make("uuid", Repetition::OPTIONAL, + LogicalType::UUID(), + ParquetType::FIXED_LEN_BYTE_ARRAY, 16)); + + { + // Parquet file does not contain Arrow schema. + // By default, field should be treated as fixed_size_binary(16) in Arrow. + auto arrow_schema = + ::arrow::schema({::arrow::field("uuid", ::arrow::fixed_size_binary(16), true)}); + std::shared_ptr metadata{}; + ASSERT_OK(ConvertSchema(parquet_fields, metadata)); + CheckFlatSchema(arrow_schema); + } + + { + // Parquet file does not contain Arrow schema. + // If Arrow extensions are enabled, field will be interpreted as uuid() + // extension field. + ArrowReaderProperties props; + props.set_arrow_extensions_enabled(true); + auto arrow_schema = + ::arrow::schema({::arrow::field("uuid", ::arrow::extension::uuid(), true)}); + std::shared_ptr metadata{}; + ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); + CheckFlatSchema(arrow_schema); + } + + { + // Parquet file contains Arrow schema. + // uuid will be interpreted as uuid() field even though extensions are not enabled. + ArrowReaderProperties props; + props.set_arrow_extensions_enabled(false); + std::shared_ptr field_metadata = + ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); + auto arrow_schema = ::arrow::schema( + {::arrow::field("uuid", ::arrow::extension::uuid(), true, field_metadata)}); + + std::shared_ptr metadata; + ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata)); + ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); + CheckFlatSchema(arrow_schema, true /* check_metadata */); + } + + { + // Parquet file contains Arrow schema. + // uuid will be interpreted as uuid() field even though extensions are not enabled. + ArrowReaderProperties props; + props.set_arrow_extensions_enabled(true); + std::shared_ptr field_metadata = + ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); + auto arrow_schema = ::arrow::schema( + {::arrow::field("uuid", ::arrow::extension::uuid(), true, field_metadata)}); + + std::shared_ptr metadata; + ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata)); + ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); + CheckFlatSchema(arrow_schema, true /* check_metadata */); + } +} + class TestConvertArrowSchema : public ::testing::Test { public: virtual void SetUp() {} From bd9c9a9d4c29392ac622969c5ac539e81bb923e2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 24 Mar 2025 10:46:54 -0500 Subject: [PATCH 07/27] format --- cpp/src/arrow/dataset/file_parquet.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 6693aa4cca4..d3c0b71733c 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -134,7 +134,7 @@ parquet::ArrowReaderProperties MakeArrowReaderProperties( parquet_scan_options.arrow_reader_properties->io_context()); arrow_properties.set_use_threads(options.use_threads); arrow_properties.set_arrow_extensions_enabled( - parquet_scan_options.arrow_reader_properties->get_arrow_extensions_enabled()); + parquet_scan_options.arrow_reader_properties->get_arrow_extensions_enabled()); return arrow_properties; } From f18758670ee002c9f9331cc2a353f6da06923944 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 24 Mar 2025 10:49:04 -0500 Subject: [PATCH 08/27] fix comment that was lying --- cpp/src/parquet/arrow/arrow_schema_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 5ccbe6c6fb0..013f733ae1a 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -980,7 +980,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowUuidExtension) { { // Parquet file contains Arrow schema. - // uuid will be interpreted as uuid() field even though extensions are not enabled. + // uuid will be interpreted as uuid() field ArrowReaderProperties props; props.set_arrow_extensions_enabled(false); std::shared_ptr field_metadata = From d11fcd8f6730826f08f8be94cc12b8f04ddce521 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 24 Mar 2025 11:05:24 -0500 Subject: [PATCH 09/27] clean up schema.cc --- cpp/src/parquet/arrow/schema.cc | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 1d967ec8106..ff6b9ce5129 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1063,26 +1063,24 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer // Apply metadata recursively to storage type RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); inferred->field = inferred->field->WithType(origin_type); - } else if (inferred_type->id() != ::arrow::Type::EXTENSION && - ex_type.extension_name() == std::string("arrow.uuid") && - // TODO(paleolimbot) byte size == 16 - inferred_type->id() == ::arrow::Type::FIXED_SIZE_BINARY) { + } else if (inferred_type->id() == ::arrow::Type::FIXED_SIZE_BINARY && + ex_type.extension_name() == std::string("arrow.uuid")) { // Schema mismatch. // // Arrow extensions are DISABLED in Parquet. - // origin_type is ::arrow::extension::json() - // inferred_type is ::arrow::utf8() + // origin_type is ::arrow::extension::uuid() + // inferred_type is ::arrow::fixed_size_binary() // // Origin type is restored as Arrow should be considered the source of truth. inferred->field = inferred->field->WithType(origin_type); RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred)); } else if (inferred_type->id() == ::arrow::Type::EXTENSION && ex_type.extension_name() == std::string("arrow.uuid")) { - // Potential schema mismatch. + // Schema match. // // Arrow extensions are ENABLED in Parquet. - // origin_type is arrow::extension::json(...) - // inferred_type is arrow::extension::json(arrow::utf8()) + // origin_type is arrow::extension::uuid() + // inferred_type is arrow::extension::uuid() auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); // Apply metadata recursively to storage type From f7a965ed92593fc872eb805232320acf92bb2e38 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 24 Mar 2025 11:05:57 -0500 Subject: [PATCH 10/27] fix python lint --- python/pyarrow/tests/parquet/test_data_types.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index 5b4dd178fe5..933ae85e570 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -553,4 +553,7 @@ def test_uuid_extension_type(): table, pa.table({"ext": pa.array(data, pa.binary(16))}), store_schema=False) - _check_roundtrip(table, table, {"arrow_extensions_enabled": True}, store_schema=False) + _check_roundtrip( + table, + table, + {"arrow_extensions_enabled": True}, store_schema=False) From b4d0730ca2f067139ca99180dd7404b40bd35553 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 31 Mar 2025 11:54:42 -0500 Subject: [PATCH 11/27] apply improvement to arrow_extensions_enabled --- python/pyarrow/_dataset_parquet.pyx | 5 +++-- python/pyarrow/parquet/core.py | 15 +++++++++------ python/pyarrow/tests/parquet/test_data_types.py | 2 +- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index a2f248e6e9f..107f17aca67 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -721,8 +721,9 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): page_checksum_verification : bool, default False If True, verify the page checksum for each page read from the file. arrow_extensions_enabled : bool, default False - If True, read Parquet logical types as Arrow Extension Types where possible, - (e.g., JSON as arrow.json or UUID as arrow.uuid). + If True, read Parquet logical types as Arrow extension types where possible, + (e.g., read JSON as the canonical `arrow.json` extension type or UUID as + the canonical `arrow.uuid` extension type). """ # Avoid mistakingly creating attributes diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 3d78bce0b6a..71a27d4be7f 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -255,8 +255,9 @@ class ParquetFile: page_checksum_verification : bool, default False If True, verify the checksum for each page read from the file. arrow_extensions_enabled : bool, default False - If True, read Parquet logical types as Arrow Extension Types where possible, - (e.g., JSON as arrow.json or UUID as arrow.uuid). + If True, read Parquet logical types as Arrow extension types where possible, + (e.g., read JSON as the canonical `arrow.json` extension type or UUID as + the canonical `arrow.uuid` extension type). Examples -------- @@ -1269,8 +1270,9 @@ class ParquetDataset: page_checksum_verification : bool, default False If True, verify the page checksum for each page read from the file. arrow_extensions_enabled : bool, default False - If True, read Parquet logical types as Arrow Extension Types where possible, - (e.g., JSON as arrow.json or UUID as arrow.uuid). + If True, read Parquet logical types as Arrow extension types where possible, + (e.g., read JSON as the canonical `arrow.json` extension type or UUID as + the canonical `arrow.uuid` extension type). Examples -------- @@ -1684,8 +1686,9 @@ def partitioning(self): page_checksum_verification : bool, default False If True, verify the checksum for each page read from the file. arrow_extensions_enabled : bool, default False - If True, read Parquet logical types as Arrow Extension Types where possible, - (e.g., JSON as arrow.json or UUID as arrow.uuid). + If True, read Parquet logical types as Arrow extension types where possible, + (e.g., read JSON as the canonical `arrow.json` extension type or UUID as + the canonical `arrow.uuid` extension type). Returns ------- diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index 933ae85e570..61fa790772f 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -529,7 +529,7 @@ def test_json_extension_type(storage_type): pa.table({"ext": pa.array(data, pa.string())}), store_schema=False) - # With arrow_extensions_enabled=True on read, we get a arrow.uuid back + # With arrow_extensions_enabled=True on read, we get a arrow.json back # (but with string() storage) _check_roundtrip( table, From 2193fd47c9850a1c5ef2fa5a1867232cbb016722 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 31 Mar 2025 12:00:17 -0500 Subject: [PATCH 12/27] Check physical length before inferring Arrow UUID type --- cpp/src/parquet/arrow/schema_internal.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index af18eeae81e..20b3ac499d5 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -147,7 +147,7 @@ Result> FromFLBA( case LogicalType::Type::INTERVAL: return ::arrow::fixed_size_binary(physical_length); case LogicalType::Type::UUID: - if (reader_properties.get_arrow_extensions_enabled()) { + if (physical_length == 16 && reader_properties.get_arrow_extensions_enabled()) { return ::arrow::extension::uuid(); } From a81d5c1d04eb6ec3d88616ffd3d9bf97d25ccc35 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 31 Mar 2025 14:14:31 -0500 Subject: [PATCH 13/27] fix test comments in arrow_schema_test.cc --- cpp/src/parquet/arrow/arrow_schema_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 013f733ae1a..c476e33f65c 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -980,7 +980,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowUuidExtension) { { // Parquet file contains Arrow schema. - // uuid will be interpreted as uuid() field + // uuid will be interpreted as uuid() field even though extensions are not enabled. ArrowReaderProperties props; props.set_arrow_extensions_enabled(false); std::shared_ptr field_metadata = @@ -996,7 +996,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowUuidExtension) { { // Parquet file contains Arrow schema. - // uuid will be interpreted as uuid() field even though extensions are not enabled. + // uuid will be interpreted as uuid() field also when extensions *are* enabled ArrowReaderProperties props; props.set_arrow_extensions_enabled(true); std::shared_ptr field_metadata = From 62f92b33b7afc6ada942aae85c1c0c5e11905fc0 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 31 Mar 2025 16:26:04 -0500 Subject: [PATCH 14/27] Attempt to reduce duplication in extension type restoration --- cpp/src/parquet/arrow/schema.cc | 100 ++++++++++++++------------------ 1 file changed, 45 insertions(+), 55 deletions(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index ff6b9ce5129..c21c14acb9a 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1037,67 +1037,57 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer const auto& inferred_type = inferred->field->type(); if (origin_type->id() == ::arrow::Type::EXTENSION) { - const auto& ex_type = checked_cast(*origin_type); - if (inferred_type->id() != ::arrow::Type::EXTENSION && - ex_type.extension_name() == std::string("arrow.json") && - ::arrow::extension::JsonExtensionType::IsSupportedStorageType( - inferred_type->id())) { - // Schema mismatch. - // - // Arrow extensions are DISABLED in Parquet. - // origin_type is ::arrow::extension::json() - // inferred_type is ::arrow::utf8() - // - // Origin type is restored as Arrow should be considered the source of truth. - inferred->field = inferred->field->WithType(origin_type); - RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred)); - } else if (inferred_type->id() == ::arrow::Type::EXTENSION && - ex_type.extension_name() == std::string("arrow.json")) { - // Potential schema mismatch. - // - // Arrow extensions are ENABLED in Parquet. - // origin_type is arrow::extension::json(...) - // inferred_type is arrow::extension::json(arrow::utf8()) - auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); - - // Apply metadata recursively to storage type + const auto& origin_extension_type = + checked_cast(*origin_type); + std::string origin_extension_name = origin_extension_type.extension_name(); + + // Whether or not the inferred type is also an extension type. This can occur because + // arrow_extensions_enabled is true in the ArrowReaderProperties or because + // the field contained an ARROW:extension:name metadata field and that extension name + // is registered. + bool arrow_extension_inferred = inferred_type->id() == ::arrow::Type::EXTENSION; + + // Check if the inferred storage type is compatible with the extension type + // we're about to apply. We assume that if an extension type was inferred + // that it was constructed with a valid storage type. + bool extension_supports_inferred_storage; + if (origin_extension_name == "arrow.json") { + extension_supports_inferred_storage = + arrow_extension_inferred || + ::arrow::extension::JsonExtensionType::IsSupportedStorageType( + inferred_type->id()); + } else if (origin_extension_name == "arrow.uuid") { + extension_supports_inferred_storage = + arrow_extension_inferred || + (inferred_type->id() == ::arrow::Type::FIXED_SIZE_BINARY && + checked_cast(*inferred_type) + .byte_width() == 16); + } else if (arrow_extension_inferred) { + extension_supports_inferred_storage = origin_extension_type.Equals(*inferred_type); + } else { + extension_supports_inferred_storage = + origin_extension_type.storage_type()->Equals(*inferred_type); + } + + if (arrow_extension_inferred && extension_supports_inferred_storage) { + // e.g., arrow_extensions_enabled is true + auto origin_storage_field = + origin_field.WithType(origin_extension_type.storage_type()); RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); inferred->field = inferred->field->WithType(origin_type); - } else if (inferred_type->id() == ::arrow::Type::FIXED_SIZE_BINARY && - ex_type.extension_name() == std::string("arrow.uuid")) { - // Schema mismatch. - // - // Arrow extensions are DISABLED in Parquet. - // origin_type is ::arrow::extension::uuid() - // inferred_type is ::arrow::fixed_size_binary() - // - // Origin type is restored as Arrow should be considered the source of truth. + } else if (extension_supports_inferred_storage) { + // e.g., arrow_extensions_enabled is false but we still restore the extension type + // because Arrow is the source of truth if we are asked to apply the original + // metadata + inferred->field = inferred->field->WithType(origin_type); RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred)); - } else if (inferred_type->id() == ::arrow::Type::EXTENSION && - ex_type.extension_name() == std::string("arrow.uuid")) { - // Schema match. - // - // Arrow extensions are ENABLED in Parquet. - // origin_type is arrow::extension::uuid() - // inferred_type is arrow::extension::uuid() - auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); - - // Apply metadata recursively to storage type - RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); - inferred->field = inferred->field->WithType(origin_type); } else { - auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); - - // Apply metadata recursively to storage type - RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); - - // Restore extension type, if the storage type is the same as inferred - // from the Parquet type - if (ex_type.storage_type()->Equals(*inferred->field->type())) { - inferred->field = inferred->field->WithType(origin_type); - } + // Applying the original extension type to the given storage may cause an invalid + // extension type instance, so we do not restore the original extension type + RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred)); } + modified = true; } else { ARROW_ASSIGN_OR_RAISE(modified, ApplyOriginalStorageMetadata(origin_field, inferred)); From d1b37e8421dd76384ed03fc15a51af1cca8cbcc3 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 1 Apr 2025 11:01:05 -0500 Subject: [PATCH 15/27] remove unused header --- cpp/src/parquet/arrow/schema.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index c21c14acb9a..bcf30fbccc6 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -22,7 +22,6 @@ #include #include "arrow/extension/json.h" -#include "arrow/extension/uuid.h" #include "arrow/extension_type.h" #include "arrow/io/memory.h" #include "arrow/ipc/api.h" From 4bd607b3bc638bd4a620558bc38e0b4a8908e570 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 1 Apr 2025 12:04:19 -0500 Subject: [PATCH 16/27] support nested extension components --- cpp/src/parquet/arrow/schema.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index bcf30fbccc6..3fb329630e7 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1082,9 +1082,15 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer inferred->field = inferred->field->WithType(origin_type); RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred)); } else { - // Applying the original extension type to the given storage may cause an invalid - // extension type instance, so we do not restore the original extension type - RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred)); + // If we are here, we still *might* be able to restore the extension type + // if we first apply metadata to children (e.g., if the extension type + // is nested and its children are extension type). + auto origin_storage_field = + origin_field.WithType(origin_extension_type.storage_type()); + RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); + if (origin_extension_type.storage_type()->Equals(*inferred->field->type())) { + inferred->field = inferred->field->WithType(origin_type); + } } modified = true; From 75299478f856b927987573ce4190100e60c31f45 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 1 Apr 2025 12:06:19 -0500 Subject: [PATCH 17/27] remove unneeded branch --- cpp/src/parquet/arrow/schema.cc | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 3fb329630e7..811e2e73c05 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1068,19 +1068,14 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer origin_extension_type.storage_type()->Equals(*inferred_type); } - if (arrow_extension_inferred && extension_supports_inferred_storage) { - // e.g., arrow_extensions_enabled is true + if (arrow_extension_inferred || extension_supports_inferred_storage) { + // i.e., arrow_extensions_enabled is true or arrow_extensions_enabled is false but + // we still restore the extension type because Arrow is the source of truth if we + // are asked to apply the original metadata auto origin_storage_field = origin_field.WithType(origin_extension_type.storage_type()); RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); inferred->field = inferred->field->WithType(origin_type); - } else if (extension_supports_inferred_storage) { - // e.g., arrow_extensions_enabled is false but we still restore the extension type - // because Arrow is the source of truth if we are asked to apply the original - // metadata - - inferred->field = inferred->field->WithType(origin_type); - RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred)); } else { // If we are here, we still *might* be able to restore the extension type // if we first apply metadata to children (e.g., if the extension type From d2d98f27bdb4184596e7d74a8a1b9d5f8bfe3638 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 10 Apr 2025 10:19:51 -0500 Subject: [PATCH 18/27] fix merge --- cpp/src/parquet/arrow/schema.cc | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 91b5dcab427..87a3b54178c 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1086,6 +1086,10 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer (inferred_type->id() == ::arrow::Type::FIXED_SIZE_BINARY && checked_cast(*inferred_type) .byte_width() == 16); + } else if (origin_extension_name == "parquet.variant") { + extension_supports_inferred_storage = + arrow_extension_inferred || + VariantExtensionType::IsSupportedStorageType(inferred_type); } else if (arrow_extension_inferred) { extension_supports_inferred_storage = origin_extension_type.Equals(*inferred_type); } else { @@ -1101,19 +1105,6 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer origin_field.WithType(origin_extension_type.storage_type()); RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); inferred->field = inferred->field->WithType(origin_type); - } else if (inferred_type->id() == ::arrow::Type::EXTENSION && - ex_type.extension_name() == std::string("parquet.variant")) { - // Potential schema mismatch. - // - // Arrow extensions are ENABLED in Parquet. - // origin_type is parquet::arrow::variant(...) - // inferred_type is - // parquet::arrow::variant(struct(arrow::binary(),arrow::binary())) - auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); - - // Apply metadata recursively to storage type - RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); - inferred->field = inferred->field->WithType(origin_type); } else { // If we are here, we still *might* be able to restore the extension type // if we first apply metadata to children (e.g., if the extension type From f3894b8af0c4451872d8f08076a8659f12dc0c2a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Apr 2025 10:47:18 -0500 Subject: [PATCH 19/27] Update cpp/src/parquet/arrow/arrow_schema_test.cc Co-authored-by: Gang Wu --- cpp/src/parquet/arrow/arrow_schema_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 1c94240e7fe..d68feba363e 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -1038,7 +1038,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowUuidExtension) { // Parquet file does not contain Arrow schema. // By default, field should be treated as fixed_size_binary(16) in Arrow. auto arrow_schema = - ::arrow::schema({::arrow::field("uuid", ::arrow::fixed_size_binary(16), true)}); + ::arrow::schema({::arrow::field("uuid", ::arrow::fixed_size_binary(16))}); std::shared_ptr metadata{}; ASSERT_OK(ConvertSchema(parquet_fields, metadata)); CheckFlatSchema(arrow_schema); From a101c8b46a354d64bf0dc42930e72484575081c6 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Apr 2025 10:47:37 -0500 Subject: [PATCH 20/27] Update cpp/src/parquet/arrow/arrow_schema_test.cc Co-authored-by: Gang Wu --- cpp/src/parquet/arrow/arrow_schema_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index d68feba363e..f04ba7857fe 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -1051,7 +1051,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowUuidExtension) { ArrowReaderProperties props; props.set_arrow_extensions_enabled(true); auto arrow_schema = - ::arrow::schema({::arrow::field("uuid", ::arrow::extension::uuid(), true)}); + ::arrow::schema({::arrow::field("uuid", ::arrow::extension::uuid())}); std::shared_ptr metadata{}; ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); CheckFlatSchema(arrow_schema); From 4c66a754a005ebbe1e0b901a6ca049d5f5f4f4d0 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Apr 2025 10:47:56 -0500 Subject: [PATCH 21/27] Update cpp/src/parquet/arrow/schema_internal.cc Co-authored-by: Gang Wu --- cpp/src/parquet/arrow/schema_internal.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index a444b26d818..27e1e528afb 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -153,7 +153,7 @@ Result> FromFLBA( return ::arrow::fixed_size_binary(physical_length); default: - return Status::NotImplemented("Unhandled logical logical_type ", + return Status::NotImplemented("Unhandled logical_type ", logical_type.ToString(), " for fixed-length binary array"); } From 234475ebbf6c21b7464ba0ebee474b95999a4cec Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Apr 2025 10:48:17 -0500 Subject: [PATCH 22/27] Update cpp/src/parquet/arrow/schema.cc Co-authored-by: Gang Wu --- cpp/src/parquet/arrow/schema.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 87a3b54178c..04c94897766 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1112,7 +1112,7 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer auto origin_storage_field = origin_field.WithType(origin_extension_type.storage_type()); RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); - if (origin_extension_type.storage_type()->Equals(*inferred->field->type())) { + if (origin_extension_type.storage_type()->Equals(*inferred_type)) { inferred->field = inferred->field->WithType(origin_type); } } From 8e8d5eed76fe476823f31b6228af1f6c8b4942ae Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Apr 2025 10:48:42 -0500 Subject: [PATCH 23/27] Update cpp/src/parquet/arrow/arrow_schema_test.cc Co-authored-by: Gang Wu --- cpp/src/parquet/arrow/arrow_schema_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index f04ba7857fe..e0b3114dfba 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -1065,7 +1065,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowUuidExtension) { std::shared_ptr field_metadata = ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); auto arrow_schema = ::arrow::schema( - {::arrow::field("uuid", ::arrow::extension::uuid(), true, field_metadata)}); + {::arrow::field("uuid", ::arrow::extension::uuid(), /*nullable=*/true, field_metadata)}); std::shared_ptr metadata; ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata)); From 26afc4062a0e1c88248596b8ea5ffb678793aa82 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Apr 2025 10:49:16 -0500 Subject: [PATCH 24/27] Update cpp/src/parquet/arrow/arrow_schema_test.cc Co-authored-by: Gang Wu --- cpp/src/parquet/arrow/arrow_schema_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index e0b3114dfba..2cf946fd419 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -1081,7 +1081,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowUuidExtension) { std::shared_ptr field_metadata = ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); auto arrow_schema = ::arrow::schema( - {::arrow::field("uuid", ::arrow::extension::uuid(), true, field_metadata)}); + {::arrow::field("uuid", ::arrow::extension::uuid(), /*nullable=*/true, field_metadata)}); std::shared_ptr metadata; ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata)); From 4b63ec740d135b144543f5d2be35dd0c812e98bc Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Apr 2025 10:58:10 -0500 Subject: [PATCH 25/27] clang-format --- cpp/src/parquet/arrow/arrow_schema_test.cc | 8 ++++---- cpp/src/parquet/arrow/schema_internal.cc | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 2cf946fd419..66fd049f6a3 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -1064,8 +1064,8 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowUuidExtension) { props.set_arrow_extensions_enabled(false); std::shared_ptr field_metadata = ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); - auto arrow_schema = ::arrow::schema( - {::arrow::field("uuid", ::arrow::extension::uuid(), /*nullable=*/true, field_metadata)}); + auto arrow_schema = ::arrow::schema({::arrow::field( + "uuid", ::arrow::extension::uuid(), /*nullable=*/true, field_metadata)}); std::shared_ptr metadata; ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata)); @@ -1080,8 +1080,8 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowUuidExtension) { props.set_arrow_extensions_enabled(true); std::shared_ptr field_metadata = ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); - auto arrow_schema = ::arrow::schema( - {::arrow::field("uuid", ::arrow::extension::uuid(), /*nullable=*/true, field_metadata)}); + auto arrow_schema = ::arrow::schema({::arrow::field( + "uuid", ::arrow::extension::uuid(), /*nullable=*/true, field_metadata)}); std::shared_ptr metadata; ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata)); diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index 27e1e528afb..8ddac98ec3c 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -153,8 +153,7 @@ Result> FromFLBA( return ::arrow::fixed_size_binary(physical_length); default: - return Status::NotImplemented("Unhandled logical_type ", - logical_type.ToString(), + return Status::NotImplemented("Unhandled logical_type ", logical_type.ToString(), " for fixed-length binary array"); } } From b48f7f5bc2774bdc872df0d4a5cbffcc5c488b80 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Apr 2025 11:27:11 -0500 Subject: [PATCH 26/27] move supported storage type test to UuidType, move common code out of both if branches --- cpp/src/arrow/extension/uuid.cc | 6 +++++- cpp/src/arrow/extension/uuid.h | 2 ++ cpp/src/parquet/arrow/schema.cc | 18 ++++++++---------- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/extension/uuid.cc b/cpp/src/arrow/extension/uuid.cc index 2f36eb3e7d1..b24f6895d0c 100644 --- a/cpp/src/arrow/extension/uuid.cc +++ b/cpp/src/arrow/extension/uuid.cc @@ -40,7 +40,7 @@ Result> UuidType::Deserialize( if (!serialized.empty()) { return Status::Invalid("Unexpected serialized metadata: '", serialized, "'"); } - if (!storage_type->Equals(*fixed_size_binary(16))) { + if (!IsSupportedStorageType(storage_type)) { return Status::Invalid("Invalid storage type for UuidType: ", storage_type->ToString()); } @@ -55,4 +55,8 @@ std::string UuidType::ToString(bool show_metadata) const { std::shared_ptr uuid() { return std::make_shared(); } +bool UuidType::IsSupportedStorageType(const std::shared_ptr& storage_type) { + return storage_type->Equals(*fixed_size_binary(16)); +} + } // namespace arrow::extension diff --git a/cpp/src/arrow/extension/uuid.h b/cpp/src/arrow/extension/uuid.h index 42bb21cf0b2..8c9660c463b 100644 --- a/cpp/src/arrow/extension/uuid.h +++ b/cpp/src/arrow/extension/uuid.h @@ -53,6 +53,8 @@ class ARROW_EXPORT UuidType : public ExtensionType { /// \brief Create a UuidType instance static Result> Make() { return std::make_shared(); } + + static bool IsSupportedStorageType(const std::shared_ptr& storage_type); }; /// \brief Return a UuidType instance. diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 04c94897766..1ec2b7c5018 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -22,6 +22,7 @@ #include #include "arrow/extension/json.h" +#include "arrow/extension/uuid.h" #include "arrow/extension_type.h" #include "arrow/io/memory.h" #include "arrow/ipc/api.h" @@ -1083,9 +1084,7 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer } else if (origin_extension_name == "arrow.uuid") { extension_supports_inferred_storage = arrow_extension_inferred || - (inferred_type->id() == ::arrow::Type::FIXED_SIZE_BINARY && - checked_cast(*inferred_type) - .byte_width() == 16); + ::arrow::extension::UuidType::IsSupportedStorageType(inferred_type); } else if (origin_extension_name == "parquet.variant") { extension_supports_inferred_storage = arrow_extension_inferred || @@ -1097,21 +1096,20 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer origin_extension_type.storage_type()->Equals(*inferred_type); } + // Apply the original storage metadata from the original storage field + auto origin_storage_field = + origin_field.WithType(origin_extension_type.storage_type()); + RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); + if (arrow_extension_inferred || extension_supports_inferred_storage) { // i.e., arrow_extensions_enabled is true or arrow_extensions_enabled is false but // we still restore the extension type because Arrow is the source of truth if we // are asked to apply the original metadata - auto origin_storage_field = - origin_field.WithType(origin_extension_type.storage_type()); - RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); inferred->field = inferred->field->WithType(origin_type); } else { // If we are here, we still *might* be able to restore the extension type // if we first apply metadata to children (e.g., if the extension type - // is nested and its children are extension type). - auto origin_storage_field = - origin_field.WithType(origin_extension_type.storage_type()); - RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); + // is nested and one or more of its children are an extension type). if (origin_extension_type.storage_type()->Equals(*inferred_type)) { inferred->field = inferred->field->WithType(origin_type); } From 555ed7598e4f9c2d348ba4ce3ed7e0216640303d Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 18 Apr 2025 12:35:28 -0500 Subject: [PATCH 27/27] maybe simplify it all again --- cpp/src/parquet/arrow/schema.cc | 52 ++++++++++++++++----------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 1ec2b7c5018..6b27bb8f55c 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1059,23 +1059,37 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer bool modified = false; auto& origin_type = origin_field.type(); - const auto& inferred_type = inferred->field->type(); + // The origin was an extension type. This occurs when the ARROW:extension:name field + // was present when the schema was written and that extension is registered when + // the schema is read. if (origin_type->id() == ::arrow::Type::EXTENSION) { const auto& origin_extension_type = checked_cast(*origin_type); - std::string origin_extension_name = origin_extension_type.extension_name(); - // Whether or not the inferred type is also an extension type. This can occur because - // arrow_extensions_enabled is true in the ArrowReaderProperties or because - // the field contained an ARROW:extension:name metadata field and that extension name - // is registered. + // (Recursively) Apply the original storage metadata from the original storage field + // This applies extension types to child elements, if any. + auto origin_storage_field = + origin_field.WithType(origin_extension_type.storage_type()); + RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); + + // Use the inferred type after child updates for below checks to see if + // we can restore an extension type on the output. + const auto& inferred_type = inferred->field->type(); + + // Whether or not the inferred type is also an extension type. This can occur when + // arrow_extensions_enabled is true in the ArrowReaderProperties. Extension types + // are not currently inferred for any other reason. bool arrow_extension_inferred = inferred_type->id() == ::arrow::Type::EXTENSION; // Check if the inferred storage type is compatible with the extension type - // we're about to apply. We assume that if an extension type was inferred - // that it was constructed with a valid storage type. + // we're hoping to apply. We assume that if an extension type was inferred + // that it was constructed with a valid storage type. Otherwise, we check with + // extension types that we know about for valid storage, falling back to + // storage type equality for extension types that we don't know about. + std::string origin_extension_name = origin_extension_type.extension_name(); bool extension_supports_inferred_storage; + if (origin_extension_name == "arrow.json") { extension_supports_inferred_storage = arrow_extension_inferred || @@ -1089,30 +1103,16 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer extension_supports_inferred_storage = arrow_extension_inferred || VariantExtensionType::IsSupportedStorageType(inferred_type); - } else if (arrow_extension_inferred) { - extension_supports_inferred_storage = origin_extension_type.Equals(*inferred_type); } else { extension_supports_inferred_storage = origin_extension_type.storage_type()->Equals(*inferred_type); } - // Apply the original storage metadata from the original storage field - auto origin_storage_field = - origin_field.WithType(origin_extension_type.storage_type()); - RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); - - if (arrow_extension_inferred || extension_supports_inferred_storage) { - // i.e., arrow_extensions_enabled is true or arrow_extensions_enabled is false but - // we still restore the extension type because Arrow is the source of truth if we - // are asked to apply the original metadata + // If the origin extension of the metadata we are about to apply supports + // the Arrow storage type we would otherwise return, we restore the extension + // type to the output. + if (extension_supports_inferred_storage) { inferred->field = inferred->field->WithType(origin_type); - } else { - // If we are here, we still *might* be able to restore the extension type - // if we first apply metadata to children (e.g., if the extension type - // is nested and one or more of its children are an extension type). - if (origin_extension_type.storage_type()->Equals(*inferred_type)) { - inferred->field = inferred->field->WithType(origin_type); - } } modified = true;