From fba37f5d07cd82aa2268f1970e17d8130618d23f Mon Sep 17 00:00:00 2001 From: logan-keede Date: Sun, 19 Jan 2025 01:57:15 +0530 Subject: [PATCH 1/5] deprecating max_statisics_size --- datafusion/common/src/config.rs | 79 +++++++++++++------ .../common/src/file_options/parquet_writer.rs | 6 +- 2 files changed, 60 insertions(+), 25 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 33a90017bd7e8..bb907d745dd2e 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -108,38 +108,51 @@ use crate::{DataFusionError, Result}; /// ``` /// /// NB: Misplaced commas may result in nonsensical errors -#[macro_export] macro_rules! config_namespace { ( - $(#[doc = $struct_d:tt])* - $vis:vis struct $struct_name:ident { - $( - $(#[doc = $d:tt])* - $field_vis:vis $field_name:ident : $field_type:ty, $(warn = $warn: expr,)? $(transform = $transform:expr,)? default = $default:expr - )*$(,)* - } + $(#[doc = $struct_d:tt])* // Struct-level documentation attributes + $(#[deprecated($($struct_depr:tt)*)])? // Optional struct-level deprecated attribute + $(#[allow($($struct_de:tt)*)])? + $vis:vis struct $struct_name:ident { + $( + $(#[doc = $d:tt])* // Field-level documentation attributes + $(#[deprecated($($field_depr:tt)*)])? // Optional field-level deprecated attribute + $(#[allow($($field_de:tt)*)])? + $field_vis:vis $field_name:ident : $field_type:ty, + $(warn = $warn:expr,)? + $(transform = $transform:expr,)? + default = $default:expr + )*$(,)* + } ) => { - - $(#[doc = $struct_d])* + $(#[doc = $struct_d])* // Apply struct documentation + $(#[deprecated($($struct_depr)*)])? // Apply struct deprecation + $(#[allow($($struct_de)*)])? #[derive(Debug, Clone, PartialEq)] - $vis struct $struct_name{ + $vis struct $struct_name { $( - $(#[doc = $d])* - $field_vis $field_name : $field_type, + $(#[doc = $d])* // Apply field documentation + $(#[deprecated($($field_depr)*)])? // Apply field deprecation + $(#[allow($($field_de)*)])? + $field_vis $field_name: $field_type, )* } impl ConfigField for $struct_name { fn set(&mut self, key: &str, value: &str) -> Result<()> { let (key, rem) = key.split_once('.').unwrap_or((key, "")); - match key { $( - stringify!($field_name) => { - $(let value = $transform(value);)? - $(log::warn!($warn);)? - self.$field_name.set(rem, value.as_ref()) - }, + stringify!($field_name) => { + // Safely apply deprecated attribute if present + // $(#[allow(deprecated)])? + { + $(let value = $transform(value);)? // Apply transformation if specified + $(log::warn!($warn);)? // Log warning if specified + #[allow(deprecated)] + self.$field_name.set(rem, value.as_ref()) + } + }, )* _ => return _config_err!( "Config value \"{}\" not found on {}", key, stringify!($struct_name) @@ -149,15 +162,16 @@ macro_rules! config_namespace { fn visit(&self, v: &mut V, key_prefix: &str, _description: &'static str) { $( - let key = format!(concat!("{}.", stringify!($field_name)), key_prefix); - let desc = concat!($($d),*).trim(); - self.$field_name.visit(v, key.as_str(), desc); + let key = format!(concat!("{}.", stringify!($field_name)), key_prefix); + let desc = concat!($($d),*).trim(); + #[allow(deprecated)] + self.$field_name.visit(v, key.as_str(), desc); )* } } - impl Default for $struct_name { fn default() -> Self { + #[allow(deprecated)] Self { $($field_name: $default),* } @@ -166,6 +180,7 @@ macro_rules! config_namespace { } } + config_namespace! { /// Options related to catalog and directory scanning /// @@ -467,6 +482,9 @@ config_namespace! { /// (writing) Sets max statistics size for any column. If NULL, uses /// default parquet writer setting + /// max_statistics_size is deprecated, currently it is not being used + // TODO: remove once deprecated + #[deprecated(since = "45.0.0", note = "Setting does not do anything")] pub max_statistics_size: Option, default = Some(4096) /// (writing) Target maximum number of rows in each row group (defaults to 1M @@ -1598,19 +1616,23 @@ impl ConfigField for TableParquetOptions { macro_rules! config_namespace_with_hashmap { ( $(#[doc = $struct_d:tt])* + $(#[deprecated($($struct_depr:tt)*)])? // Optional struct-level deprecated attribute $vis:vis struct $struct_name:ident { $( $(#[doc = $d:tt])* + $(#[deprecated($($field_depr:tt)*)])? // Optional field-level deprecated attribute $field_vis:vis $field_name:ident : $field_type:ty, $(transform = $transform:expr,)? default = $default:expr )*$(,)* } ) => { $(#[doc = $struct_d])* + $(#[deprecated($($struct_depr)*)])? // Apply struct deprecation #[derive(Debug, Clone, PartialEq)] $vis struct $struct_name{ $( $(#[doc = $d])* + $(#[deprecated($($field_depr)*)])? // Apply field deprecation $field_vis $field_name : $field_type, )* } @@ -1621,6 +1643,8 @@ macro_rules! config_namespace_with_hashmap { match key { $( stringify!($field_name) => { + // Handle deprecated fields + #[allow(deprecated)] // Allow deprecated fields $(let value = $transform(value);)? self.$field_name.set(rem, value.as_ref()) }, @@ -1635,6 +1659,8 @@ macro_rules! config_namespace_with_hashmap { $( let key = format!(concat!("{}.", stringify!($field_name)), key_prefix); let desc = concat!($($d),*).trim(); + // Handle deprecated fields + #[allow(deprecated)] self.$field_name.visit(v, key.as_str(), desc); )* } @@ -1642,6 +1668,7 @@ macro_rules! config_namespace_with_hashmap { impl Default for $struct_name { fn default() -> Self { + #[allow(deprecated)] Self { $($field_name: $default),* } @@ -1653,7 +1680,7 @@ macro_rules! config_namespace_with_hashmap { let parts: Vec<&str> = key.splitn(2, "::").collect(); match parts.as_slice() { [inner_key, hashmap_key] => { - // Get or create the ColumnOptions for the specified column + // Get or create the struct for the specified key let inner_value = self .entry((*hashmap_key).to_owned()) .or_insert_with($struct_name::default); @@ -1669,6 +1696,7 @@ macro_rules! config_namespace_with_hashmap { $( let key = format!("{}.{field}::{}", key_prefix, column_name, field = stringify!($field_name)); let desc = concat!($($d),*).trim(); + #[allow(deprecated)] col_options.$field_name.visit(v, key.as_str(), desc); )* } @@ -1720,6 +1748,9 @@ config_namespace_with_hashmap! { /// Sets max statistics size for the column path. If NULL, uses /// default parquet options + /// max_statistics_size is deprecated, currently it is not being used + // TODO: remove once deprecated + #[deprecated(since = "45.0.0", note = "Setting does not do anything")] pub max_statistics_size: Option, default = None } } diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index 3f06e11bb3767..88e879ed336c2 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -26,6 +26,7 @@ use crate::{ }; use arrow_schema::Schema; +// TODO: handle once deprecated #[allow(deprecated)] use parquet::{ arrow::ARROW_SCHEMA_META_KEY, @@ -157,6 +158,9 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder { builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv); } + // max_statistics_size is deprecated, currently it is not being used + // TODO: remove once deprecated + #[allow(deprecated)] if let Some(max_statistics_size) = options.max_statistics_size { builder = { #[allow(deprecated)] @@ -202,6 +206,7 @@ impl ParquetOptions { /// /// Note that this method does not include the key_value_metadata from [`TableParquetOptions`]. pub fn into_writer_properties_builder(&self) -> Result { + #[allow(deprecated)] let ParquetOptions { data_pagesize_limit, write_batch_size, @@ -535,7 +540,6 @@ mod tests { ), bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp), bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv), - #[allow(deprecated)] max_statistics_size: Some(props.max_statistics_size(&col)), } } From b3fb77b85c407b753ccbc0249034750c6e3e724c Mon Sep 17 00:00:00 2001 From: logan-keede Date: Sun, 19 Jan 2025 02:06:52 +0530 Subject: [PATCH 2/5] formatting by rustfmt --- datafusion/common/src/config.rs | 17 ++++++++--------- .../common/src/file_options/parquet_writer.rs | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index bb907d745dd2e..32b7213d952f5 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -112,15 +112,15 @@ macro_rules! config_namespace { ( $(#[doc = $struct_d:tt])* // Struct-level documentation attributes $(#[deprecated($($struct_depr:tt)*)])? // Optional struct-level deprecated attribute - $(#[allow($($struct_de:tt)*)])? + $(#[allow($($struct_de:tt)*)])? $vis:vis struct $struct_name:ident { $( $(#[doc = $d:tt])* // Field-level documentation attributes $(#[deprecated($($field_depr:tt)*)])? // Optional field-level deprecated attribute - $(#[allow($($field_de:tt)*)])? - $field_vis:vis $field_name:ident : $field_type:ty, - $(warn = $warn:expr,)? - $(transform = $transform:expr,)? + $(#[allow($($field_de:tt)*)])? + $field_vis:vis $field_name:ident : $field_type:ty, + $(warn = $warn:expr,)? + $(transform = $transform:expr,)? default = $default:expr )*$(,)* } @@ -145,7 +145,7 @@ macro_rules! config_namespace { $( stringify!($field_name) => { // Safely apply deprecated attribute if present - // $(#[allow(deprecated)])? + // $(#[allow(deprecated)])? { $(let value = $transform(value);)? // Apply transformation if specified $(log::warn!($warn);)? // Log warning if specified @@ -180,7 +180,6 @@ macro_rules! config_namespace { } } - config_namespace! { /// Options related to catalog and directory scanning /// @@ -482,7 +481,7 @@ config_namespace! { /// (writing) Sets max statistics size for any column. If NULL, uses /// default parquet writer setting - /// max_statistics_size is deprecated, currently it is not being used + /// max_statistics_size is deprecated, currently it is not being used // TODO: remove once deprecated #[deprecated(since = "45.0.0", note = "Setting does not do anything")] pub max_statistics_size: Option, default = Some(4096) @@ -1748,7 +1747,7 @@ config_namespace_with_hashmap! { /// Sets max statistics size for the column path. If NULL, uses /// default parquet options - /// max_statistics_size is deprecated, currently it is not being used + /// max_statistics_size is deprecated, currently it is not being used // TODO: remove once deprecated #[deprecated(since = "45.0.0", note = "Setting does not do anything")] pub max_statistics_size: Option, default = None diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index 88e879ed336c2..932da5af379e2 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -158,7 +158,7 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder { builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv); } - // max_statistics_size is deprecated, currently it is not being used + // max_statistics_size is deprecated, currently it is not being used // TODO: remove once deprecated #[allow(deprecated)] if let Some(max_statistics_size) = options.max_statistics_size { From 01de034cd1990ce05ccdfab19ab47181e3ccb0d5 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 19 Jan 2025 09:45:47 -0500 Subject: [PATCH 3/5] Sprinkle more #[allow(deprecated)] --- datafusion/common/src/file_options/parquet_writer.rs | 4 ++++ datafusion/proto-common/src/from_proto/mod.rs | 2 ++ datafusion/proto-common/src/to_proto/mod.rs | 2 ++ datafusion/proto/src/logical_plan/file_formats.rs | 3 +++ 4 files changed, 11 insertions(+) diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index 932da5af379e2..6a717d3c0c60f 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -457,6 +457,7 @@ mod tests { fn column_options_with_non_defaults( src_col_defaults: &ParquetOptions, ) -> ParquetColumnOptions { + #[allow(deprecated)] // max_statistics_size ParquetColumnOptions { compression: Some("zstd(22)".into()), dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v), @@ -477,6 +478,7 @@ mod tests { "1.0" }; + #[allow(deprecated)] // max_statistics_size ParquetOptions { data_pagesize_limit: 42, write_batch_size: 42, @@ -520,6 +522,7 @@ mod tests { ) -> ParquetColumnOptions { let bloom_filter_default_props = props.bloom_filter_properties(&col); + #[allow(deprecated)] // max_statistics_size ParquetColumnOptions { bloom_filter_enabled: Some(bloom_filter_default_props.is_some()), encoding: props.encoding(&col).map(|s| s.to_string()), @@ -573,6 +576,7 @@ mod tests { HashMap::from([(COL_NAME.into(), configured_col_props)]) }; + #[allow(deprecated)] // max_statistics_size TableParquetOptions { global: ParquetOptions { // global options diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index 37462acec7bec..87068c1269d29 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -934,6 +934,7 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions { fn try_from( value: &protobuf::ParquetOptions, ) -> datafusion_common::Result { + #[allow(deprecated)] // max_statistics_size Ok(ParquetOptions { enable_page_index: value.enable_page_index, pruning: value.pruning, @@ -1011,6 +1012,7 @@ impl TryFrom<&protobuf::ParquetColumnOptions> for ParquetColumnOptions { fn try_from( value: &protobuf::ParquetColumnOptions, ) -> datafusion_common::Result { + #[allow(deprecated)] // max_statistics_size Ok(ParquetColumnOptions { compression: value.compression_opt.clone().map(|opt| match opt { protobuf::parquet_column_options::CompressionOpt::Compression(v) => Some(v), diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index c69f7b85f488e..8df40a9fb1754 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -819,6 +819,7 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions { dictionary_enabled_opt: value.dictionary_enabled.map(protobuf::parquet_options::DictionaryEnabledOpt::DictionaryEnabled), dictionary_page_size_limit: value.dictionary_page_size_limit as u64, statistics_enabled_opt: value.statistics_enabled.clone().map(protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled), + #[allow(deprecated)] max_statistics_size_opt: value.max_statistics_size.map(|v| protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v as u64)), max_row_group_size: value.max_row_group_size as u64, created_by: value.created_by.clone(), @@ -857,6 +858,7 @@ impl TryFrom<&ParquetColumnOptions> for protobuf::ParquetColumnOptions { .statistics_enabled .clone() .map(protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled), + #[allow(deprecated)] max_statistics_size_opt: value.max_statistics_size.map(|v| { protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize( v as u32, diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs index 772e6d23426a2..237e6d2a71370 100644 --- a/datafusion/proto/src/logical_plan/file_formats.rs +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -362,6 +362,7 @@ impl TableParquetOptionsProto { }; let column_specific_options = global_options.column_specific_options; + #[allow(deprecated)] // max_statistics_size TableParquetOptionsProto { global: Some(ParquetOptionsProto { enable_page_index: global_options.global.enable_page_index, @@ -455,6 +456,7 @@ impl TableParquetOptionsProto { impl From<&ParquetOptionsProto> for ParquetOptions { fn from(proto: &ParquetOptionsProto) -> Self { + #[allow(deprecated)] // max_statistics_size ParquetOptions { enable_page_index: proto.enable_page_index, pruning: proto.pruning, @@ -509,6 +511,7 @@ impl From<&ParquetOptionsProto> for ParquetOptions { impl From for ParquetColumnOptions { fn from(proto: ParquetColumnOptionsProto) -> Self { + #[allow(deprecated)] // max_statistics_size ParquetColumnOptions { bloom_filter_enabled: proto.bloom_filter_enabled_opt.map( |parquet_column_options::BloomFilterEnabledOpt::BloomFilterEnabled(v)| v, From 4627038b149ae0c6654d0c983e09c7ee3f6e11b2 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 21 Jan 2025 16:40:35 -0500 Subject: [PATCH 4/5] update expected configs --- docs/source/user-guide/configs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 1c39064c15d7b..dd9ce759b28a3 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -66,7 +66,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes | | datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.max_statistics_size | 4096 | (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.max_statistics_size | 4096 | (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used | | datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. | | datafusion.execution.parquet.created_by | datafusion version 44.0.0 | (writing) Sets "created by" property | | datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length | From 07709ff1d623322cbd76913db6bf04d70ffd2c01 Mon Sep 17 00:00:00 2001 From: logan-keede Date: Fri, 24 Jan 2025 20:55:20 +0530 Subject: [PATCH 5/5] fix: add deprecation warning to information_schema --- datafusion/sqllogictest/test_files/information_schema.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 46618b32d77ac..4653df4000803 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -301,7 +301,7 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets b datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. datafusion.execution.parquet.encoding NULL (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. -datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting +datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer