From cf353acab1493afdcdfc5b59e45df8f81772b71a Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Sat, 15 Oct 2022 23:00:40 +0200 Subject: [PATCH 1/2] Add setting for statistics collection --- datafusion/core/src/execution/context.rs | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 734becca30a6b..616e27e93f3f6 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -491,7 +491,7 @@ impl SessionContext { }; let options = ListingOptions { format: file_format, - collect_stat: false, + collect_stat: self.copied_config().collect_statistics, file_extension: file_extension.to_owned(), target_partitions: self.copied_config().target_partitions, table_partition_cols: cmd.table_partition_cols.clone(), @@ -1092,6 +1092,8 @@ pub const REPARTITION_AGGREGATIONS: &str = "repartition_aggregations"; pub const REPARTITION_WINDOWS: &str = "repartition_windows"; /// Session Configuration entry name for 'PARQUET_PRUNING' pub const PARQUET_PRUNING: &str = "parquet_pruning"; +/// Session Configuration entry name for 'COLLECT_STATISTICS' +pub const COLLECT_STATISTICS: &str = "collect_statistics"; /// Map that holds opaque objects indexed by their type. /// @@ -1149,6 +1151,8 @@ pub struct SessionConfig { pub repartition_windows: bool, /// Should DataFusion parquet reader using the predicate to prune data pub parquet_pruning: bool, + /// Should DataFusion collect statistics after listing files + pub collect_statistics: bool, /// Configuration options pub config_options: Arc<RwLock<ConfigOptions>>, /// Opaque extensions. @@ -1167,6 +1171,7 @@ impl Default for SessionConfig { repartition_aggregations: true, repartition_windows: true, parquet_pruning: true, + collect_statistics: false, config_options: Arc::new(RwLock::new(ConfigOptions::new())), // Assume no extensions by default. 
extensions: HashMap::with_capacity_and_hasher( @@ -1269,6 +1274,12 @@ impl SessionConfig { self } + /// Enables or disables the collection of statistics after listing files + pub fn with_collect_statistics(mut self, enabled: bool) -> Self { + self.collect_statistics = enabled; + self + } + /// Get the currently configured batch size pub fn batch_size(&self) -> usize { self.config_options @@ -1312,6 +1323,11 @@ impl SessionConfig { PARQUET_PRUNING.to_owned(), format!("{}", self.parquet_pruning), ); + map.insert( + COLLECT_STATISTICS.to_owned(), + format!("{}", self.collect_statistics), + ); + map } @@ -1771,6 +1787,10 @@ impl TaskContext { .with_parquet_pruning( props.get(PARQUET_PRUNING).unwrap().parse().unwrap(), ) + .with_collect_statistics( + props.get(COLLECT_STATISTICS).unwrap().parse().unwrap(), + ) + } } TaskProperties::SessionConfig(session_config) => session_config.clone(), From 152e4088a206ba3046013fc3b0efd68f0844a844 Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Sat, 15 Oct 2022 23:02:56 +0200 Subject: [PATCH 2/2] fmt --- datafusion/core/src/execution/context.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 616e27e93f3f6..e8e769bd65058 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -1790,7 +1790,6 @@ impl TaskContext { .with_collect_statistics( props.get(COLLECT_STATISTICS).unwrap().parse().unwrap(), ) - } } TaskProperties::SessionConfig(session_config) => session_config.clone(),