diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index aa1ec477345c6..0c87afc03becf 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -820,7 +820,7 @@ run_clickbench_partitioned() { run_clickbench_pushdown() { RESULTS_FILE="${RESULTS_DIR}/clickbench_pushdown.json" echo "RESULTS_FILE: ${RESULTS_FILE}" - echo "Running clickbench (partitioned, 100 files) benchmark with pushdown_filters=true, reorder_filters=true..." + echo "Running clickbench (partitioned, 100 files) benchmark with pushdown_filters=true..." debug_run $CARGO_COMMAND --bin dfbench -- clickbench --pushdown --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} } diff --git a/benchmarks/src/clickbench.rs b/benchmarks/src/clickbench.rs index 70aaeb7d2d192..40491b48ab330 100644 --- a/benchmarks/src/clickbench.rs +++ b/benchmarks/src/clickbench.rs @@ -58,7 +58,6 @@ pub struct RunOpt { /// /// Specifically, it enables: /// * `pushdown_filters = true` - /// * `reorder_filters = true` #[arg(long = "pushdown")] pushdown: bool, @@ -196,14 +195,12 @@ impl RunOpt { // Turn on Parquet filter pushdown if requested if self.pushdown { parquet_options.pushdown_filters = true; - parquet_options.reorder_filters = true; } if self.sorted_by.is_some() { // We should compare the dynamic topk optimization when data is sorted, so we make the // assumption that filter pushdown is also enabled in this case. 
parquet_options.pushdown_filters = true; - parquet_options.reorder_filters = true; } } diff --git a/datafusion-examples/examples/data_io/json_shredding.rs b/datafusion-examples/examples/data_io/json_shredding.rs index 72fbb56773123..174e05a914b29 100644 --- a/datafusion-examples/examples/data_io/json_shredding.rs +++ b/datafusion-examples/examples/data_io/json_shredding.rs @@ -90,7 +90,10 @@ pub async fn json_shredding() -> Result<()> { store.put(&path, payload).await?; // Set up query execution - let mut cfg = SessionConfig::new(); + let mut cfg = SessionConfig::default().set( + "datafusion.execution.parquet.filter_pushdown_min_bytes_per_sec", + &ScalarValue::Float64(Some(0.0)), + ); cfg.options_mut().execution.parquet.pushdown_filters = true; let ctx = SessionContext::new_with_config(cfg); ctx.runtime_env().register_object_store( diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 85361ef5e17e1..803ca83603f3b 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -877,11 +877,6 @@ config_namespace! { /// reduce the number of rows decoded. This optimization is sometimes called "late materialization". pub pushdown_filters: bool, default = false - /// (reading) If true, filter expressions evaluated during the parquet decoding operation - /// will be reordered heuristically to minimize the cost of evaluation. If false, - /// the filters are applied in the same order as written in the query - pub reorder_filters: bool, default = false - /// (reading) Force the use of RowSelections for filter results, when /// pushdown_filters is enabled. If false, the reader will automatically /// choose between a RowSelection and a Bitmap based on the number and @@ -919,6 +914,49 @@ config_namespace! { /// parquet reader setting. 0 means no caching. pub max_predicate_cache_size: Option, default = None + /// (reading) Minimum bytes/sec throughput for adaptive filter pushdown. 
+ /// Filters that achieve at least this throughput (bytes_saved / eval_time) + /// are promoted to row filters. + /// f64::INFINITY = no filters promoted (feature disabled). + /// 0.0 = all filters pushed as row filters (no adaptive logic). + /// Default: 104,857,600 bytes/sec (100 MiB/sec), empirically chosen based on + /// TPC-H, TPC-DS, and ClickBench benchmarks on an m4 MacBook Pro. + /// The optimal value for this setting likely depends on the relative + /// cost of CPU vs. IO in your environment, and to some extent the shape + /// of your query. + /// + /// **Interaction with `pushdown_filters`:** + /// This option only takes effect when `pushdown_filters = true`. + /// When pushdown is disabled, all filters run post-scan and this + /// threshold is ignored. + pub filter_pushdown_min_bytes_per_sec: f64, default = 104_857_600.0 + + /// (reading) Byte-ratio threshold for applying filters one at a time + /// (iterative pruning; aka row-level) vs. all at once (post-scan). + /// The ratio is computed as: (extra filter bytes not in projection) / (projected bytes). + /// Filters whose extra columns consume a smaller fraction than this threshold are placed as row filters. + /// Ratio of filter column bytes to projection bytes that controls + /// initial filter placement. Computed as + /// `filter_compressed_bytes / projection_compressed_bytes`. + /// Filters below this ratio start as row-level filters (enabling late + /// materialization); those above start as post-scan filters. + /// Default: 0.20 — filters whose columns are less than 20% of the + /// projection bytes start at row-level. + /// + /// **Interaction with `pushdown_filters`:** + /// Only takes effect when `pushdown_filters = true`. + pub filter_collecting_byte_ratio_threshold: f64, default = 0.20 + + /// (reading) Z-score for confidence intervals on filter effectiveness. + /// Controls how much statistical evidence is required before promoting + /// or demoting a filter. 
Lower values = faster decisions with less + /// confidence. Higher values = more conservative, requiring more data. + /// Default: 2.0 (~95% confidence). + /// + /// **Interaction with `pushdown_filters`:** + /// Only takes effect when `pushdown_filters = true`. + pub filter_confidence_z: f64, default = 2.0 + // The following options affect writing to parquet files // and map to parquet::file::properties::WriterProperties diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index eaf5a1642e8e2..6cab1516d62b2 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -199,7 +199,7 @@ impl ParquetOptions { skip_metadata: _, metadata_size_hint: _, pushdown_filters: _, - reorder_filters: _, + force_filter_selections: _, // not used for writer props allow_single_file_parallelism: _, maximum_parallel_row_group_writers: _, @@ -210,6 +210,9 @@ impl ParquetOptions { coerce_int96: _, // not used for writer props skip_arrow_metadata: _, max_predicate_cache_size: _, + filter_pushdown_min_bytes_per_sec: _, // not used for writer props + filter_collecting_byte_ratio_threshold: _, // not used for writer props + filter_confidence_z: _, // not used for writer props } = self; let mut builder = WriterProperties::builder() @@ -470,7 +473,7 @@ mod tests { skip_metadata: defaults.skip_metadata, metadata_size_hint: defaults.metadata_size_hint, pushdown_filters: defaults.pushdown_filters, - reorder_filters: defaults.reorder_filters, + force_filter_selections: defaults.force_filter_selections, allow_single_file_parallelism: defaults.allow_single_file_parallelism, maximum_parallel_row_group_writers: defaults @@ -484,6 +487,10 @@ mod tests { coerce_int96: None, max_predicate_cache_size: defaults.max_predicate_cache_size, use_content_defined_chunking: defaults.use_content_defined_chunking.clone(), + filter_pushdown_min_bytes_per_sec: 
defaults.filter_pushdown_min_bytes_per_sec, + filter_collecting_byte_ratio_threshold: defaults + .filter_collecting_byte_ratio_threshold, + filter_confidence_z: defaults.filter_confidence_z, } } @@ -585,7 +592,7 @@ mod tests { skip_metadata: global_options_defaults.skip_metadata, metadata_size_hint: global_options_defaults.metadata_size_hint, pushdown_filters: global_options_defaults.pushdown_filters, - reorder_filters: global_options_defaults.reorder_filters, + force_filter_selections: global_options_defaults.force_filter_selections, allow_single_file_parallelism: global_options_defaults .allow_single_file_parallelism, @@ -607,6 +614,11 @@ mod tests { norm_level: c.norm_level, } }), + filter_pushdown_min_bytes_per_sec: global_options_defaults + .filter_pushdown_min_bytes_per_sec, + filter_collecting_byte_ratio_threshold: global_options_defaults + .filter_collecting_byte_ratio_threshold, + filter_confidence_z: global_options_defaults.filter_confidence_z, }, column_specific_options, key_value_metadata, diff --git a/datafusion/core/src/dataframe/parquet.rs b/datafusion/core/src/dataframe/parquet.rs index ccd5766f0a24d..a0b3c992fa076 100644 --- a/datafusion/core/src/dataframe/parquet.rs +++ b/datafusion/core/src/dataframe/parquet.rs @@ -164,7 +164,14 @@ mod tests { let plan = df.explain(false, false)?.collect().await?; // Filters all the way to Parquet let formatted = pretty::pretty_format_batches(&plan)?.to_string(); - assert!(formatted.contains("FilterExec: id@0 = 1"), "{formatted}"); + let data_source_exec_row = formatted + .lines() + .find(|line| line.contains("DataSourceExec:")) + .unwrap(); + assert!( + data_source_exec_row.contains("predicate=id@0 = 1"), + "{formatted}" + ); Ok(()) } diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index dd8c20628b43e..0401f4604137e 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ 
b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -167,9 +167,11 @@ mod tests { } if self.pushdown_predicate { + let mut opts = source.table_parquet_options().clone(); + opts.global.filter_pushdown_min_bytes_per_sec = 0.0; source = source - .with_pushdown_filters(true) - .with_reorder_filters(true); + .with_table_parquet_options(opts) + .with_pushdown_filters(true); } else { source = source.with_pushdown_filters(false); } diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs index c53495421307b..6aa313ce720fc 100644 --- a/datafusion/core/src/test_util/parquet.rs +++ b/datafusion/core/src/test_util/parquet.rs @@ -57,8 +57,6 @@ pub struct TestParquetFile { pub struct ParquetScanOptions { /// Enable pushdown filters pub pushdown_filters: bool, - /// enable reordering filters - pub reorder_filters: bool, /// enable page index pub enable_page_index: bool, } @@ -68,8 +66,9 @@ impl ParquetScanOptions { pub fn config(&self) -> SessionConfig { let mut config = ConfigOptions::new(); config.execution.parquet.pushdown_filters = self.pushdown_filters; - config.execution.parquet.reorder_filters = self.reorder_filters; config.execution.parquet.enable_page_index = self.enable_page_index; + // Disable adaptive filter selection for tests that expect deterministic pushdown + config.execution.parquet.filter_pushdown_min_bytes_per_sec = 0.0; config.into() } } diff --git a/datafusion/core/tests/parquet/filter_pushdown.rs b/datafusion/core/tests/parquet/filter_pushdown.rs index e6266b2c088d7..6a5fc2c9beeed 100644 --- a/datafusion/core/tests/parquet/filter_pushdown.rs +++ b/datafusion/core/tests/parquet/filter_pushdown.rs @@ -444,7 +444,6 @@ impl<'a> TestCase<'a> { .read_with_options( ParquetScanOptions { pushdown_filters: false, - reorder_filters: false, enable_page_index: false, }, filter, @@ -455,7 +454,6 @@ impl<'a> TestCase<'a> { .read_with_options( ParquetScanOptions { pushdown_filters: true, - reorder_filters: false, 
enable_page_index: false, }, filter, @@ -464,24 +462,10 @@ impl<'a> TestCase<'a> { assert_eq!(no_pushdown, only_pushdown); - let pushdown_and_reordering = self - .read_with_options( - ParquetScanOptions { - pushdown_filters: true, - reorder_filters: true, - enable_page_index: false, - }, - filter, - ) - .await; - - assert_eq!(no_pushdown, pushdown_and_reordering); - let page_index_only = self .read_with_options( ParquetScanOptions { pushdown_filters: false, - reorder_filters: false, enable_page_index: true, }, filter, @@ -489,18 +473,17 @@ impl<'a> TestCase<'a> { .await; assert_eq!(no_pushdown, page_index_only); - let pushdown_reordering_and_page_index = self + let pushdown_and_page_index = self .read_with_options( ParquetScanOptions { pushdown_filters: true, - reorder_filters: true, enable_page_index: true, }, filter, ) .await; - assert_eq!(no_pushdown, pushdown_reordering_and_page_index); + assert_eq!(no_pushdown, pushdown_and_page_index); } /// Reads data from a test parquet file using the specified scan options @@ -633,6 +616,11 @@ async fn predicate_cache_default() -> datafusion_common::Result<()> { async fn predicate_cache_pushdown_default() -> datafusion_common::Result<()> { let mut config = SessionConfig::new(); config.options_mut().execution.parquet.pushdown_filters = true; + config + .options_mut() + .execution + .parquet + .filter_pushdown_min_bytes_per_sec = 0.0; let ctx = SessionContext::new_with_config(config); // The cache is on by default, and used when filter pushdown is enabled PredicateCacheTest { @@ -647,6 +635,11 @@ async fn predicate_cache_pushdown_default() -> datafusion_common::Result<()> { async fn predicate_cache_stats_issue_19561() -> datafusion_common::Result<()> { let mut config = SessionConfig::new(); config.options_mut().execution.parquet.pushdown_filters = true; + config + .options_mut() + .execution + .parquet + .filter_pushdown_min_bytes_per_sec = 0.0; // force to get multiple batches to trigger repeated metric compound bug 
config.options_mut().execution.batch_size = 1; let ctx = SessionContext::new_with_config(config); @@ -664,6 +657,11 @@ async fn predicate_cache_pushdown_default_selections_only() -> datafusion_common::Result<()> { let mut config = SessionConfig::new(); config.options_mut().execution.parquet.pushdown_filters = true; + config + .options_mut() + .execution + .parquet + .filter_pushdown_min_bytes_per_sec = 0.0; // forcing filter selections minimizes the number of rows read from the cache config .options_mut() diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown.rs index 4ff1fad8f52b9..86eadc7bba452 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown.rs @@ -1031,7 +1031,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_partitioned() { // expect the predicate to be pushed down into the probe side DataSource insta::assert_snapshot!( OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true), - @r" + @" OptimizationTest: input: - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -1049,7 +1049,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_partitioned() { - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=Optional(DynamicFilter [ empty ]) " ); @@ -1076,14 +1076,14 @@ async fn test_hashjoin_dynamic_filter_pushdown_partitioned() { #[cfg(not(feature = 
"force_hash_collisions"))] insta::assert_snapshot!( format!("{}", format_plan_for_test(&plan)), - @r" + @" - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] - CoalescePartitionsExec - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ CASE hash_repartition % 12 WHEN 5 THEN a@0 >= ab AND a@0 <= ab AND b@1 >= bb AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:ab,c1:bb}]) WHEN 8 THEN a@0 >= aa AND a@0 <= aa AND b@1 >= ba AND b@1 <= ba AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}]) ELSE false END ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=Optional(DynamicFilter [ CASE hash_repartition % 12 WHEN 5 THEN a@0 >= ab AND a@0 <= ab AND b@1 >= bb AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:ab,c1:bb}]) WHEN 8 THEN a@0 >= aa AND a@0 <= aa AND b@1 >= ba AND b@1 <= ba AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}]) ELSE false END ]) " ); @@ -1101,7 +1101,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_partitioned() { - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, 
{c0:ab,c1:bb}]) ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=Optional(DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ]) " ); @@ -1231,7 +1231,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_collect_left() { // expect the predicate to be pushed down into the probe side DataSource insta::assert_snapshot!( OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true), - @r" + @" OptimizationTest: input: - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -1247,7 +1247,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_collect_left() { - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=Optional(DynamicFilter [ empty ]) " ); @@ -1273,13 +1273,13 @@ async fn test_hashjoin_dynamic_filter_pushdown_collect_left() { // Now check what our filter looks like insta::assert_snapshot!( format!("{}", format_plan_for_test(&plan)), - @r" + @" - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false] - CoalescePartitionsExec - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: 
[[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=Optional(DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ]) " ); @@ -2404,12 +2404,12 @@ async fn test_hashjoin_dynamic_filter_all_partitions_empty() { insta::assert_snapshot!( format_plan_for_test(&plan), - @r" + @" - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=1 - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true - RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true, predicate=Optional(DynamicFilter [ empty ]) " ); @@ -2429,12 +2429,12 @@ async fn test_hashjoin_dynamic_filter_all_partitions_empty() { // Test that filters are pushed down correctly to each side of the join insta::assert_snapshot!( format_plan_for_test(&plan), - @r" + @" - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=1 - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true - RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, 
pushdown_supported=true, predicate=DynamicFilter [ false ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true, predicate=Optional(DynamicFilter [ false ]) " ); } @@ -2954,18 +2954,28 @@ async fn test_discover_dynamic_filters_via_expressions_api() { use datafusion_physical_expr::expressions::DynamicFilterPhysicalExpr; use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; + fn count_in_expr(expr: &dyn PhysicalExpr) -> usize { + let mut count = 0; + if expr.downcast_ref::().is_some() { + count += 1; + } + for child in expr.children() { + count += count_in_expr(child.as_ref()); + } + count + } + fn count_dynamic_filters(plan: &Arc) -> usize { let mut count = 0; - // Check expressions from this node using apply_expressions + // Check expressions from this node using apply_expressions. + // Walk each expression subtree to find DynamicFilterPhysicalExpr even + // when wrapped (e.g. by OptionalFilterPhysicalExpr). 
let _ = plan.apply_expressions(&mut |expr| { - if let Some(_df) = expr.downcast_ref::() { - count += 1; - } + count += count_in_expr(expr); Ok(TreeNodeRecursion::Continue) }); - // Recursively visit children for child in plan.children() { count += count_dynamic_filters(child); } diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index 8ab0d150a7272..dca95e111fd36 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -878,7 +878,7 @@ async fn parquet_explain_analyze() { .to_string(); // should contain aggregated stats - assert_contains!(&formatted, "output_rows=8"); + assert_contains!(&formatted, "output_rows=5"); assert_contains!( &formatted, "row_groups_pruned_bloom_filter=1 total \u{2192} 1 matched" diff --git a/datafusion/datasource-parquet/benches/parquet_nested_filter_pushdown.rs b/datafusion/datasource-parquet/benches/parquet_nested_filter_pushdown.rs index 02137b5a1d288..cd3d2da56be53 100644 --- a/datafusion/datasource-parquet/benches/parquet_nested_filter_pushdown.rs +++ b/datafusion/datasource-parquet/benches/parquet_nested_filter_pushdown.rs @@ -24,6 +24,7 @@ use arrow::array::{ use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use criterion::{Criterion, Throughput, criterion_group, criterion_main}; use datafusion_common::ScalarValue; +use datafusion_datasource_parquet::selectivity::SelectivityTracker; use datafusion_datasource_parquet::{ParquetFileMetrics, build_row_filter}; use datafusion_expr::{Expr, col}; use datafusion_functions_nested::expr_fn::array_has; @@ -115,9 +116,17 @@ fn scan_with_predicate( let file_metrics = ParquetFileMetrics::new(0, &path.display().to_string(), &metrics); let builder = if pushdown { - if let Some(row_filter) = - build_row_filter(predicate, file_schema, &metadata, false, &file_metrics)? 
- { + let tracker = Arc::new(SelectivityTracker::new()); + let filters = vec![(0usize, Arc::clone(predicate))]; + let (maybe_row_filter, _unbuildable) = build_row_filter( + &filters, + file_schema, + &metadata, + 0, + &tracker, + &file_metrics, + )?; + if let Some(row_filter) = maybe_row_filter { builder.with_row_filter(row_filter) } else { builder diff --git a/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown.rs b/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown.rs index b52408d4222d8..cfc326d84fb6b 100644 --- a/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown.rs +++ b/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown.rs @@ -50,6 +50,7 @@ use arrow::array::{BooleanArray, Int32Array, RecordBatch, StringBuilder, StructA use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; use criterion::{Criterion, Throughput, criterion_group, criterion_main}; use datafusion_common::ScalarValue; +use datafusion_datasource_parquet::selectivity::SelectivityTracker; use datafusion_datasource_parquet::{ParquetFileMetrics, build_row_filter}; use datafusion_expr::{Expr, col}; use datafusion_physical_expr::planner::logical2physical; @@ -210,9 +211,17 @@ fn scan( let mut filter_applied = false; let builder = if pushdown { - if let Some(row_filter) = - build_row_filter(predicate, file_schema, &metadata, false, &file_metrics)? 
- { + let tracker = Arc::new(SelectivityTracker::new()); + let filters = vec![(0usize, Arc::clone(predicate))]; + let (maybe_row_filter, _unbuildable) = build_row_filter( + &filters, + file_schema, + &metadata, + 0, + &tracker, + &file_metrics, + )?; + if let Some(row_filter) = maybe_row_filter { filter_applied = true; builder.with_row_filter(row_filter) } else { diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index 7dda7b1b12811..2feb7814c5733 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -501,6 +501,12 @@ impl FileFormat for ParquetFormat { ) -> Result> { let mut metadata_size_hint = None; + let filter_pushdown_min_bytes_per_sec = state + .config_options() + .execution + .parquet + .filter_pushdown_min_bytes_per_sec; + if let Some(metadata) = self.metadata_size_hint() { metadata_size_hint = Some(metadata); } @@ -510,7 +516,10 @@ impl FileFormat for ParquetFormat { .downcast_ref::() .cloned() .ok_or_else(|| internal_datafusion_err!("Expected ParquetSource"))?; - source = source.with_table_parquet_options(self.options.clone()); + let mut options = self.options.clone(); + options.global.filter_pushdown_min_bytes_per_sec = + filter_pushdown_min_bytes_per_sec; + source = source.with_table_parquet_options(options); // Use the CachedParquetFileReaderFactory let metadata_cache = state.runtime_env().cache_manager.get_file_metadata_cache(); diff --git a/datafusion/datasource-parquet/src/metrics.rs b/datafusion/datasource-parquet/src/metrics.rs index 8eb5912b919da..ece86af498f82 100644 --- a/datafusion/datasource-parquet/src/metrics.rs +++ b/datafusion/datasource-parquet/src/metrics.rs @@ -91,6 +91,8 @@ pub struct ParquetFileMetrics { /// number of rows that were stored in the cache after evaluating predicates /// reused for the output. 
pub predicate_cache_records: Gauge, + /// Time spent applying filters + pub filter_apply_time: Time, } impl ParquetFileMetrics { @@ -192,6 +194,10 @@ impl ParquetFileMetrics { .with_category(MetricCategory::Rows) .gauge("predicate_cache_records", partition); + let filter_apply_time = MetricBuilder::new(metrics) + .with_new_label("filename", filename.to_string()) + .subset_time("filter_apply_time", partition); + Self { files_ranges_pruned_statistics, predicate_evaluation_errors, @@ -211,6 +217,7 @@ scan_efficiency_ratio, predicate_cache_inner_records, predicate_cache_records, + filter_apply_time, } } } diff --git a/datafusion/datasource-parquet/src/mod.rs b/datafusion/datasource-parquet/src/mod.rs index 9a907f4118a86..eb81383a93ca7 100644 --- a/datafusion/datasource-parquet/src/mod.rs +++ b/datafusion/datasource-parquet/src/mod.rs @@ -33,6 +33,7 @@ mod page_filter; mod reader; mod row_filter; mod row_group_filter; +pub mod selectivity; mod sort; pub mod source; mod supported_predicates; diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index bad1c684b47f5..854c708f06ddc 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -27,6 +27,7 @@ use crate::{ use arrow::array::{RecordBatch, RecordBatchOptions}; use arrow::datatypes::DataType; use datafusion_datasource::morsel::{Morsel, MorselPlan, MorselPlanner, Morselizer}; +use datafusion_physical_expr::conjunction; use datafusion_physical_expr::projection::{ProjectionExprs, Projector}; use datafusion_physical_expr::utils::reassign_expr_columns; use datafusion_physical_expr_adapter::replace_columns_with_literals; @@ -94,8 +95,13 @@ pub(super) struct ParquetMorselizer { pub(crate) limit: Option, /// If should keep the output rows in order pub preserve_order: bool, - /// Optional predicate to apply during the scan - pub predicate: Option>, + /// Optional predicate conjuncts to apply during
the scan. Each conjunct + /// carries a stable `FilterId` used by the selectivity tracker. + pub predicate_conjuncts: + Option)>>, + /// Shared adaptive selectivity tracker used to decide per-filter placement + /// (row-level vs post-scan) at runtime. + pub selectivity_tracker: Arc, /// Table schema, including partition columns. pub table_schema: TableSchema, /// Optional hint for how large the initial request to read parquet metadata @@ -108,8 +114,6 @@ pub(super) struct ParquetMorselizer { /// Should the filters be evaluated during the parquet scan using /// [`DataFusionArrowPredicate`](row_filter::DatafusionArrowPredicate)? pub pushdown_filters: bool, - /// Should the filters be reordered to optimize the scan? - pub reorder_filters: bool, /// Should we force the reader to use RowSelections for filtering pub force_filter_selections: bool, /// Should the page index be read from parquet files, if present, to skip @@ -274,7 +278,9 @@ struct PreparedParquetOpen { output_schema: SchemaRef, projection: ProjectionExprs, predicate: Option>, - reorder_predicates: bool, + predicate_conjuncts: + Option)>>, + selectivity_tracker: Arc, pushdown_filters: bool, force_filter_selections: bool, enable_page_index: bool, @@ -597,15 +603,24 @@ impl ParquetMorselizer { )); let mut projection = self.projection.clone(); - let mut predicate = self.predicate.clone(); + let mut predicate_conjuncts = self.predicate_conjuncts.clone(); if !literal_columns.is_empty() { projection = projection.try_map_exprs(|expr| { replace_columns_with_literals(Arc::clone(&expr), &literal_columns) })?; - predicate = predicate - .map(|p| replace_columns_with_literals(p, &literal_columns)) - .transpose()?; + if let Some(ref mut conjuncts) = predicate_conjuncts { + for (_id, expr) in conjuncts.iter_mut() { + *expr = replace_columns_with_literals( + Arc::clone(expr), + &literal_columns, + )?; + } + } } + // Build a single combined predicate for file-level pruning. 
+ let predicate: Option> = predicate_conjuncts + .as_ref() + .map(|c| conjunction(c.iter().map(|(_, e)| Arc::clone(e)))); let predicate_creation_errors = MetricBuilder::new(&self.metrics) .with_category(MetricCategory::Rows) @@ -643,7 +658,8 @@ impl ParquetMorselizer { output_schema, projection, predicate, - reorder_predicates: self.reorder_filters, + predicate_conjuncts, + selectivity_tracker: Arc::clone(&self.selectivity_tracker), pushdown_filters: self.pushdown_filters, force_filter_selections: self.force_filter_selections, enable_page_index: self.enable_page_index, @@ -819,6 +835,31 @@ impl MetadataLoadedParquetOpen { prepared.projection = prepared .projection .try_map_exprs(|p| simplifier.simplify(rewriter.rewrite(p)?))?; + // Adapt each per-filter conjunct individually, keeping + // FilterIds stable so the adaptive selectivity tracker can + // correlate runtime stats across files. Skip conjuncts that + // simplify to the literal `TRUE` — they add nothing to the + // predicate and would bloat the row-filter / post-scan buckets. + // Both the row-filter and post-scan paths downstream consume + // these in physical-file-schema space: the post-scan path + // widens the decoder's projection mask to include filter + // columns and evaluates against the pre-project wide batch, + // so logical-schema expressions are not required there. + if let Some(ref mut conjuncts) = prepared.predicate_conjuncts { + let mut adapted = Vec::with_capacity(conjuncts.len()); + for (id, expr) in conjuncts.drain(..) 
{ + let rewritten = rewriter.rewrite(expr)?; + let simplified = simplifier.simplify(rewritten)?; + if let Some(lit) = simplified + .downcast_ref::( + ) && let ScalarValue::Boolean(Some(true)) = lit.value() + { + continue; + } + adapted.push((id, simplified)); + } + *conjuncts = adapted; + } } prepared.physical_file_schema = Arc::clone(&physical_file_schema); @@ -1075,25 +1116,87 @@ impl RowGroupsPrunedParquetOpen { let file_metadata = Arc::clone(reader_metadata.metadata()); let rg_metadata = file_metadata.row_groups(); - // Filter pushdown: evaluate predicates during scan - let row_filter = if let Some(predicate) = prepared - .pushdown_filters - .then_some(prepared.predicate.clone()) - .flatten() + // Adaptive filter placement. + // + // Ask the shared `SelectivityTracker` to split our predicate + // conjuncts (already adapted to `physical_file_schema` in the + // `PrepareFilters` state) into two buckets based on measured + // effectiveness across prior files: + // + // - `row_filters` — evaluated inside the Parquet decoder via + // `ArrowPredicate`s, enabling late-materialization savings. + // - `post_scan` — evaluated against the decoded wide batch just + // before the projector strips it down. Any filter-only columns + // the post-scan filter references are added to the decoder's + // projection mask below, so the filter can always be applied. + // + // For the first file we encounter, `partition_filters` uses a + // cheap byte-ratio heuristic (filter-column bytes / projection- + // column bytes) for initial placement. Subsequent files refine + // the placement using Welford statistics reported from the + // row-filter path (`DatafusionArrowPredicate::evaluate`) and + // the post-scan path (`apply_post_scan_filters_with_stats`). 
+ let projection_column_indices: Vec = { + let mut idxs: Vec = prepared + .projection + .expr_iter() + .flat_map(|expr| { + datafusion_physical_expr::utils::collect_columns(&expr) + .into_iter() + .map(|c| c.index()) + .collect::>() + }) + .collect(); + idxs.sort_unstable(); + idxs.dedup(); + idxs + }; + let projection_compressed_bytes = row_filter::total_compressed_bytes( + &projection_column_indices, + file_metadata.as_ref(), + ); + + let (row_filter_conjuncts, mut post_scan_conjuncts) = if prepared.pushdown_filters + && let Some(conjuncts) = prepared.predicate_conjuncts.clone() + && !conjuncts.is_empty() { - let row_filter = row_filter::build_row_filter( - &predicate, - &prepared.physical_file_schema, + let partitioned = prepared.selectivity_tracker.partition_filters( + conjuncts, + projection_compressed_bytes, file_metadata.as_ref(), - prepared.reorder_predicates, - &prepared.file_metrics, ); + (partitioned.row_filters, partitioned.post_scan) + } else { + (Vec::new(), Vec::new()) + }; - match row_filter { - Ok(Some(filter)) => Some(filter), - Ok(None) => None, + // Build row-level `ArrowPredicate`s for the row_filters bucket. + // Any conjunct that `build_row_filter` reports as `unbuildable` + // falls through to the post-scan bucket so we never silently drop + // a filter — dropping would relax the user's predicate and return + // wrong results. Both buckets are already in `physical_file_schema` + // space (adapted in `PrepareFilters`), so `unbuildable` is mixed + // straight back into `post_scan_conjuncts` without further + // rewriting. 
+ let row_filter = if !row_filter_conjuncts.is_empty() { + match row_filter::build_row_filter( + &row_filter_conjuncts, + &prepared.physical_file_schema, + file_metadata.as_ref(), + projection_compressed_bytes, + &prepared.selectivity_tracker, + &prepared.file_metrics, + ) { + Ok((row_filter, unbuildable)) => { + post_scan_conjuncts.extend(unbuildable); + row_filter + } Err(e) => { - debug!("Ignoring error building row filter for '{predicate:?}': {e}"); + debug!( + "Error building row filter for {row_filter_conjuncts:?}: {e}; \ + falling all row-filter candidates through to post-scan" + ); + post_scan_conjuncts.extend(row_filter_conjuncts); None } } @@ -1101,7 +1204,46 @@ impl RowGroupsPrunedParquetOpen { None }; - // Prune by limit if limit is set and limit order is not sensitive + // Pre-compute the per-filter "other-bytes-per-row" quantity — + // the bytes of projection columns *not* referenced by this filter, + // amortised across rows. This is what late materialization saves + // per pruned row and is the same cost metric the row-filter path + // reports to the tracker, so promote/demote rankings compare + // filters on a single common axis. + let total_rows: i64 = file_metadata + .row_groups() + .iter() + .map(|rg| rg.num_rows()) + .sum(); + let post_scan_other_bytes_per_row: Vec = post_scan_conjuncts + .iter() + .map(|(_, expr)| { + let filter_cols: Vec = + datafusion_physical_expr::utils::collect_columns(expr) + .iter() + .map(|c| c.index()) + .collect(); + let filter_compressed = row_filter::total_compressed_bytes( + &filter_cols, + file_metadata.as_ref(), + ); + if total_rows > 0 { + projection_compressed_bytes.saturating_sub(filter_compressed) as f64 + / total_rows as f64 + } else { + 0.0 + } + }) + .collect(); + + // Prune by limit if limit is set and limit order is not sensitive. 
+ // + // If we have post-scan filters we still apply row-group limit pruning + // — the decoder may still stop early — but we purposely skip + // `decoder_builder.with_limit(limit)` below, and instead apply the + // limit downstream of the post-scan filter wrapper. Applying the + // limit at the decoder when a post-scan filter can still drop rows + // would return fewer rows than requested. if let (Some(limit), false) = (prepared.limit, prepared.preserve_order) { row_groups.prune_by_limit(limit, rg_metadata, &prepared.file_metrics); } @@ -1133,8 +1275,21 @@ impl RowGroupsPrunedParquetOpen { } let arrow_reader_metrics = ArrowReaderMetrics::enabled(); + // Build the decoder's projection/read plan over the UNION of the + // user projection and every post-scan filter's column references. + // This is what lets the post-scan path evaluate filters that touch + // columns which would otherwise be projected away: the decoder + // produces a "wide" batch containing both the user-visible + // projection *and* the extra columns the filters need, and the + // projector below strips the filter-only columns back out after + // the filter has had a chance to read them. + let read_plan_exprs: Vec> = prepared + .projection + .expr_iter() + .chain(post_scan_conjuncts.iter().map(|(_, expr)| Arc::clone(expr))) + .collect(); let read_plan = build_projection_read_plan( - prepared.projection.expr_iter(), + read_plan_exprs, &prepared.physical_file_schema, reader_metadata.parquet_schema(), ); @@ -1157,7 +1312,12 @@ impl RowGroupsPrunedParquetOpen { } decoder_builder = decoder_builder.with_row_groups(prepared_plan.row_group_indexes); - if let Some(limit) = prepared.limit { + // Only push the limit into the decoder when there are no post-scan + // filters. Otherwise the decoder would stop before the filters + // have had a chance to drop rows, returning too few matches. 
+ if let Some(limit) = prepared.limit + && post_scan_conjuncts.is_empty() + { decoder_builder = decoder_builder.with_limit(limit); } if let Some(max_predicate_cache_size) = prepared.max_predicate_cache_size { @@ -1171,22 +1331,37 @@ impl RowGroupsPrunedParquetOpen { prepared.file_metrics.predicate_cache_inner_records.clone(); let predicate_cache_records = prepared.file_metrics.predicate_cache_records.clone(); + let filter_apply_time = prepared.file_metrics.filter_apply_time.clone(); // Check if we need to replace the schema to handle things like differing nullability or metadata. // See note below about file vs. output schema. let stream_schema = read_plan.projected_schema; let replace_schema = stream_schema != prepared.output_schema; - // Rebase column indices to match the narrowed stream schema. - // The projection expressions have indices based on physical_file_schema, - // but the stream only contains the columns selected by the ProjectionMask. + // Rebase column indices to match the (possibly widened) stream + // schema. Both the projection expressions (which only reference + // the user projection) and the post-scan filter expressions (which + // may reference additional columns) need this rebase — both + // originally used indices into `physical_file_schema`, but the + // decoder only decodes the subset of columns in the read plan's + // projection mask. 
let projection = prepared .projection .try_map_exprs(|expr| reassign_expr_columns(expr, &stream_schema))?; + let post_scan_filters: Vec<( + crate::selectivity::FilterId, + Arc, + )> = post_scan_conjuncts + .into_iter() + .map(|(id, expr)| { + reassign_expr_columns(expr, &stream_schema).map(|e| (id, e)) + }) + .collect::>()?; let projector = projection.make_projector(&stream_schema)?; let output_schema = Arc::clone(&prepared.output_schema); let files_ranges_pruned_statistics = prepared.file_metrics.files_ranges_pruned_statistics.clone(); + let stream = futures::stream::unfold( PushDecoderStreamState { decoder, @@ -1198,14 +1373,18 @@ impl RowGroupsPrunedParquetOpen { predicate_cache_inner_records, predicate_cache_records, baseline_metrics: prepared.baseline_metrics, + post_scan_filters, + selectivity_tracker: Arc::clone(&prepared.selectivity_tracker), + post_scan_other_bytes_per_row, + filter_apply_time, }, |state| async move { state.transition().await }, ) .fuse(); + let stream = stream.boxed(); // Wrap the stream so a dynamic filter can stop the file scan early. if let Some(file_pruner) = prepared.file_pruner { - let stream = stream.boxed(); Ok(EarlyStoppingStream::new( stream, file_pruner, @@ -1213,9 +1392,77 @@ impl RowGroupsPrunedParquetOpen { ) .boxed()) } else { - Ok(stream.boxed()) + Ok(stream) + } + } +} + +/// Apply post-scan filters to a single decoded `RecordBatch`, reporting +/// per-filter selectivity and evaluation cost to the shared +/// [`crate::selectivity::SelectivityTracker`] so the adaptive system can +/// promote filters to row-level (`RowFilter`) in subsequent files. +/// +/// Called by [`PushDecoderStreamState::transition`] against the +/// pre-projection "wide" batch — i.e. a batch containing the user +/// projection columns PLUS any columns the post-scan filters reference +/// that weren't already in the projection. 
This lets the filters evaluate +/// even against columns that would normally be projected away; the +/// projector (built from the original projection expressions) strips the +/// filter-only columns back out immediately after filtering. +/// +/// `other_bytes_per_row` is the per-filter bytes-per-row contribution of +/// the non-filter projection columns (i.e. the bytes late materialization +/// would save). The row-filter path reports the same quantity via +/// `DatafusionArrowPredicate::other_projected_bytes_per_row`, so the +/// tracker can rank filters on a single common axis. +fn apply_post_scan_filters_with_stats( + batch: RecordBatch, + filters: &[(crate::selectivity::FilterId, Arc)], + other_bytes_per_row: &[f64], + tracker: &crate::selectivity::SelectivityTracker, +) -> Result { + use arrow::array::BooleanArray; + use arrow::compute::{and, filter_record_batch}; + use datafusion_common::cast::as_boolean_array; + + if batch.num_rows() == 0 { + return Ok(batch); + } + + let input_rows = batch.num_rows() as u64; + let mut combined_mask: Option = None; + + for (i, (id, expr)) in filters.iter().enumerate() { + // Mid-stream drop, mirror of `DatafusionArrowPredicate::evaluate`. + // Set by the tracker on `OptionalFilterPhysicalExpr` whose CI + // upper bound has fallen below `min_bytes_per_sec`; correctness is + // preserved because the originating join independently enforces + // the predicate. We do not update the tracker for a skipped batch. 
+ if tracker.is_filter_skipped(*id) { + continue; + } + + let start = datafusion_common::instant::Instant::now(); + let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?; + let bool_arr = as_boolean_array(result.as_ref())?; + let nanos = start.elapsed().as_nanos() as u64; + let num_matched = bool_arr.true_count() as u64; + + let other_bytes = (other_bytes_per_row[i] * input_rows as f64) as u64; + tracker.update(*id, num_matched, input_rows, nanos, other_bytes); + + if num_matched < input_rows { + combined_mask = Some(match combined_mask { + Some(prev) => and(&prev, bool_arr)?, + None => bool_arr.clone(), + }); } } + + match combined_mask { + Some(mask) => Ok(filter_record_batch(&batch, &mask)?), + None => Ok(batch), + } } /// State for a stream that decodes a single Parquet file using a push-based decoder. @@ -1234,6 +1481,23 @@ struct PushDecoderStreamState { predicate_cache_inner_records: Gauge, predicate_cache_records: Gauge, baseline_metrics: BaselineMetrics, + /// Post-scan filters expressed against `stream_schema` (the wide + /// schema the decoder yields, which includes filter-only columns added + /// to the projection mask specifically so post-scan can evaluate + /// against them). Applied to each decoded batch before `project_batch` + /// narrows the batch down to the user-requested projection; the + /// projector naturally drops the filter-only columns after filtering. + post_scan_filters: Vec<(crate::selectivity::FilterId, Arc)>, + /// Shared adaptive tracker fed per-filter per-batch stats from + /// `apply_post_scan_filters_with_stats`, mirroring what the row-filter + /// `DatafusionArrowPredicate::evaluate` path reports. + selectivity_tracker: Arc, + /// Pre-computed "bytes per row of the non-filter projection" for each + /// post-scan filter (same cost metric the row-filter path reports, so + /// promote/demote decisions compare apples to apples). 
+ post_scan_other_bytes_per_row: Vec, + /// Elapsed-time metric for post-scan filter application. + filter_apply_time: datafusion_physical_plan::metrics::Time, } impl PushDecoderStreamState { @@ -1271,11 +1535,45 @@ impl PushDecoderStreamState { Ok(DecodeResult::Data(batch)) => { let mut timer = self.baseline_metrics.elapsed_compute().timer(); self.copy_arrow_reader_metrics(); - let result = self.project_batch(&batch); - timer.stop(); - // Release the borrow on baseline_metrics before moving self - drop(timer); - return Some((result, self)); + // Apply post-scan filters against the pre-projection + // (wide) batch so any filter-only columns the decoder + // decoded for us are still present. After filtering, + // the projector strips them back out. + let filtered = if self.post_scan_filters.is_empty() { + Ok(batch) + } else { + let start = datafusion_common::instant::Instant::now(); + let out = apply_post_scan_filters_with_stats( + batch, + &self.post_scan_filters, + &self.post_scan_other_bytes_per_row, + &self.selectivity_tracker, + ); + self.filter_apply_time.add_elapsed(start); + out + }; + match filtered { + // Post-scan may filter every row in a batch. Skip + // fully-empty filtered batches and loop to the + // next decoded batch — emitting an empty batch + // would noisily turn into one visible output + // batch downstream. 
+ Ok(b) if b.num_rows() == 0 => { + timer.stop(); + continue; + } + Ok(b) => { + let result = self.project_batch(&b); + timer.stop(); + drop(timer); + return Some((result, self)); + } + Err(e) => { + timer.stop(); + drop(timer); + return Some((Err(e), self)); + } + } } Ok(DecodeResult::Finished) => { return None; @@ -1667,7 +1965,6 @@ mod test { metadata_size_hint: Option, metrics: ExecutionPlanMetricsSet, pushdown_filters: bool, - reorder_filters: bool, force_filter_selections: bool, enable_page_index: bool, enable_bloom_filter: bool, @@ -1693,7 +1990,6 @@ mod test { metadata_size_hint: None, metrics: ExecutionPlanMetricsSet::new(), pushdown_filters: false, - reorder_filters: false, force_filter_selections: false, enable_page_index: false, enable_bloom_filter: false, @@ -1741,12 +2037,6 @@ mod test { self } - /// Enable filter reordering. - fn with_reorder_filters(mut self, enable: bool) -> Self { - self.reorder_filters = enable; - self - } - /// Enable row group stats pruning. fn with_row_group_stats_pruning(mut self, enable: bool) -> Self { self.enable_row_group_stats_pruning = enable; @@ -1789,13 +2079,26 @@ mod test { ProjectionExprs::from_indices(&all_indices, &file_schema) }; + use datafusion_physical_expr::split_conjunction; + let predicate_conjuncts: Option< + Vec<(crate::selectivity::FilterId, Arc)>, + > = self.predicate.as_ref().map(|p| { + split_conjunction(p) + .into_iter() + .enumerate() + .map(|(id, expr)| (id, Arc::clone(expr))) + .collect() + }); + let selectivity_tracker = + Arc::new(crate::selectivity::SelectivityTracker::new()); ParquetMorselizer { partition_index: self.partition_index, projection, batch_size: self.batch_size, limit: self.limit, preserve_order: self.preserve_order, - predicate: self.predicate, + predicate_conjuncts, + selectivity_tracker, table_schema, metadata_size_hint: self.metadata_size_hint, metrics: self.metrics, @@ -1803,7 +2106,6 @@ mod test { DefaultParquetFileReaderFactory::new(store), ), pushdown_filters: 
self.pushdown_filters, - reorder_filters: self.reorder_filters, force_filter_selections: self.force_filter_selections, enable_page_index: self.enable_page_index, enable_bloom_filter: self.enable_bloom_filter, @@ -2241,7 +2543,6 @@ mod test { .with_projection_indices(&[0]) .with_predicate(predicate) .with_pushdown_filters(true) // note that this is true! - .with_reorder_filters(true) .build() }; diff --git a/datafusion/datasource-parquet/src/row_filter.rs b/datafusion/datasource-parquet/src/row_filter.rs index c5c372055826b..71ff6d54ea412 100644 --- a/datafusion/datasource-parquet/src/row_filter.rs +++ b/datafusion/datasource-parquet/src/row_filter.rs @@ -65,6 +65,7 @@ //! - `WHERE s['value'] > 5` — pushed down (accesses a primitive leaf) //! - `WHERE s IS NOT NULL` — not pushed down (references the whole struct) +use log::debug; use std::collections::BTreeSet; use std::sync::Arc; @@ -81,10 +82,10 @@ use parquet::schema::types::SchemaDescriptor; use datafusion_common::Result; use datafusion_common::cast::as_boolean_array; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; +use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_expr::ScalarFunctionExpr; use datafusion_physical_expr::expressions::{Column, Literal}; use datafusion_physical_expr::utils::{collect_columns, reassign_expr_columns}; -use datafusion_physical_expr::{PhysicalExpr, split_conjunction}; use datafusion_physical_plan::metrics; @@ -119,18 +120,49 @@ pub(crate) struct DatafusionArrowPredicate { rows_matched: metrics::Count, /// how long was spent evaluating this predicate time: metrics::Time, + /// Stable id used by the adaptive selectivity tracker to key per-filter + /// statistics across files. + filter_id: crate::selectivity::FilterId, + /// Shared handle to the adaptive selectivity tracker. Per-batch stats + /// are reported through `update()` after each `evaluate()` call. 
+ tracker: Arc, + /// Estimated *late-materialization savings* per row for this filter: + /// the compressed bytes of projection columns that the filter does + /// NOT reference, amortised across the file's rows. When a pruned + /// row is dropped by the filter, these are the bytes the reader + /// avoids decoding further along the pipeline — the quantity the + /// adaptive tracker needs in order to rank filters by "cost avoided + /// per unit evaluation time". This MUST match the metric the + /// post-scan path reports in `apply_post_scan_filters_with_stats` + /// (see `opener.rs::post_scan_other_bytes_per_row`); if the two + /// paths disagreed, the tracker would rank row-filter and post-scan + /// candidates on incomparable axes and wrongly promote or demote them. + other_projected_bytes_per_row: f64, + /// Mid-stream "drop" flag, shared with the + /// [`crate::selectivity::SelectivityTracker`]. The tracker flips this + /// when an `OptionalFilterPhysicalExpr` proves CPU-dominated and + /// ineffective; once set, [`Self::evaluate`] returns an all-true mask + /// without invoking `physical_expr`. Filter columns are still decoded + /// (the parquet decoder cannot be reconfigured mid-scan), so this only + /// reclaims CPU, not I/O. Flagged only for filters known to be + /// optional, so correctness is preserved by the join itself. + skip_flag: Arc, } impl DatafusionArrowPredicate { - /// Create a new `DatafusionArrowPredicate` from a `FilterCandidate` + /// Create a new `DatafusionArrowPredicate` from a `FilterCandidate`. 
pub fn try_new( candidate: FilterCandidate, rows_pruned: metrics::Count, rows_matched: metrics::Count, time: metrics::Time, + filter_id: crate::selectivity::FilterId, + tracker: Arc, + other_projected_bytes_per_row: f64, ) -> Result { let physical_expr = reassign_expr_columns(candidate.expr, &candidate.read_plan.projected_schema)?; + let skip_flag = tracker.skip_flag(filter_id); Ok(Self { physical_expr, @@ -138,6 +170,10 @@ impl DatafusionArrowPredicate { rows_pruned, rows_matched, time, + filter_id, + tracker, + other_projected_bytes_per_row, + skip_flag, }) } } @@ -148,10 +184,27 @@ impl ArrowPredicate for DatafusionArrowPredicate { } fn evaluate(&mut self, batch: RecordBatch) -> ArrowResult { + // Mid-stream drop: the tracker has decided this optional filter is + // pulling its weight no longer. Return an all-true mask to bypass + // expression evaluation entirely. We still bump `rows_matched` so + // the per-predicate count stays consistent with input rows; the + // tracker is intentionally NOT updated for skipped batches because + // (a) we have nothing meaningful to report and (b) flooding it + // with zero-cost samples would mask the underlying effectiveness + // signal if the flag is ever cleared. 
+ if self.skip_flag.load(std::sync::atomic::Ordering::Acquire) { + let rows_in_batch = batch.num_rows(); + self.rows_matched.add(rows_in_batch); + return Ok(BooleanArray::from(vec![true; rows_in_batch])); + } + // scoped timer updates on drop let mut timer = self.time.timer(); + let start_nanos = datafusion_common::instant::Instant::now(); - self.physical_expr + let rows_in_batch = batch.num_rows(); + let result = self + .physical_expr .evaluate(&batch) .and_then(|v| v.into_array(batch.num_rows())) .and_then(|array| { @@ -161,13 +214,38 @@ impl ArrowPredicate for DatafusionArrowPredicate { self.rows_pruned.add(num_pruned); self.rows_matched.add(num_matched); timer.stop(); - Ok(bool_arr) + Ok((bool_arr, num_matched)) }) .map_err(|e| { ArrowError::ComputeError(format!( "Error evaluating filter predicate: {e:?}" )) - }) + }); + + match result { + Ok((bool_arr, num_matched)) => { + let eval_nanos = start_nanos.elapsed().as_nanos() as u64; + // Report *late-materialization savings* (bytes of non-filter + // projection columns the decoder would have had to read for + // each pruned row), matching the post-scan path. This is the + // quantity the SelectivityTracker converts into the + // "bytes saved per second of evaluation time" effectiveness + // metric. Reporting the filter's own byte cost instead + // would invert the promote/demote rankings. + let batch_bytes = (rows_in_batch as f64 + * self.other_projected_bytes_per_row) + .round() as u64; + self.tracker.update( + self.filter_id, + num_matched as u64, + rows_in_batch as u64, + eval_nanos, + batch_bytes, + ); + Ok(bool_arr) + } + Err(e) => Err(e), + } } } @@ -991,95 +1069,167 @@ fn size_of_columns(columns: &[usize], metadata: &ParquetMetaData) -> Result)>; + +/// Build row-level filters for the row-filter partition chosen by the +/// adaptive selectivity tracker. 
/// -/// # Arguments -/// * `expr` - The filter predicate, already adapted to reference columns in `file_schema` -/// * `file_schema` - The Arrow schema of the parquet file (the result of converting -/// the parquet schema to Arrow, potentially with type coercions applied) -/// * `metadata` - Parquet file metadata used for cost estimation -/// * `reorder_predicates` - If true, reorder predicates to minimize I/O -/// * `file_metrics` - Metrics for tracking filter performance +/// Each input filter keeps its stable filter id so the resulting +/// `ArrowPredicate`s can report per-batch statistics back to the tracker on +/// each `evaluate()` call, driving future promote/demote decisions. /// -/// # Returns -/// * `Ok(Some(row_filter))` if the expression can be used as a RowFilter -/// * `Ok(None)` if the expression cannot be used as a RowFilter -/// * `Err(e)` if an error occurs while building the filter +/// Filters that cannot be represented as an `ArrowPredicate` (e.g. whole +/// struct references or other unsupported patterns) are returned in the +/// second element of the returned tuple so the opener can apply them +/// post-scan instead of silently dropping them. /// -/// Note: The returned `RowFilter` may not contain all conjuncts from the original -/// expression. Conjuncts that cannot be evaluated as an `ArrowPredicate` are ignored. +/// # Arguments +/// * `filters` — The candidate filters paired with their stable ids. Assumed +/// to already be adapted to reference columns in `file_schema`. +/// * `file_schema` — The Arrow schema of the parquet file. +/// * `metadata` — Parquet file metadata used for cost estimation. +/// * `projection_compressed_bytes` — Total compressed bytes the user +/// projection reads across the file. Used to derive the per-filter +/// *late-materialization savings* reported to the tracker, so that +/// row-filter and post-scan candidates are ranked on a single common +/// axis. +/// * `tracker` — Shared adaptive selectivity tracker. 
+/// * `file_metrics` — Metrics for tracking filter performance. /// -/// For example, if the expression is `a = 1 AND b = 2 AND c = 3` and `b = 2` -/// cannot be evaluated for some reason, the returned `RowFilter` will contain -/// only `a = 1` and `c = 3`. +/// # Returns +/// * `Ok((Some(row_filter), unbuildable))` when at least one filter could be +/// represented as a row-level predicate. +/// * `Ok((None, unbuildable))` when no filters could be represented as row +/// filters; all are returned in `unbuildable`. +/// * `Err(e)` if an error occurs while building the filter. pub fn build_row_filter( - expr: &Arc, + filters: &[(crate::selectivity::FilterId, Arc)], file_schema: &SchemaRef, metadata: &ParquetMetaData, - reorder_predicates: bool, + projection_compressed_bytes: usize, + tracker: &Arc, file_metrics: &ParquetFileMetrics, -) -> Result> { +) -> Result<(Option, UnbuildableFilters)> { let rows_pruned = &file_metrics.pushdown_rows_pruned; let rows_matched = &file_metrics.pushdown_rows_matched; let time = &file_metrics.row_pushdown_eval_time; - // Split into conjuncts: - // `a = 1 AND b = 2 AND c = 3` -> [`a = 1`, `b = 2`, `c = 3`] - let predicates = split_conjunction(expr); - - // Determine which conjuncts can be evaluated as ArrowPredicates, if any - let mut candidates: Vec = predicates - .into_iter() - .map(|expr| { - FilterCandidateBuilder::new(Arc::clone(expr), Arc::clone(file_schema)) - .build(metadata) - }) - .collect::, _>>()? - .into_iter() - .flatten() - .collect(); - - // no candidates - if candidates.is_empty() { - return Ok(None); + // Total rows in the file, used to amortise compressed-byte totals + // over rows. We floor at 1 so empty files don't divide by zero; any + // effectiveness contribution in that degenerate case is irrelevant + // because there are no batches to track anyway. 
+ let total_rows: i64 = metadata.row_groups().iter().map(|rg| rg.num_rows()).sum(); + let total_rows_f = total_rows.max(1) as f64; + + // Try to build a candidate for each filter independently. Any filter + // that can't be represented as an `ArrowPredicate`, for *any* reason + // (the candidate builder returned `None`, the builder returned an + // `Err`, or the `DatafusionArrowPredicate` constructor failed below), + // falls through into `unbuildable` so the caller can apply it + // post-scan. Silently dropping any conjunct here would relax the + // user's predicate and return wrong results — see the + // `post_scan_conjuncts` fallthrough in + // `ParquetOpener::build_stream`. + let mut buildable: Vec<(crate::selectivity::FilterId, FilterCandidate)> = + Vec::with_capacity(filters.len()); + let mut unbuildable: UnbuildableFilters = Vec::new(); + for (id, expr) in filters { + match FilterCandidateBuilder::new(Arc::clone(expr), Arc::clone(file_schema)) + .build(metadata) + { + Ok(Some(c)) => buildable.push((*id, c)), + Ok(None) => unbuildable.push((*id, Arc::clone(expr))), + Err(e) => { + debug!( + "failed to build row-filter candidate for {id}: {e}; falling through to post-scan" + ); + unbuildable.push((*id, Arc::clone(expr))); + } + } } - if reorder_predicates { - candidates.sort_unstable_by_key(|c| c.required_bytes); + if buildable.is_empty() { + return Ok((None, unbuildable)); } // To avoid double-counting metrics when multiple predicates are used: - // - All predicates should count rows_pruned (cumulative pruned rows) - // - Only the last predicate should count rows_matched (final result) - // This ensures: rows_matched + rows_pruned = total rows processed - let total_candidates = candidates.len(); - - candidates - .into_iter() - .enumerate() - .map(|(idx, candidate)| { - let is_last = idx == total_candidates - 1; - - // All predicates share the pruned counter (cumulative) - let predicate_rows_pruned = rows_pruned.clone(); - - // Only the last predicate tracks 
matched rows (final result) - let predicate_rows_matched = if is_last { - rows_matched.clone() - } else { - metrics::Count::new() - }; + // - All predicates share the cumulative rows_pruned counter + // - Only the last predicate writes to rows_matched (final pass count) + // This preserves the invariant: rows_matched + rows_pruned = total rows. + let total_candidates = buildable.len(); + + let mut predicates: Vec> = + Vec::with_capacity(total_candidates); + for (idx, (filter_id, candidate)) in buildable.into_iter().enumerate() { + let is_last = idx == total_candidates - 1; + let predicate_rows_pruned = rows_pruned.clone(); + let predicate_rows_matched = if is_last { + rows_matched.clone() + } else { + metrics::Count::new() + }; + // Late-materialization savings: bytes of the *non-filter* portion + // of the projection, per row. When the filter prunes a row, the + // decoder avoids decoding these bytes further downstream — that + // is the quantity the tracker needs as `batch_bytes` so its + // effectiveness metric (bytes-saved / eval-time) ranks filters + // by actual savings rather than by their own read cost. Match the + // post-scan path's formula in + // `opener.rs::post_scan_other_bytes_per_row`. + let other_projected_bytes_per_row = + projection_compressed_bytes.saturating_sub(candidate.required_bytes) as f64 + / total_rows_f; + // Remember the original expression before we move `candidate` into + // `try_new`, so that a failed predicate construction can fall back + // into `unbuildable` rather than being silently dropped. 
+ let original_expr = Arc::clone(&candidate.expr); + match DatafusionArrowPredicate::try_new( + candidate, + predicate_rows_pruned, + predicate_rows_matched, + time.clone(), + filter_id, + Arc::clone(tracker), + other_projected_bytes_per_row, + ) { + Ok(pred) => predicates.push(Box::new(pred) as _), + Err(e) => { + debug!( + "failed to construct ArrowPredicate for filter {filter_id}: {e}; \ + falling through to post-scan" + ); + unbuildable.push((filter_id, original_expr)); + } + } + } - DatafusionArrowPredicate::try_new( - candidate, - predicate_rows_pruned, - predicate_rows_matched, - time.clone(), - ) - .map(|pred| Box::new(pred) as _) - }) - .collect::, _>>() - .map(|filters| Some(RowFilter::new(filters))) + if predicates.is_empty() { + Ok((None, unbuildable)) + } else { + Ok((Some(RowFilter::new(predicates)), unbuildable)) + } +} + +/// Estimate the total on-disk (compressed) byte cost of reading the given +/// leaf column indices across every row group in the file. Used by the +/// adaptive [`crate::selectivity::SelectivityTracker`] as a cheap proxy for +/// filter evaluation cost before runtime stats are available. 
+pub(crate) fn total_compressed_bytes( + column_indices: &[usize], + metadata: &ParquetMetaData, +) -> usize { + let mut total: i64 = 0; + for rg in metadata.row_groups() { + for &idx in column_indices { + if let Some(col) = rg.columns().get(idx) { + total += col.compressed_size(); + } + } + } + total.max(0) as usize } #[cfg(test)] @@ -1183,11 +1333,15 @@ mod test { .expect("building candidate") .expect("candidate expected"); + let test_tracker = Arc::new(crate::selectivity::SelectivityTracker::new()); let mut row_filter = DatafusionArrowPredicate::try_new( candidate, Count::new(), Count::new(), Time::new(), + 0, + Arc::clone(&test_tracker), + 0.0, ) .expect("creating filter predicate"); @@ -1222,11 +1376,15 @@ mod test { .expect("building candidate") .expect("candidate expected"); + let test_tracker = Arc::new(crate::selectivity::SelectivityTracker::new()); let mut row_filter = DatafusionArrowPredicate::try_new( candidate, Count::new(), Count::new(), Time::new(), + 0, + Arc::clone(&test_tracker), + 0.0, ) .expect("creating filter predicate"); @@ -1371,10 +1529,18 @@ mod test { let file_metrics = ParquetFileMetrics::new(0, &format!("{func_name}.parquet"), &metrics); - let row_filter = - build_row_filter(&expr, &file_schema, &metadata, false, &file_metrics) - .expect("building row filter") - .expect("row filter should exist"); + let tracker = Arc::new(crate::selectivity::SelectivityTracker::new()); + let filters = vec![(0usize, expr)]; + let (row_filter, _unbuildable) = build_row_filter( + &filters, + &file_schema, + &metadata, + 0, + &tracker, + &file_metrics, + ) + .expect("building row filter"); + let row_filter = row_filter.expect("row filter should exist"); let reader = parquet_reader_builder .with_row_filter(row_filter) @@ -1949,10 +2115,18 @@ mod test { let metrics = ExecutionPlanMetricsSet::new(); let file_metrics = ParquetFileMetrics::new(0, "struct_e2e.parquet", &metrics); - let row_filter = - build_row_filter(&expr, &file_schema, &metadata, false, 
&file_metrics) - .expect("building row filter") - .expect("row filter should exist"); + let tracker = Arc::new(crate::selectivity::SelectivityTracker::new()); + let filters = vec![(0usize, expr)]; + let (row_filter, _unbuildable) = build_row_filter( + &filters, + &file_schema, + &metadata, + 0, + &tracker, + &file_metrics, + ) + .expect("building row filter"); + let row_filter = row_filter.expect("row filter should exist"); let reader = parquet_reader_builder .with_row_filter(row_filter) diff --git a/datafusion/datasource-parquet/src/selectivity.rs b/datafusion/datasource-parquet/src/selectivity.rs new file mode 100644 index 0000000000000..cbcee64190978 --- /dev/null +++ b/datafusion/datasource-parquet/src/selectivity.rs @@ -0,0 +1,2015 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Adaptive filter selectivity tracking for Parquet row filters. +//! +//! See [`SelectivityTracker`] for the main entry point, `FilterState` for the +//! per-filter lifecycle, `PartitionedFilters` for the output consumed by +//! `ParquetOpener::open`, and [`FilterId`] for stable filter identification. 
+ +use log::debug; +use parking_lot::{Mutex, RwLock}; +use parquet::file::metadata::ParquetMetaData; +use std::collections::HashMap; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; + +use datafusion_physical_expr::utils::collect_columns; +use datafusion_physical_expr_common::physical_expr::{ + OptionalFilterPhysicalExpr, PhysicalExpr, snapshot_generation, +}; + +/// Stable identifier for a filter conjunct, assigned by `ParquetSource::with_predicate`. +pub type FilterId = usize; + +/// Re-evaluate the per-filter skip flag every Nth batch update. The CI +/// upper bound is a couple of arithmetic ops so this cap mostly serves to +/// keep cache lines for `is_optional` / `skip_flags` cold on the hot path. +const SKIP_FLAG_CHECK_INTERVAL: u64 = 4; + +/// Per-filter lifecycle state in the adaptive filter system. +/// +/// State transitions: +/// - **(unseen)** → [`RowFilter`](Self::RowFilter) or [`PostScan`](Self::PostScan) +/// on first encounter in [`SelectivityTracker::partition_filters`]. +/// - [`PostScan`](Self::PostScan) → [`RowFilter`](Self::RowFilter) when +/// effectiveness ≥ `min_bytes_per_sec` and enough rows have been observed. +/// - [`RowFilter`](Self::RowFilter) → [`PostScan`](Self::PostScan) when +/// effectiveness is below threshold (mandatory filter). +/// - [`RowFilter`](Self::RowFilter) → [`Dropped`](Self::Dropped) when +/// effectiveness is below threshold and the filter is optional +/// ([`OptionalFilterPhysicalExpr`]). +/// - [`RowFilter`](Self::RowFilter) → [`PostScan`](Self::PostScan)/[`Dropped`](Self::Dropped) +/// on periodic re-evaluation if effectiveness drops below threshold after +/// CI upper bound drops below threshold. +/// - **Any state** → re-evaluated when a dynamic filter's +/// `snapshot_generation` changes. +#[derive(Debug, Clone, Copy, PartialEq)] +pub(crate) enum FilterState { + /// Currently a row filter. + RowFilter, + /// Currently a post-scan filter. 
+ PostScan, + /// Dropped entirely (insufficient throughput and optional). + Dropped, +} + +/// Result of partitioning filters into row filters vs post-scan. +/// +/// Produced by [`SelectivityTracker::partition_filters`], consumed by +/// `ParquetOpener::open` to build row-level predicates and post-scan filters. +/// +/// Filters are partitioned based on their effectiveness threshold. +#[derive(Debug, Clone, Default)] +pub(crate) struct PartitionedFilters { + /// Filters promoted past collection — individual chained ArrowPredicates + pub(crate) row_filters: Vec<(FilterId, Arc)>, + /// Filters demoted to post-scan (fast path only) + pub(crate) post_scan: Vec<(FilterId, Arc)>, +} + +/// Tracks selectivity statistics for a single filter expression. +#[derive(Debug, Clone, Default, Copy, PartialEq)] +struct SelectivityStats { + /// Number of rows that matched (passed) the filter + rows_matched: u64, + /// Total number of rows evaluated + rows_total: u64, + /// Cumulative evaluation time in nanoseconds + eval_nanos: u64, + /// Cumulative bytes across batches this filter has been evaluated on + bytes_seen: u64, + /// Welford's online algorithm: number of per-batch effectiveness samples + sample_count: u64, + /// Welford's online algorithm: running mean of per-batch effectiveness + eff_mean: f64, + /// Welford's online algorithm: running sum of squared deviations (M2) + eff_m2: f64, +} + +impl SelectivityStats { + /// Returns the effectiveness as an opaque ordering score (higher = run first). + /// + /// Currently computed as bytes/sec throughput using self-contained stats. + /// Callers should not assume the unit. 
+ fn effectiveness(&self) -> Option { + if self.rows_total == 0 || self.eval_nanos == 0 || self.bytes_seen == 0 { + return None; + } + let rows_pruned = self.rows_total - self.rows_matched; + let bytes_per_row = self.bytes_seen as f64 / self.rows_total as f64; + let bytes_saved = rows_pruned as f64 * bytes_per_row; + Some(bytes_saved * 1_000_000_000.0 / self.eval_nanos as f64) + } + + /// Returns the lower bound of a confidence interval on mean effectiveness. + /// + /// Uses Welford's online variance to compute a one-sided CI: + /// `mean - z * stderr`. Returns `None` if fewer than 2 samples. + fn confidence_lower_bound(&self, confidence_z: f64) -> Option { + if self.sample_count < 2 { + return None; + } + let variance = self.eff_m2 / (self.sample_count - 1) as f64; + let stderr = (variance / self.sample_count as f64).sqrt(); + Some(self.eff_mean - confidence_z * stderr) + } + + /// Returns the upper bound of a confidence interval on mean effectiveness. + /// + /// Uses Welford's online variance: `mean + z * stderr`. + /// Returns `None` if fewer than 2 samples. + fn confidence_upper_bound(&self, confidence_z: f64) -> Option { + if self.sample_count < 2 { + return None; + } + let variance = self.eff_m2 / (self.sample_count - 1) as f64; + let stderr = (variance / self.sample_count as f64).sqrt(); + Some(self.eff_mean + confidence_z * stderr) + } + + /// Update stats with new observations. + fn update(&mut self, matched: u64, total: u64, eval_nanos: u64, batch_bytes: u64) { + self.rows_matched += matched; + self.rows_total += total; + self.eval_nanos += eval_nanos; + self.bytes_seen += batch_bytes; + + // Feed Welford's algorithm with per-batch effectiveness. We admit + // samples with `batch_bytes == 0` — that legitimately represents a + // filter whose projection is a subset of its referenced columns, so + // late materialization has nothing to save even when the filter + // does prune rows. 
Recording `batch_eff = 0` for such batches lets + // the mid-stream skip path detect "CPU spent, no late- + // materialization payoff" and drop the filter if it is optional. + if total > 0 && eval_nanos > 0 { + let rows_pruned = total - matched; + let bytes_per_row = if total > 0 { + batch_bytes as f64 / total as f64 + } else { + 0.0 + }; + let batch_eff = + (rows_pruned as f64 * bytes_per_row) * 1e9 / eval_nanos as f64; + + self.sample_count += 1; + let delta = batch_eff - self.eff_mean; + self.eff_mean += delta / self.sample_count as f64; + let delta2 = batch_eff - self.eff_mean; + self.eff_m2 += delta * delta2; + } + } +} + +/// Immutable configuration for a [`SelectivityTracker`]. +/// +/// Use the builder methods to customise, then call [`build()`](TrackerConfig::build) +/// to produce a ready-to-use tracker. +pub(crate) struct TrackerConfig { + /// Minimum bytes/sec throughput for promoting a filter (default: INFINITY = disabled). + pub min_bytes_per_sec: f64, + /// Byte-ratio threshold for initial filter placement (row-level vs post-scan). + /// Computed as `filter_compressed_bytes / projection_compressed_bytes`. + /// When low, the filter columns are small relative to the projection, + /// so row-level placement enables large late-materialization savings. + /// When high, the filter columns dominate the projection, so there's + /// little benefit from late materialization. + /// Default is 0.20. + pub byte_ratio_threshold: f64, + /// Z-score for confidence intervals on filter effectiveness. + /// Lower values (e.g. 1.0 or 0.0) will make the tracker more aggressive about promotion/demotion based on limited data. + /// Higher values (e.g. 3.0) will require more confidence before changing filter states. + /// Default is 2.0, corresponding to ~97.5% one-sided confidence. + /// Set to <= 0.0 to disable confidence intervals and promote/demote based on point estimates alone (not recommended). 
+ /// Set to INFINITY to disable promotion entirely (overrides `min_bytes_per_sec`). + pub confidence_z: f64, +} + +impl TrackerConfig { + pub fn new() -> Self { + Self { + min_bytes_per_sec: f64::INFINITY, + byte_ratio_threshold: 0.20, + confidence_z: 2.0, + } + } + + pub fn with_min_bytes_per_sec(mut self, v: f64) -> Self { + self.min_bytes_per_sec = v; + self + } + + pub fn with_byte_ratio_threshold(mut self, v: f64) -> Self { + self.byte_ratio_threshold = v; + self + } + + pub fn with_confidence_z(mut self, v: f64) -> Self { + self.confidence_z = v; + self + } + + pub fn build(self) -> SelectivityTracker { + SelectivityTracker { + config: self, + filter_stats: RwLock::new(HashMap::new()), + skip_flags: RwLock::new(HashMap::new()), + is_optional: RwLock::new(HashMap::new()), + inner: Mutex::new(SelectivityTrackerInner::new()), + } + } +} + +impl Default for TrackerConfig { + fn default() -> Self { + Self::new() + } +} + +/// Cross-file adaptive system that measures filter effectiveness and decides +/// which filters are promoted to row-level predicates (pushed into the Parquet +/// reader) vs. applied post-scan (demoted) or dropped entirely. +/// +/// # Locking design +/// +/// All locks are **private** to this struct — external callers cannot hold a +/// guard across expensive work, and all lock-holding code paths are auditable +/// in this file alone. +/// +/// State is split across two independent locks to minimise contention between +/// the hot per-batch `update()` path and the cold per-file-open +/// `partition_filters()` path: +/// +/// - **`filter_stats`** (`RwLock>>`) +/// — `update()` acquires a *shared read* lock on the outer map, then a +/// per-filter `Mutex` to increment counters. Multiple threads updating +/// *different* filters never contend at all; threads updating the *same* +/// filter serialize only on the cheap per-filter `Mutex` (~100 ns). 
+/// `partition_filters()` also takes a read lock here when it needs to +/// inspect stats for promotion/demotion decisions, so it never blocks +/// `update()` callers. The write lock is taken only briefly in Phase 2 +/// of `partition_filters()` to insert entries for newly-seen filter IDs. +/// +/// - **`inner`** (`Mutex`) — holds the filter +/// state-machine (`filter_states`) and dynamic-filter generation tracking. +/// Only `partition_filters()` acquires this lock (once per file open), so +/// concurrent `update()` calls are completely unaffected. +/// +/// ## Lock ordering (deadlock-free) +/// +/// Locks are always acquired in the order `inner` → `filter_stats` → +/// per-filter `Mutex`. Because `update()` never acquires `inner`, no +/// cycle is possible. +/// +/// ## Correctness of concurrent access +/// +/// `update()` may write stats while `partition_filters()` reads them for +/// promotion/demotion. Both hold a shared `filter_stats` read lock; the +/// per-filter `Mutex` ensures they do not interleave on the same filter's +/// stats. One proceeds first; the other sees a consistent (slightly newer +/// or older) snapshot. This is benign — the single-lock design that +/// preceded this split already allowed stats to change between consecutive +/// reads within `partition_filters()`. +/// +/// On promote/demote, `partition_filters()` zeros a filter's stats via the +/// per-filter `Mutex`. An `update()` running concurrently may write one +/// stale batch's worth of data to the freshly-zeroed stats; this is quickly +/// diluted by hundreds of correct-context batches and is functionally +/// identical to the old design where `update()` queued behind the write +/// lock and ran immediately after. 
+/// +/// # Filter state machine +/// +/// ```text +/// ┌─────────┐ +/// │ New │ +/// └─────────┘ +/// │ +/// ▼ +/// ┌────────────────────────┐ +/// │ Estimated Cost │ +/// │Bytes needed for filter │ +/// └────────────────────────┘ +/// │ +/// ┌──────────────────┴──────────────────┐ +/// ┌────────▼────────┐ ┌────────▼────────┐ +/// │ Post-scan │ │ Row filter │ +/// │ │ │ │ +/// └─────────────────┘ └─────────────────┘ +/// │ │ +/// ▼ ▼ +/// ┌─────────────────┐ ┌─────────────────┐ +/// │ Effectiveness │ │ Effectiveness │ +/// │ Bytes pruned │ │ Bytes pruned │ +/// │ per │ │ per │ +/// │Second of compute│ │Second of compute│ +/// └─────────────────┘ └─────────────────┘ +/// │ │ +/// └──────────────────┬──────────────────┘ +/// ▼ +/// ┌───────────────────────────────────────────────┐ +/// │ New Scan │ +/// │ Move filters based on effectiveness. │ +/// │ Promote (move post-scan -> row filter). │ +/// │ Demote (move row-filter -> post-scan). │ +/// │ Disable (for optional filters; either row │ +/// │ filter or disabled). │ +/// └───────────────────────────────────────────────┘ +/// │ +/// ┌──────────────────┴──────────────────┐ +/// ┌────────▼────────┐ ┌────────▼────────┐ +/// │ Post-scan │ │ Row filter │ +/// │ │ │ │ +/// └─────────────────┘ └─────────────────┘ +/// ``` +/// +/// See `TrackerConfig` for configuration knobs. +pub struct SelectivityTracker { + config: TrackerConfig, + /// Per-filter selectivity statistics, each individually `Mutex`-protected. + /// + /// The outer `RwLock` is almost always read-locked: both `update()` (hot, + /// per-batch) and `partition_filters()` (cold, per-file-open) only need + /// shared access to look up existing entries. The write lock is taken + /// only when `partition_filters()` inserts entries for newly-seen filter + /// IDs — a brief, infrequent operation. 
+ /// + /// Each inner `Mutex` protects a single filter's + /// counters, so concurrent `update()` calls on *different* filters + /// proceed in parallel with zero contention. + filter_stats: RwLock>>, + /// Per-filter "skip" flags — when set, the corresponding filter is + /// treated as a no-op by both the row-filter + /// (`DatafusionArrowPredicate::evaluate`) and the post-scan path + /// (`apply_post_scan_filters_with_stats`). This is the mid-stream + /// equivalent of dropping an optional filter: once the per-batch + /// `update()` path proves an `OptionalFilterPhysicalExpr` is + /// CPU-dominated and ineffective, it flips the flag and subsequent + /// batches stop paying the evaluation cost. The decoder still decodes + /// the filter columns (we cannot rebuild it mid-scan), so I/O is not + /// reclaimed; only the predicate evaluation is skipped. + /// + /// Only ever set for filters whose `is_optional` entry is `true` — + /// mandatory filters must always execute or queries return wrong rows. + skip_flags: RwLock>>, + /// Whether each filter is wrapped in an `OptionalFilterPhysicalExpr`, + /// captured at first-encounter in `partition_filters` so the per-batch + /// `update()` path can decide whether the filter is safe to no-op + /// without re-inspecting the expression tree on every batch. + is_optional: RwLock>, + /// Filter lifecycle state machine and dynamic-filter generation tracking. + /// + /// Only `partition_filters()` acquires this lock (once per file open). + /// `update()` never touches it, so the hot per-batch path is completely + /// decoupled from the cold state-machine path. 
+ inner: Mutex, +} + +impl std::fmt::Debug for SelectivityTracker { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SelectivityTracker") + .field("config.min_bytes_per_sec", &self.config.min_bytes_per_sec) + .finish() + } +} + +impl Default for SelectivityTracker { + fn default() -> Self { + Self::new() + } +} + +impl SelectivityTracker { + /// Create a new tracker with default settings (feature disabled). + pub fn new() -> Self { + TrackerConfig::new().build() + } + + /// Update stats for a filter after processing a batch. + /// + /// **Locking:** acquires `filter_stats.read()` (shared) then a per-filter + /// `Mutex`. Never touches `inner`, so this hot per-batch path cannot + /// contend with the cold per-file-open `partition_filters()` path. + /// + /// Silently skips unknown filter IDs (can occur if `update()` is called + /// before `partition_filters()` has registered the filter — in practice + /// this cannot happen because `partition_filters()` runs during file open + /// before any batches are processed). + /// + /// **Mid-stream drop:** after every `SKIP_FLAG_CHECK_INTERVAL`'th batch + /// we evaluate the CI upper bound; if it falls below + /// `min_bytes_per_sec` and the filter is wrapped in + /// `OptionalFilterPhysicalExpr`, we set the per-filter skip flag. + /// Subsequent calls to `DatafusionArrowPredicate::evaluate` (row-level) + /// and `apply_post_scan_filters_with_stats` (post-scan) observe the + /// flag and short-circuit their work for that filter. Mandatory + /// filters are never flagged because doing so would change the result + /// set. + pub(crate) fn update( + &self, + id: FilterId, + matched: u64, + total: u64, + eval_nanos: u64, + batch_bytes: u64, + ) { + let stats_map = self.filter_stats.read(); + let Some(entry) = stats_map.get(&id) else { + return; + }; + let mut stats = entry.lock(); + stats.update(matched, total, eval_nanos, batch_bytes); + + // Mid-stream drop check. 
Only consult the skip mechanism for + // filters we already know to be optional, and only after enough + // samples for `confidence_upper_bound` to be defined. The modulo + // gate keeps the per-batch overhead tiny on the hot path. + if !self.config.min_bytes_per_sec.is_finite() + || !stats.sample_count.is_multiple_of(SKIP_FLAG_CHECK_INTERVAL) + { + return; + } + let Some(ub) = stats.confidence_upper_bound(self.config.confidence_z) else { + return; + }; + if ub >= self.config.min_bytes_per_sec { + return; + } + drop(stats); + drop(stats_map); + + // Optionality is captured at first sight in `partition_filters` so + // we can answer this without re-walking the expression tree. + let is_optional = self.is_optional.read().get(&id).copied().unwrap_or(false); + if !is_optional { + return; + } + if let Some(flag) = self.skip_flags.read().get(&id) + && !flag.swap(true, Ordering::Release) + { + debug!( + "FilterId {id}: mid-stream skip — CI upper bound {ub} < {} bytes/sec", + self.config.min_bytes_per_sec + ); + } + } + + /// Returns the shared skip flag for `id`, creating one if absent. + /// + /// Cloned into [`crate::row_filter::DatafusionArrowPredicate`] so the + /// row-filter path can short-circuit when the per-batch update path + /// decides the filter has stopped pulling its weight. The post-scan + /// path uses [`Self::is_filter_skipped`] instead — it does not need a + /// long-lived handle. + pub(crate) fn skip_flag(&self, id: FilterId) -> Arc { + if let Some(existing) = self.skip_flags.read().get(&id) { + return Arc::clone(existing); + } + let mut write = self.skip_flags.write(); + Arc::clone( + write + .entry(id) + .or_insert_with(|| Arc::new(AtomicBool::new(false))), + ) + } + + /// Returns `true` when `id` has been mid-stream-dropped by the tracker. + /// + /// Cheap: a single `RwLock::read` plus an atomic load. Called from the + /// post-scan filter loop in `apply_post_scan_filters_with_stats`. 
+ pub(crate) fn is_filter_skipped(&self, id: FilterId) -> bool { + self.skip_flags + .read() + .get(&id) + .is_some_and(|f| f.load(Ordering::Acquire)) + } + + /// Partition filters into row-level predicates vs post-scan filters. + /// + /// Called once per file open (cold path). + /// + /// **Locking — two phases:** + /// 1. Acquires `inner` (exclusive) and `filter_stats` (shared read) for + /// all decision logic — promotion, demotion, initial placement, and + /// sorting by effectiveness. Because `filter_stats` is only + /// read-locked, concurrent `update()` calls proceed unblocked. + /// 2. If new filter IDs were seen, briefly acquires `filter_stats` (write) + /// to insert per-filter `Mutex` entries so that future `update()` calls + /// can find them. + pub(crate) fn partition_filters( + &self, + filters: Vec<(FilterId, Arc)>, + projection_scan_size: usize, + metadata: &ParquetMetaData, + ) -> PartitionedFilters { + // Phase 1: inner.lock() + filter_stats.read() → all decision logic + let mut guard = self.inner.lock(); + let stats_map = self.filter_stats.read(); + let result = guard.partition_filters( + filters, + projection_scan_size, + metadata, + &self.config, + &stats_map, + ); + drop(stats_map); + drop(guard); + + // Phase 2: if new filters were seen, briefly acquire write lock to insert entries + if !result.new_filter_ids.is_empty() { + let mut stats_write = self.filter_stats.write(); + for id in &result.new_filter_ids { + stats_write + .entry(*id) + .or_insert_with(|| Mutex::new(SelectivityStats::default())); + } + } + if !result.new_optional_flags.is_empty() { + let mut optional_write = self.is_optional.write(); + let mut skip_write = self.skip_flags.write(); + for (id, is_optional) in result.new_optional_flags { + optional_write.entry(id).or_insert(is_optional); + skip_write + .entry(id) + .or_insert_with(|| Arc::new(AtomicBool::new(false))); + } + } + + result.partitioned + } + + /// Test helper: ensure a stats entry exists for the given filter ID. 
+ /// In production, `partition_filters()` inserts entries for new filters. + /// Tests that call `update()` without prior `partition_filters()` need this. + #[cfg(test)] + fn ensure_stats_entry(&self, id: FilterId) { + let map = self.filter_stats.read(); + if map.get(&id).is_none() { + drop(map); + self.filter_stats + .write() + .entry(id) + .or_insert_with(|| Mutex::new(SelectivityStats::default())); + } + } +} + +/// Internal result from [`SelectivityTrackerInner::partition_filters`]. +/// +/// Carries both the partitioned filters and a list of newly-seen filter IDs +/// back to the outer [`SelectivityTracker::partition_filters`], which uses +/// `new_filter_ids` to insert per-filter `Mutex` entries into `filter_stats` +/// in a brief Phase 2 write lock. +struct PartitionResult { + partitioned: PartitionedFilters, + new_filter_ids: Vec, + /// `(FilterId, is_optional)` entries observed for the first time in this + /// `partition_filters` call. The outer `SelectivityTracker` records + /// optionality alongside `filter_stats` so that the hot `update()` path + /// can decide whether the per-filter skip flag is safe to flip without + /// inspecting the expression tree. + new_optional_flags: Vec<(FilterId, bool)>, +} + +/// Filter state-machine and generation tracking, guarded by the `Mutex` +/// inside [`SelectivityTracker`]. +/// +/// This struct intentionally does **not** contain per-filter stats — those +/// live in the separate `filter_stats` lock so that the hot `update()` path +/// can modify stats without acquiring this lock. Only the cold +/// `partition_filters()` path (once per file open) needs this lock. +#[derive(Debug)] +struct SelectivityTrackerInner { + /// Per-filter lifecycle state (RowFilter / PostScan / Dropped). + filter_states: HashMap, + /// Last-seen snapshot generation per filter, for detecting when a dynamic + /// filter's selectivity has changed (e.g. hash-join build side grew). 
+ snapshot_generations: HashMap, +} + +impl SelectivityTrackerInner { + fn new() -> Self { + Self { + filter_states: HashMap::new(), + snapshot_generations: HashMap::new(), + } + } + + /// Check and update the snapshot generation for a filter. + fn note_generation( + &mut self, + id: FilterId, + generation: u64, + stats_map: &HashMap>, + ) { + if generation == 0 { + return; + } + match self.snapshot_generations.get(&id) { + Some(&prev_generation) if prev_generation == generation => {} + Some(_) => { + let current_state = self.filter_states.get(&id).copied(); + // Always reset stats since selectivity changed with new generation. + if let Some(entry) = stats_map.get(&id) { + *entry.lock() = SelectivityStats::default(); + } + self.snapshot_generations.insert(id, generation); + + // Optional/dynamic filters only get more selective over time + // (hash join build side accumulates more values). So if the + // filter was already working (RowFilter or PostScan), preserve + // its state. Only un-drop Dropped filters back to PostScan + // so they get another chance with the new selectivity. + if current_state == Some(FilterState::Dropped) { + debug!("FilterId {id} generation changed, un-dropping to PostScan"); + self.filter_states.insert(id, FilterState::PostScan); + } else { + debug!( + "FilterId {id} generation changed, resetting stats but preserving state {current_state:?}" + ); + } + } + None => { + self.snapshot_generations.insert(id, generation); + } + } + } + + /// Get the effectiveness for a filter by ID. + fn get_effectiveness_by_id( + &self, + id: FilterId, + stats_map: &HashMap>, + ) -> Option { + stats_map + .get(&id) + .and_then(|entry| entry.lock().effectiveness()) + } + + /// Demote a filter to post-scan or drop it entirely if optional. 
+ fn demote_or_drop( + &mut self, + id: FilterId, + expr: &Arc, + post_scan: &mut Vec<(FilterId, Arc)>, + stats_map: &HashMap>, + ) { + if expr.downcast_ref::().is_none() { + self.filter_states.insert(id, FilterState::PostScan); + post_scan.push((id, Arc::clone(expr))); + // Reset stats for this filter so it can be re-evaluated as a post-scan filter. + if let Some(entry) = stats_map.get(&id) { + *entry.lock() = SelectivityStats::default(); + } + } else { + self.filter_states.insert(id, FilterState::Dropped); + } + } + + /// Promote a filter to row-level. + fn promote( + &mut self, + id: FilterId, + expr: Arc, + row_filters: &mut Vec<(FilterId, Arc)>, + stats_map: &HashMap>, + ) { + row_filters.push((id, expr)); + self.filter_states.insert(id, FilterState::RowFilter); + // Reset stats for this filter since it will be evaluated at row-level now. + if let Some(entry) = stats_map.get(&id) { + *entry.lock() = SelectivityStats::default(); + } + } + + /// Partition filters into collecting / promoted / post-scan buckets. + fn partition_filters( + &mut self, + filters: Vec<(FilterId, Arc)>, + projection_scan_size: usize, + metadata: &ParquetMetaData, + config: &TrackerConfig, + stats_map: &HashMap>, + ) -> PartitionResult { + let mut new_filter_ids = Vec::new(); + let mut new_optional_flags: Vec<(FilterId, bool)> = Vec::new(); + + // If min_bytes_per_sec is INFINITY -> all filters are post-scan. 
+ if config.min_bytes_per_sec.is_infinite() { + debug!( + "Filter promotion disabled via min_bytes_per_sec=INFINITY; all {} filters post-scan", + filters.len() + ); + // Register all filter IDs so update() can find them + for (id, expr) in &filters { + if !stats_map.contains_key(id) { + new_filter_ids.push(*id); + new_optional_flags.push((*id, is_optional_filter(expr))); + } + } + return PartitionResult { + partitioned: PartitionedFilters { + row_filters: Vec::new(), + post_scan: filters, + }, + new_filter_ids, + new_optional_flags, + }; + } + // If min_bytes_per_sec is 0 -> all filters are promoted. + if config.min_bytes_per_sec == 0.0 { + debug!( + "All filters promoted via min_bytes_per_sec=0; all {} filters row-level", + filters.len() + ); + // Register all filter IDs so update() can find them + for (id, expr) in &filters { + if !stats_map.contains_key(id) { + new_filter_ids.push(*id); + new_optional_flags.push((*id, is_optional_filter(expr))); + } + } + return PartitionResult { + partitioned: PartitionedFilters { + row_filters: filters, + post_scan: Vec::new(), + }, + new_filter_ids, + new_optional_flags, + }; + } + + // Note snapshot generations for dynamic filter detection. + // This clears stats for any filter whose generation has changed since the last scan. + // This must be done before any other logic since it can change filter states and stats. + for &(id, ref expr) in &filters { + let generation = snapshot_generation(expr); + self.note_generation(id, generation, stats_map); + } + + // Separate into row filters and post-scan filters based on effectiveness and state. + let mut row_filters: Vec<(FilterId, Arc)> = Vec::new(); + let mut post_scan_filters: Vec<(FilterId, Arc)> = Vec::new(); + + let confidence_z = config.confidence_z; + for (id, expr) in filters { + let state = self.filter_states.get(&id).copied(); + + let Some(state) = state else { + // New filter: decide initial placement using the + // filter_bytes / projection_bytes ratio. 
This ratio captures + // the I/O tradeoff: + // + // - Low ratio (filter columns are small vs projection): row-filter + // enables late materialization — the large non-filter portion of + // the projection is only decoded for rows that pass the filter. + // + // - High ratio (filter columns are most of the projection): little + // benefit from late materialization since there's not much left + // to skip. Post-scan avoids row-filter overhead. + // + // Extra bytes (filter columns not in projection) are naturally + // included in filter_bytes, making the ratio higher and placement + // more conservative, which is correct since those bytes represent + // additional I/O cost for row-filter evaluation. + let filter_columns: Vec = collect_columns(&expr) + .iter() + .map(|col| col.index()) + .collect(); + let filter_bytes = + crate::row_filter::total_compressed_bytes(&filter_columns, metadata); + let byte_ratio = if projection_scan_size > 0 { + filter_bytes as f64 / projection_scan_size as f64 + } else { + 1.0 + }; + + if !stats_map.contains_key(&id) { + new_filter_ids.push(id); + new_optional_flags.push((id, is_optional_filter(&expr))); + } + + if byte_ratio <= config.byte_ratio_threshold { + debug!( + "FilterId {id}: New filter → Row filter (byte_ratio {byte_ratio:.4} <= {}) — {expr}", + config.byte_ratio_threshold + ); + self.filter_states.insert(id, FilterState::RowFilter); + row_filters.push((id, expr)); + } else { + debug!( + "FilterId {id}: New filter → Post-scan (byte_ratio {byte_ratio:.4} > {}) — {expr}", + config.byte_ratio_threshold + ); + self.filter_states.insert(id, FilterState::PostScan); + post_scan_filters.push((id, expr)); + } + continue; + }; + + match state { + FilterState::RowFilter => { + // Should we demote this filter based on CI upper bound? 
+ if let Some(entry) = stats_map.get(&id) { + let stats = entry.lock(); + if let Some(ub) = stats.confidence_upper_bound(confidence_z) + && ub < config.min_bytes_per_sec + { + drop(stats); + debug!( + "FilterId {id}: Row filter → Post-scan via CI upper bound {ub} < {} bytes/sec — {expr}", + config.min_bytes_per_sec + ); + self.demote_or_drop( + id, + &expr, + &mut post_scan_filters, + stats_map, + ); + continue; + } + } + // If not demoted, keep as row filter. + row_filters.push((id, expr)); + } + FilterState::PostScan => { + // Should we promote this filter based on CI lower bound? + if let Some(entry) = stats_map.get(&id) { + let stats = entry.lock(); + if let Some(lb) = stats.confidence_lower_bound(confidence_z) + && lb >= config.min_bytes_per_sec + { + drop(stats); + debug!( + "FilterId {id}: Post-scan → Row filter via CI lower bound {lb} >= {} bytes/sec — {expr}", + config.min_bytes_per_sec + ); + self.promote(id, expr, &mut row_filters, stats_map); + continue; + } + } + // Should we drop this filter if it's optional and ineffective? + // Non-optional filters must stay as post-scan regardless. + if let Some(entry) = stats_map.get(&id) { + let stats = entry.lock(); + if let Some(ub) = stats.confidence_upper_bound(confidence_z) + && ub < config.min_bytes_per_sec + && expr.downcast_ref::().is_some() + { + drop(stats); + debug!( + "FilterId {id}: Post-scan → Dropped via CI upper bound {ub} < {} bytes/sec — {expr}", + config.min_bytes_per_sec + ); + self.filter_states.insert(id, FilterState::Dropped); + continue; + } + } + // Keep as post-scan filter (don't reset stats for mandatory filters). + post_scan_filters.push((id, expr)); + } + FilterState::Dropped => continue, + } + } + + // Sort row filters by: + // - Effectiveness (descending, higher = better) if available for both filters. + // - Scan size (ascending, cheapest first) as fallback — cheap filters prune + // rows before expensive ones, reducing downstream evaluation cost. 
+ let cmp_row_filters = + |(id_a, expr_a): &(FilterId, Arc), + (id_b, expr_b): &(FilterId, Arc)| { + let eff_a = self.get_effectiveness_by_id(*id_a, stats_map); + let eff_b = self.get_effectiveness_by_id(*id_b, stats_map); + if let (Some(eff_a), Some(eff_b)) = (eff_a, eff_b) { + eff_b + .partial_cmp(&eff_a) + .unwrap_or(std::cmp::Ordering::Equal) + } else { + let size_a = filter_scan_size(expr_a, metadata); + let size_b = filter_scan_size(expr_b, metadata); + size_a.cmp(&size_b) + } + }; + row_filters.sort_by(cmp_row_filters); + // Post-scan filters: same logic (cheaper post-scan filters first to reduce + // the batch size for subsequent filters). + post_scan_filters.sort_by(cmp_row_filters); + + debug!( + "Partitioned filters: {} row-level, {} post-scan", + row_filters.len(), + post_scan_filters.len() + ); + PartitionResult { + partitioned: PartitionedFilters { + row_filters, + post_scan: post_scan_filters, + }, + new_filter_ids, + new_optional_flags, + } + } +} + +/// Returns `true` if `expr` is wrapped in [`OptionalFilterPhysicalExpr`]. +fn is_optional_filter(expr: &Arc) -> bool { + expr.downcast_ref::().is_some() +} + +/// Calculate the estimated number of bytes needed to evaluate a filter based on the columns +/// it references as if it were applied to the entire file. +/// This is used for initial placement of new filters before any stats are available, and as a fallback for filters without stats. 
+fn filter_scan_size(expr: &Arc, metadata: &ParquetMetaData) -> usize { + let columns: Vec = collect_columns(expr) + .iter() + .map(|col| col.index()) + .collect(); + + crate::row_filter::total_compressed_bytes(&columns, metadata) +} + +#[cfg(test)] +mod tests { + use super::*; + use datafusion_physical_expr::expressions::Column; + use parquet::basic::Type as PhysicalType; + use parquet::file::metadata::{ColumnChunkMetaData, FileMetaData, RowGroupMetaData}; + use parquet::schema::types::SchemaDescPtr; + use parquet::schema::types::Type as SchemaType; + use std::sync::Arc; + + mod helper_functions { + use super::*; + + /// Creates test ParquetMetaData with specified row groups and column sizes. + /// + /// # Arguments + /// * `specs` - Vec of (num_rows, vec![compressed_size]) tuples for each row group + pub fn create_test_metadata(specs: Vec<(i64, Vec)>) -> ParquetMetaData { + // Get the maximum number of columns from all specs + let num_columns = specs + .iter() + .map(|(_, sizes)| sizes.len()) + .max() + .unwrap_or(1); + let schema_descr = get_test_schema_descr_with_columns(num_columns); + + let row_group_metadata: Vec<_> = specs + .into_iter() + .map(|(num_rows, column_sizes)| { + let columns = column_sizes + .into_iter() + .enumerate() + .map(|(col_idx, size)| { + ColumnChunkMetaData::builder(schema_descr.column(col_idx)) + .set_num_values(num_rows) + .set_total_compressed_size(size as i64) + .build() + .unwrap() + }) + .collect(); + + RowGroupMetaData::builder(schema_descr.clone()) + .set_num_rows(num_rows) + .set_column_metadata(columns) + .build() + .unwrap() + }) + .collect(); + + let total_rows: i64 = row_group_metadata.iter().map(|rg| rg.num_rows()).sum(); + let file_metadata = + FileMetaData::new(1, total_rows, None, None, schema_descr.clone(), None); + + ParquetMetaData::new(file_metadata, row_group_metadata) + } + + /// Creates a simple column expression with given name and index. 
+ pub fn col_expr(name: &str, index: usize) -> Arc { + Arc::new(Column::new(name, index)) + } + + /// Create schema with specified number of columns, each named "a", "b", etc. + pub fn get_test_schema_descr_with_columns(num_columns: usize) -> SchemaDescPtr { + use parquet::basic::LogicalType; + + let fields: Vec<_> = (0..num_columns) + .map(|i| { + let col_name = format!("{}", (b'a' + i as u8) as char); + SchemaType::primitive_type_builder( + &col_name, + PhysicalType::BYTE_ARRAY, + ) + .with_logical_type(Some(LogicalType::String)) + .build() + .unwrap() + }) + .map(Arc::new) + .collect(); + + let schema = SchemaType::group_type_builder("schema") + .with_fields(fields) + .build() + .unwrap(); + Arc::new(parquet::schema::types::SchemaDescriptor::new(Arc::new( + schema, + ))) + } + } + + mod selectivity_stats_tests { + use super::*; + + #[test] + fn test_effectiveness_basic_calculation() { + let mut stats = SelectivityStats::default(); + + // 100 rows total, 50 rows pruned (matched 50), 1 sec eval time, 10000 bytes seen + // bytes_per_row = 10000 / 100 = 100 + // bytes_saved = 50 * 100 = 5000 + // effectiveness = 5000 * 1e9 / 1e9 = 5000 + stats.update(50, 100, 1_000_000_000, 10_000); + + let eff = stats.effectiveness().unwrap(); + assert!((eff - 5000.0).abs() < 0.1); + } + + #[test] + fn test_effectiveness_zero_rows_total() { + let mut stats = SelectivityStats::default(); + stats.update(0, 0, 1_000_000_000, 10_000); + + assert_eq!(stats.effectiveness(), None); + } + + #[test] + fn test_effectiveness_zero_eval_nanos() { + let mut stats = SelectivityStats::default(); + stats.update(50, 100, 0, 10_000); + + assert_eq!(stats.effectiveness(), None); + } + + #[test] + fn test_effectiveness_zero_bytes_seen() { + let mut stats = SelectivityStats::default(); + stats.update(50, 100, 1_000_000_000, 0); + + assert_eq!(stats.effectiveness(), None); + } + + #[test] + fn test_effectiveness_all_rows_matched() { + let mut stats = SelectivityStats::default(); + // All rows matched (no 
pruning) + stats.update(100, 100, 1_000_000_000, 10_000); + + let eff = stats.effectiveness().unwrap(); + assert_eq!(eff, 0.0); + } + + #[test] + fn test_confidence_bounds_single_sample() { + let mut stats = SelectivityStats::default(); + stats.update(50, 100, 1_000_000_000, 10_000); + + // Single sample returns None for confidence bounds + assert_eq!(stats.confidence_lower_bound(2.0), None); + assert_eq!(stats.confidence_upper_bound(2.0), None); + } + + #[test] + fn test_welford_identical_samples() { + let mut stats = SelectivityStats::default(); + + // Add two identical samples + stats.update(50, 100, 1_000_000_000, 10_000); + stats.update(50, 100, 1_000_000_000, 10_000); + + // Variance should be 0 + assert_eq!(stats.sample_count, 2); + let lb = stats.confidence_lower_bound(2.0).unwrap(); + let ub = stats.confidence_upper_bound(2.0).unwrap(); + + // Both should be equal to the mean since variance is 0 + assert!((lb - ub).abs() < 0.01); + } + + #[test] + fn test_welford_variance_calculation() { + let mut stats = SelectivityStats::default(); + + // Add samples that will produce effectiveness values of ~100, ~200, ~300 + // These are constructed to give those exact effectiveness values + stats.update(50, 100, 1_000_000_000, 10_000); // eff ≈ 5000 + stats.update(40, 100, 1_000_000_000, 10_000); // eff ≈ 6000 + stats.update(30, 100, 1_000_000_000, 10_000); // eff ≈ 7000 + + // We should have 3 samples + assert_eq!(stats.sample_count, 3); + + // Mean should be 6000 + assert!((stats.eff_mean - 6000.0).abs() < 1.0); + + // Both bounds should be defined + let lb = stats.confidence_lower_bound(1.0).unwrap(); + let ub = stats.confidence_upper_bound(1.0).unwrap(); + + assert!(lb < stats.eff_mean); + assert!(ub > stats.eff_mean); + } + + #[test] + fn test_confidence_bounds_asymmetry() { + let mut stats = SelectivityStats::default(); + + stats.update(50, 100, 1_000_000_000, 10_000); + stats.update(40, 100, 1_000_000_000, 10_000); + + let lb = 
stats.confidence_lower_bound(2.0).unwrap(); + let ub = stats.confidence_upper_bound(2.0).unwrap(); + + // Bounds should be symmetric around the mean + let lower_dist = stats.eff_mean - lb; + let upper_dist = ub - stats.eff_mean; + + assert!((lower_dist - upper_dist).abs() < 0.01); + } + + #[test] + fn test_welford_incremental_vs_batch() { + // Create two identical stats objects + let mut stats_incremental = SelectivityStats::default(); + let mut stats_batch = SelectivityStats::default(); + + // Incremental: add one at a time + stats_incremental.update(50, 100, 1_000_000_000, 10_000); + stats_incremental.update(40, 100, 1_000_000_000, 10_000); + stats_incremental.update(30, 100, 1_000_000_000, 10_000); + + // Batch: simulate batch update (all at once) + stats_batch.update(120, 300, 3_000_000_000, 30_000); + + // Both should produce the same overall statistics + assert_eq!(stats_incremental.rows_total, stats_batch.rows_total); + assert_eq!(stats_incremental.rows_matched, stats_batch.rows_matched); + + // Means should be close + assert!((stats_incremental.eff_mean - stats_batch.eff_mean).abs() < 100.0); + } + + #[test] + fn test_effectiveness_numerical_stability() { + let mut stats = SelectivityStats::default(); + + // Test with large values to ensure numerical stability + stats.update( + 500_000_000, + 1_000_000_000, + 10_000_000_000_000, + 1_000_000_000_000, + ); + + let eff = stats.effectiveness(); + assert!(eff.is_some()); + assert!(eff.unwrap() > 0.0); + assert!(!eff.unwrap().is_nan()); + assert!(!eff.unwrap().is_infinite()); + } + } + + mod tracker_config_tests { + use super::*; + + #[test] + fn test_default_config() { + let config = TrackerConfig::default(); + + assert!(config.min_bytes_per_sec.is_infinite()); + assert_eq!(config.byte_ratio_threshold, 0.20); + assert_eq!(config.confidence_z, 2.0); + } + + #[test] + fn test_with_min_bytes_per_sec() { + let config = TrackerConfig::new().with_min_bytes_per_sec(1000.0); + + assert_eq!(config.min_bytes_per_sec, 
1000.0); + } + + #[test] + fn test_with_byte_ratio_threshold() { + let config = TrackerConfig::new().with_byte_ratio_threshold(0.5); + + assert_eq!(config.byte_ratio_threshold, 0.5); + } + + #[test] + fn test_with_confidence_z() { + let config = TrackerConfig::new().with_confidence_z(3.0); + + assert_eq!(config.confidence_z, 3.0); + } + + #[test] + fn test_builder_chain() { + let config = TrackerConfig::new() + .with_min_bytes_per_sec(500.0) + .with_byte_ratio_threshold(0.3) + .with_confidence_z(1.5); + + assert_eq!(config.min_bytes_per_sec, 500.0); + assert_eq!(config.byte_ratio_threshold, 0.3); + assert_eq!(config.confidence_z, 1.5); + } + + #[test] + fn test_build_creates_tracker() { + let tracker = TrackerConfig::new().with_min_bytes_per_sec(1000.0).build(); + + // Tracker should be created and functional + assert_eq!(tracker.config.min_bytes_per_sec, 1000.0); + } + } + + mod state_machine_tests { + use super::helper_functions::*; + use super::*; + + #[test] + fn test_initial_placement_low_byte_ratio() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(1000.0) + .with_byte_ratio_threshold(0.2) + .build(); + + // Create metadata: 1 row group, 100 rows, 1000 bytes for column + let metadata = create_test_metadata(vec![(100, vec![1000])]); + + // Filter using column 0 (1000 bytes out of 1000 projection = 100% ratio > 0.2) + // So this should be placed in post-scan initially + let expr = col_expr("a", 0); + let filters = vec![(1, expr)]; + + let result = tracker.partition_filters(filters, 1000, &metadata); + + // With 100% byte ratio, should go to post-scan + assert_eq!(result.row_filters.len(), 0); + assert_eq!(result.post_scan.len(), 1); + } + + #[test] + fn test_initial_placement_filter_in_projection_low_ratio() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(1000.0) + .with_byte_ratio_threshold(0.5) + .build(); + + // Create metadata: 1 row group, 100 rows, 100 bytes for column + let metadata = create_test_metadata(vec![(100, 
vec![100])]); + + // Filter using column 0 which IS in the projection. + // filter_bytes=100, projection=1000, ratio=0.10 <= 0.5 → RowFilter + let expr = col_expr("a", 0); + let filters = vec![(1, expr)]; + + let result = tracker.partition_filters(filters, 1000, &metadata); + + assert_eq!(result.row_filters.len(), 1); + assert_eq!(result.post_scan.len(), 0); + } + + #[test] + fn test_initial_placement_high_byte_ratio() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(1000.0) + .with_byte_ratio_threshold(0.5) + .build(); + + // Create metadata: 1 row group, 100 rows, 100 bytes for column + let metadata = create_test_metadata(vec![(100, vec![100])]); + + // Filter using column 0 (100 bytes / 1000 projection = 10% ratio <= 0.5) + // So this should be placed in row-filter immediately + let expr = col_expr("a", 0); + let filters = vec![(1, expr)]; + + let result = tracker.partition_filters(filters, 1000, &metadata); + + // With 10% byte ratio, should go to row-filter + assert_eq!(result.row_filters.len(), 1); + assert_eq!(result.post_scan.len(), 0); + } + + #[test] + fn test_min_bytes_per_sec_infinity_disables_promotion() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(f64::INFINITY) + .build(); + + let metadata = create_test_metadata(vec![(100, vec![100])]); + let expr = col_expr("a", 0); + let filters = vec![(1, expr)]; + + let result = tracker.partition_filters(filters, 1000, &metadata); + + // All filters should go to post_scan when min_bytes_per_sec is INFINITY + assert_eq!(result.row_filters.len(), 0); + assert_eq!(result.post_scan.len(), 1); + } + + #[test] + fn test_min_bytes_per_sec_zero_promotes_all() { + let tracker = TrackerConfig::new().with_min_bytes_per_sec(0.0).build(); + + let metadata = create_test_metadata(vec![(100, vec![1000])]); + let expr = col_expr("a", 0); + let filters = vec![(1, expr)]; + + let result = tracker.partition_filters(filters, 1000, &metadata); + + // All filters should be promoted to row_filters 
when min_bytes_per_sec is 0 + assert_eq!(result.row_filters.len(), 1); + assert_eq!(result.post_scan.len(), 0); + } + + #[test] + fn test_promotion_via_confidence_lower_bound() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(1000.0) + .with_byte_ratio_threshold(0.5) // Force to PostScan initially + .with_confidence_z(0.5) // Lower z for easier promotion + .build(); + + let metadata = create_test_metadata(vec![(100, vec![1000])]); + let expr = col_expr("a", 0); + let filters = vec![(1, expr.clone())]; + + // First partition: goes to PostScan (high byte ratio) + let result = tracker.partition_filters(filters.clone(), 1000, &metadata); + assert_eq!(result.post_scan.len(), 1); + assert_eq!(result.row_filters.len(), 0); + + // Feed high effectiveness stats + for _ in 0..5 { + tracker.update(1, 10, 100, 100_000, 1000); // high effectiveness + } + + // Second partition: should be promoted to RowFilter + let result = tracker.partition_filters(filters, 1000, &metadata); + assert_eq!(result.row_filters.len(), 1); + assert_eq!(result.post_scan.len(), 0); + } + + #[test] + fn test_demotion_via_confidence_upper_bound() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(10000.0) + .with_byte_ratio_threshold(0.1) // Force to RowFilter initially + .with_confidence_z(0.5) // Lower z for easier demotion + .build(); + + let metadata = create_test_metadata(vec![(100, vec![100])]); + let expr = col_expr("a", 0); + let filters = vec![(1, expr.clone())]; + + // First partition: goes to RowFilter (low byte ratio) + let result = tracker.partition_filters(filters.clone(), 1000, &metadata); + assert_eq!(result.row_filters.len(), 1); + assert_eq!(result.post_scan.len(), 0); + + // Feed low effectiveness stats + for _ in 0..5 { + tracker.update(1, 100, 100, 100_000, 1000); // all rows matched, no pruning + } + + // Second partition: should be demoted to PostScan + let result = tracker.partition_filters(filters, 1000, &metadata); + 
assert_eq!(result.row_filters.len(), 0); + assert_eq!(result.post_scan.len(), 1); + } + + #[test] + fn test_demotion_resets_stats() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(10000.0) + .with_byte_ratio_threshold(0.1) + .with_confidence_z(0.5) + .build(); + + let metadata = create_test_metadata(vec![(100, vec![100])]); + let expr = col_expr("a", 0); + let filters = vec![(1, expr.clone())]; + + // Start as RowFilter + tracker.partition_filters(filters.clone(), 1000, &metadata); + + // Add stats + tracker.update(1, 100, 100, 100_000, 1000); + tracker.update(1, 100, 100, 100_000, 1000); + + // Demote + tracker.partition_filters(filters.clone(), 1000, &metadata); + + // Stats should be zeroed after demotion + let stats_map = tracker.filter_stats.read(); + assert_eq!( + *stats_map.get(&1).unwrap().lock(), + SelectivityStats::default() + ); + } + + #[test] + fn test_promotion_resets_stats() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(100.0) + .with_byte_ratio_threshold(0.5) + .with_confidence_z(0.5) + .build(); + + let metadata = create_test_metadata(vec![(100, vec![1000])]); + let expr = col_expr("a", 0); + let filters = vec![(1, expr.clone())]; + + // Start as PostScan + tracker.partition_filters(filters.clone(), 1000, &metadata); + + // Add stats + for _ in 0..3 { + tracker.update(1, 50, 100, 100_000, 1000); + } + + // Promote + tracker.partition_filters(filters.clone(), 1000, &metadata); + + // Stats should be zeroed after promotion + let stats_map = tracker.filter_stats.read(); + assert_eq!( + *stats_map.get(&1).unwrap().lock(), + SelectivityStats::default() + ); + } + + #[test] + fn test_optional_filter_dropping() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(10000.0) + .with_byte_ratio_threshold(0.5) + .with_confidence_z(0.5) + .build(); + + let metadata = create_test_metadata(vec![(100, vec![1000])]); + let expr = col_expr("a", 0); + let filters = vec![(1, expr.clone())]; + + // Start as PostScan + 
tracker.partition_filters(filters.clone(), 1000, &metadata); + + // Feed poor effectiveness stats + for _ in 0..5 { + tracker.update(1, 100, 100, 100_000, 1000); // no pruning + } + + // Next partition: should stay as PostScan (not dropped because not optional) + let result = tracker.partition_filters(filters, 1000, &metadata); + assert_eq!(result.post_scan.len(), 1); + assert_eq!(result.row_filters.len(), 0); + } + + #[test] + fn test_persistent_dropped_state() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(10000.0) + .with_byte_ratio_threshold(0.5) + .build(); + + let metadata = create_test_metadata(vec![(100, vec![1000])]); + let expr = col_expr("a", 0); + let filters = vec![(1, expr.clone())]; + + // Mark filter as dropped by manually setting state + tracker + .inner + .lock() + .filter_states + .insert(1, FilterState::Dropped); + + // On next partition, dropped filters should not reappear + let result = tracker.partition_filters(filters, 1000, &metadata); + assert_eq!(result.row_filters.len(), 0); + assert_eq!(result.post_scan.len(), 0); + } + } + + mod filter_ordering_tests { + use super::helper_functions::*; + use super::*; + + #[test] + fn test_filters_get_partitioned() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(1.0) // Very low threshold + .build(); + + let metadata = create_test_metadata(vec![(100, vec![100, 100, 100])]); + let filters = vec![ + (1, col_expr("a", 0)), + (2, col_expr("a", 1)), + (3, col_expr("a", 2)), + ]; + + // Partition should process all filters + let result = tracker.partition_filters(filters.clone(), 1000, &metadata); + + // With min_bytes_per_sec=1.0, filters should be partitioned + assert!(result.row_filters.len() + result.post_scan.len() > 0); + + // Add stats and partition again + tracker.update(1, 60, 100, 1_000_000, 100); + tracker.update(2, 10, 100, 1_000_000, 100); + tracker.update(3, 40, 100, 1_000_000, 100); + + let result2 = tracker.partition_filters(filters, 1000, &metadata); + + // 
Filters should still be partitioned + assert!(result2.row_filters.len() + result2.post_scan.len() > 0); + } + + #[test] + fn test_filters_processed_without_stats() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(1.0) // Very low threshold + .build(); + + // Different column sizes: 300, 200, 100 bytes + let metadata = create_test_metadata(vec![(100, vec![300, 200, 100])]); + let filters = vec![ + (1, col_expr("a", 0)), + (2, col_expr("a", 1)), + (3, col_expr("a", 2)), + ]; + + // First partition - no stats yet + let result = tracker.partition_filters(filters.clone(), 1000, &metadata); + + // All filters should be processed (partitioned into row/post-scan) + assert!(result.row_filters.len() + result.post_scan.len() > 0); + + // Filters should be consistent on repeated calls + let result2 = tracker.partition_filters(filters, 1000, &metadata); + assert_eq!( + result.row_filters.len() + result.post_scan.len(), + result2.row_filters.len() + result2.post_scan.len() + ); + } + + #[test] + fn test_filters_with_partial_stats() { + let tracker = TrackerConfig::new().with_min_bytes_per_sec(1.0).build(); + + // Give filter 2 larger bytes so it's prioritized when falling back to byte ratio + let metadata = create_test_metadata(vec![(100, vec![100, 300, 100])]); + let filters = vec![ + (1, col_expr("a", 0)), + (2, col_expr("a", 1)), + (3, col_expr("a", 2)), + ]; + + // First partition + let result1 = tracker.partition_filters(filters.clone(), 1000, &metadata); + assert!(result1.row_filters.len() + result1.post_scan.len() > 0); + + // Only add stats for filters 1 and 3, not 2 + tracker.update(1, 60, 100, 1_000_000, 100); + tracker.update(3, 60, 100, 1_000_000, 100); + + // Second partition with partial stats + let result2 = tracker.partition_filters(filters, 1000, &metadata); + assert!(result2.row_filters.len() + result2.post_scan.len() > 0); + } + + #[test] + fn test_ordering_stability_with_identical_values() { + let tracker = 
TrackerConfig::new().with_min_bytes_per_sec(0.0).build(); + + let metadata = create_test_metadata(vec![(100, vec![100, 100, 100])]); + let filters = vec![ + (1, col_expr("a", 0)), + (2, col_expr("a", 1)), + (3, col_expr("a", 2)), + ]; + + let result1 = tracker.partition_filters(filters.clone(), 1000, &metadata); + let result2 = tracker.partition_filters(filters, 1000, &metadata); + + // Without stats and with identical byte sizes, order should be stable + assert_eq!(result1.row_filters[0].0, result2.row_filters[0].0); + assert_eq!(result1.row_filters[1].0, result2.row_filters[1].0); + assert_eq!(result1.row_filters[2].0, result2.row_filters[2].0); + } + } + + mod dynamic_filter_tests { + use super::helper_functions::*; + use super::*; + + #[test] + fn test_generation_zero_ignored() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(1000.0) + .with_byte_ratio_threshold(0.5) + .build(); + + let metadata = create_test_metadata(vec![(100, vec![1000])]); + + // Create two filters with same ID but generation 0 and 1 + // Generation 0 should be ignored + let expr1 = col_expr("a", 0); + let filters1 = vec![(1, expr1)]; + + tracker.partition_filters(filters1, 1000, &metadata); + tracker.update(1, 50, 100, 100_000, 1000); + + // Generation 0 doesn't trigger state reset + let snapshot_gen = tracker.inner.lock().snapshot_generations.get(&1).copied(); + assert_eq!(snapshot_gen, None); + } + + #[test] + fn test_generation_change_clears_stats() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(1000.0) + .with_byte_ratio_threshold(0.5) + .build(); + + // Pre-populate stats entry so update() can find it + tracker.ensure_stats_entry(1); + + // Initialize generation to 100 + { + let mut inner = tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 100, &stats); + } + + // Add stats + tracker.update(1, 50, 100, 100_000, 1000); + tracker.update(1, 50, 100, 100_000, 1000); + + let stats_before = { + let stats_map = 
tracker.filter_stats.read(); + *stats_map.get(&1).unwrap().lock() != SelectivityStats::default() + }; + assert!(stats_before); + + // Simulate generation change to a different value + { + let mut inner = tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 101, &stats); + } + + // Stats should be zeroed on generation change + let stats_after = { + let stats_map = tracker.filter_stats.read(); + *stats_map.get(&1).unwrap().lock() == SelectivityStats::default() + }; + assert!(stats_after); + } + + #[test] + fn test_generation_unchanged_preserves_stats() { + let tracker = TrackerConfig::new().with_min_bytes_per_sec(1000.0).build(); + + // Pre-populate stats entry so update() can find it + tracker.ensure_stats_entry(1); + + // Manually set generation + { + let mut inner = tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 100, &stats); + } + + // Add stats + tracker.update(1, 50, 100, 100_000, 1000); + tracker.update(1, 50, 100, 100_000, 1000); + + let sample_count_before = { + let stats_map = tracker.filter_stats.read(); + stats_map.get(&1).map(|s| s.lock().sample_count) + }; + assert_eq!(sample_count_before, Some(2)); + + // Call note_generation with same generation + { + let mut inner = tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 100, &stats); + } + + // Stats should be preserved + let sample_count_after = { + let stats_map = tracker.filter_stats.read(); + stats_map.get(&1).map(|s| s.lock().sample_count) + }; + assert_eq!(sample_count_after, Some(2)); + } + + #[test] + fn test_generation_change_preserves_state() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(1000.0) + .with_byte_ratio_threshold(0.1) + .build(); + + let metadata = create_test_metadata(vec![(100, vec![100])]); + + // First partition: goes to RowFilter + let expr = col_expr("a", 0); + let filters = vec![(1, expr)]; + tracker.partition_filters(filters.clone(), 
1000, &metadata); + + let state_before = tracker.inner.lock().filter_states.get(&1).copied(); + assert_eq!(state_before, Some(FilterState::RowFilter)); + + // Simulate generation change + { + let mut inner = tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 100, &stats); + } + + // State should be preserved despite stats being cleared + let state_after = tracker.inner.lock().filter_states.get(&1).copied(); + assert_eq!(state_after, Some(FilterState::RowFilter)); + } + + #[test] + fn test_generation_change_undrops_dropped_filter() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(1000.0) + .with_byte_ratio_threshold(0.1) + .build(); + + // Manually set filter state to Dropped + tracker + .inner + .lock() + .filter_states + .insert(1, FilterState::Dropped); + { + let mut inner = tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 100, &stats); + } + + // Simulate generation change + { + let mut inner = tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 101, &stats); + } + + // Dropped filter should be un-dropped to PostScan + let state_after = tracker.inner.lock().filter_states.get(&1).copied(); + assert_eq!(state_after, Some(FilterState::PostScan)); + } + + #[test] + fn test_multiple_filters_independent_generation_tracking() { + let tracker = TrackerConfig::new().with_min_bytes_per_sec(1000.0).build(); + + // Pre-populate stats entries so update() can find them + tracker.ensure_stats_entry(1); + tracker.ensure_stats_entry(2); + + // Set generations for multiple filters + { + let mut inner = tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 100, &stats); + inner.note_generation(2, 200, &stats); + } + + // Add stats to both + tracker.update(1, 50, 100, 100_000, 1000); + tracker.update(2, 50, 100, 100_000, 1000); + + // Change generation of filter 1 only + { + let mut inner = 
tracker.inner.lock(); + let stats = tracker.filter_stats.read(); + inner.note_generation(1, 101, &stats); + } + + // Filter 1 stats should be zeroed, filter 2 preserved + let stats_map = tracker.filter_stats.read(); + assert_eq!( + *stats_map.get(&1).unwrap().lock(), + SelectivityStats::default() + ); + assert_ne!( + *stats_map.get(&2).unwrap().lock(), + SelectivityStats::default() + ); + } + } + + mod integration_tests { + use super::helper_functions::*; + use super::*; + + #[test] + fn test_full_promotion_lifecycle() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(500.0) + .with_byte_ratio_threshold(0.5) // Force initial PostScan + .with_confidence_z(0.5) + .build(); + + let metadata = create_test_metadata(vec![(100, vec![1000])]); + let expr = col_expr("a", 0); + let filters = vec![(1, expr.clone())]; + + // Step 1: Initial placement (PostScan) + let result = tracker.partition_filters(filters.clone(), 1000, &metadata); + assert_eq!(result.post_scan.len(), 1); + assert_eq!(result.row_filters.len(), 0); + + // Step 2: Accumulate high effectiveness stats + for _ in 0..5 { + tracker.update(1, 10, 100, 100_000, 1000); // high effectiveness + } + + // Step 3: Promotion should occur + let result = tracker.partition_filters(filters.clone(), 1000, &metadata); + assert_eq!(result.row_filters.len(), 1); + assert_eq!(result.post_scan.len(), 0); + + // Step 4: Continue to partition without additional updates + let result = tracker.partition_filters(filters, 1000, &metadata); + assert_eq!(result.row_filters.len(), 1); + assert_eq!(result.post_scan.len(), 0); + } + + #[test] + fn test_full_demotion_lifecycle() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(10000.0) + .with_byte_ratio_threshold(0.1) // Force initial RowFilter + .with_confidence_z(0.5) + .build(); + + let metadata = create_test_metadata(vec![(100, vec![100])]); + let expr = col_expr("a", 0); + let filters = vec![(1, expr.clone())]; + + // Step 1: Initial placement (RowFilter) + 
let result = tracker.partition_filters(filters.clone(), 1000, &metadata); + assert_eq!(result.row_filters.len(), 1); + assert_eq!(result.post_scan.len(), 0); + + // Step 2: Accumulate low effectiveness stats + for _ in 0..5 { + tracker.update(1, 100, 100, 100_000, 1000); // no pruning + } + + // Step 3: Demotion should occur + let result = tracker.partition_filters(filters.clone(), 1000, &metadata); + assert_eq!(result.row_filters.len(), 0); + assert_eq!(result.post_scan.len(), 1); + + // Step 4: Continue to partition without additional updates + let result = tracker.partition_filters(filters, 1000, &metadata); + assert_eq!(result.row_filters.len(), 0); + assert_eq!(result.post_scan.len(), 1); + } + + #[test] + fn test_multiple_filters_mixed_states() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(1000.0) + .with_byte_ratio_threshold(0.4) // Force PostScan initially (500/1000=0.5 > 0.4) + .with_confidence_z(0.5) + .build(); + + let metadata = create_test_metadata(vec![(100, vec![500, 500])]); + let filters = vec![(1, col_expr("a", 0)), (2, col_expr("a", 1))]; + + // Initial partition: both go to PostScan (500/1000 = 0.5 > 0.4) + let result = tracker.partition_filters(filters.clone(), 1000, &metadata); + assert_eq!(result.post_scan.len(), 2); + + // Filter 1: high effectiveness (promote) + for _ in 0..3 { + tracker.update(1, 10, 100, 100_000, 500); + } + + // Filter 2: low effectiveness (stay PostScan) + for _ in 0..3 { + tracker.update(2, 100, 100, 100_000, 500); + } + + // Next partition: Filter 1 promoted, Filter 2 stays PostScan + let result = tracker.partition_filters(filters, 1000, &metadata); + assert_eq!(result.row_filters.len(), 1); + assert_eq!(result.post_scan.len(), 1); + assert_eq!(result.row_filters[0].0, 1); + assert_eq!(result.post_scan[0].0, 2); + } + + #[test] + fn test_empty_filter_list() { + let tracker = TrackerConfig::new().build(); + let metadata = create_test_metadata(vec![(100, vec![1000])]); + let filters = vec![]; + + let 
result = tracker.partition_filters(filters, 1000, &metadata); + + assert_eq!(result.row_filters.len(), 0); + assert_eq!(result.post_scan.len(), 0); + } + + #[test] + fn test_single_filter() { + let tracker = TrackerConfig::new().with_min_bytes_per_sec(0.0).build(); + + let metadata = create_test_metadata(vec![(100, vec![100])]); + let expr = col_expr("a", 0); + let filters = vec![(1, expr)]; + + let result = tracker.partition_filters(filters, 1000, &metadata); + + assert_eq!(result.row_filters.len(), 1); + assert_eq!(result.post_scan.len(), 0); + } + + #[test] + fn test_zero_effectiveness_stays_at_boundary() { + let tracker = TrackerConfig::new() + .with_min_bytes_per_sec(100.0) + .with_byte_ratio_threshold(0.1) + .with_confidence_z(0.5) + .build(); + + let metadata = create_test_metadata(vec![(100, vec![100])]); + let expr = col_expr("a", 0); + let filters = vec![(1, expr.clone())]; + + // Start as RowFilter + tracker.partition_filters(filters.clone(), 1000, &metadata); + + // All rows match (zero effectiveness) + for _ in 0..5 { + tracker.update(1, 100, 100, 100_000, 100); + } + + // Should demote due to CI upper bound being 0 + let result = tracker.partition_filters(filters, 1000, &metadata); + assert_eq!(result.row_filters.len(), 0); + assert_eq!(result.post_scan.len(), 1); + } + + #[test] + fn test_confidence_z_parameter_stored() { + // Test that different confidence_z values are properly stored in config + let tracker_conservative = TrackerConfig::new() + .with_min_bytes_per_sec(1000.0) + .with_byte_ratio_threshold(0.5) + .with_confidence_z(3.0) // Harder to promote + .build(); + + let tracker_aggressive = TrackerConfig::new() + .with_min_bytes_per_sec(1000.0) + .with_byte_ratio_threshold(0.5) + .with_confidence_z(0.5) // Easier to promote + .build(); + + // Verify configs are stored correctly + assert_eq!(tracker_conservative.config.confidence_z, 3.0); + assert_eq!(tracker_aggressive.config.confidence_z, 0.5); + + // The z-score affects confidence intervals 
during promotion/demotion decisions. + // With identical stats, higher z requires narrower confidence intervals, + // making promotion harder. With lower z, confidence intervals are wider, + // making promotion easier. This is tested in other integration tests + // that verify actual promotion/demotion behavior. + } + } +} diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index a014c8b2726e7..6da11aff5ff9e 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -39,8 +39,9 @@ use datafusion_common::tree_node::TreeNodeRecursion; use datafusion_datasource::TableSchema; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; +use datafusion_physical_expr::EquivalenceProperties; +use datafusion_physical_expr::conjunction; use datafusion_physical_expr::projection::ProjectionExprs; -use datafusion_physical_expr::{EquivalenceProperties, conjunction}; use datafusion_physical_expr_adapter::DefaultPhysicalExprAdapterFactory; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::physical_expr::fmt_sql; @@ -277,8 +278,10 @@ pub struct ParquetSource { /// In particular, this is the schema of the table without partition columns, /// *not* the physical schema of the file. pub(crate) table_schema: TableSchema, - /// Optional predicate for row filtering during parquet scan - pub(crate) predicate: Option>, + /// Optional predicate conjuncts for row filtering during parquet scan. + /// Each conjunct is tagged with a stable FilterId for selectivity tracking. + pub(crate) predicate_conjuncts: + Option)>>, /// Optional user defined parquet file reader factory pub(crate) parquet_file_reader_factory: Option>, /// Batch size configuration @@ -294,6 +297,10 @@ pub struct ParquetSource { /// so we still need to sort them after reading, so the reverse scan is inexact. 
/// Used to optimize ORDER BY ... DESC on sorted data. reverse_row_groups: bool, + /// Tracks filter selectivity across files for adaptive filter reordering. + /// Shared across all openers - each opener reads stats and makes its own + /// decision about which filters to push down vs. apply post-scan. + pub(crate) selectivity_tracker: Arc, } impl ParquetSource { @@ -312,13 +319,16 @@ impl ParquetSource { table_schema, table_parquet_options: TableParquetOptions::default(), metrics: ExecutionPlanMetricsSet::new(), - predicate: None, + predicate_conjuncts: None, parquet_file_reader_factory: None, batch_size: None, metadata_size_hint: None, #[cfg(feature = "parquet_encryption")] encryption_factory: None, reverse_row_groups: false, + selectivity_tracker: Arc::new( + crate::selectivity::SelectivityTracker::default(), + ), } } @@ -327,6 +337,15 @@ impl ParquetSource { mut self, table_parquet_options: TableParquetOptions, ) -> Self { + // Update the selectivity tracker from the config + let opts = &table_parquet_options.global; + self.selectivity_tracker = Arc::new( + crate::selectivity::TrackerConfig::new() + .with_min_bytes_per_sec(opts.filter_pushdown_min_bytes_per_sec) + .with_byte_ratio_threshold(opts.filter_collecting_byte_ratio_threshold) + .with_confidence_z(opts.filter_confidence_z) + .build(), + ); self.table_parquet_options = table_parquet_options; self } @@ -342,11 +361,23 @@ impl ParquetSource { self } - /// Set predicate information + /// Set predicate information. + /// + /// The predicate is split into conjuncts and each is assigned a stable + /// `FilterId` (its index in the conjunct list). These IDs are used for + /// selectivity tracking across files, avoiding ExprKey mismatch issues + /// when expressions are rebased or simplified per-file. 
#[expect(clippy::needless_pass_by_value)] pub fn with_predicate(&self, predicate: Arc) -> Self { + use datafusion_physical_expr::split_conjunction; let mut conf = self.clone(); - conf.predicate = Some(Arc::clone(&predicate)); + let conjuncts: Vec<(crate::selectivity::FilterId, Arc)> = + split_conjunction(&predicate) + .into_iter() + .enumerate() + .map(|(id, expr)| (id, Arc::clone(expr))) + .collect(); + conf.predicate_conjuncts = Some(conjuncts); conf } @@ -367,8 +398,15 @@ impl ParquetSource { /// Optional predicate. #[deprecated(since = "50.2.0", note = "use `filter` instead")] - pub fn predicate(&self) -> Option<&Arc> { - self.predicate.as_ref() + pub fn predicate(&self) -> Option> { + self.combined_predicate() + } + + /// Build a combined predicate from the conjuncts, if any. + fn combined_predicate(&self) -> Option> { + self.predicate_conjuncts + .as_ref() + .map(|conjuncts| conjunction(conjuncts.iter().map(|(_, e)| Arc::clone(e)))) } /// return the optional file reader factory @@ -399,22 +437,6 @@ impl ParquetSource { self.table_parquet_options.global.pushdown_filters } - /// If true, the `RowFilter` made by `pushdown_filters` may try to - /// minimize the cost of filter evaluation by reordering the - /// predicate [`Expr`]s. If false, the predicates are applied in - /// the same order as specified in the query. Defaults to false. 
- /// - /// [`Expr`]: datafusion_expr::Expr - pub fn with_reorder_filters(mut self, reorder_filters: bool) -> Self { - self.table_parquet_options.global.reorder_filters = reorder_filters; - self - } - - /// Return the value described in [`Self::with_reorder_filters`] - fn reorder_filters(&self) -> bool { - self.table_parquet_options.global.reorder_filters - } - /// Return the value of [`datafusion_common::config::ParquetOptions::force_filter_selections`] fn force_filter_selections(&self) -> bool { self.table_parquet_options.global.force_filter_selections @@ -561,13 +583,13 @@ impl FileSource for ParquetSource { .expect("Batch size must set before creating ParquetMorselizer"), limit: base_config.limit, preserve_order: base_config.preserve_order, - predicate: self.predicate.clone(), + predicate_conjuncts: self.predicate_conjuncts.clone(), + selectivity_tracker: Arc::clone(&self.selectivity_tracker), table_schema: self.table_schema.clone(), metadata_size_hint: self.metadata_size_hint, metrics: self.metrics().clone(), parquet_file_reader_factory, pushdown_filters: self.pushdown_filters(), - reorder_filters: self.reorder_filters(), force_filter_selections: self.force_filter_selections(), enable_page_index: self.enable_page_index(), enable_bloom_filter: self.bloom_filter_on_read(), @@ -588,7 +610,7 @@ impl FileSource for ParquetSource { } fn filter(&self) -> Option> { - self.predicate.clone() + self.combined_predicate() } fn with_batch_size(&self, batch_size: usize) -> Arc { @@ -641,7 +663,7 @@ impl FileSource for ParquetSource { // the actual predicates are built in reference to the physical schema of // each file, which we do not have at this point and hence cannot use. // Instead, we use the logical schema of the file (the table schema without partition columns). 
- if let Some(predicate) = &self.predicate { + if let Some(predicate) = &self.combined_predicate() { let predicate_creation_errors = Count::new(); if let Some(pruning_predicate) = build_pruning_predicates( Some(predicate), @@ -718,13 +740,16 @@ impl FileSource for ParquetSource { PushedDown::No => None, }) .collect_vec(); - let predicate = match source.predicate { - Some(predicate) => { - conjunction(std::iter::once(predicate).chain(allowed_filters)) - } - None => conjunction(allowed_filters), - }; - source.predicate = Some(predicate); + // Merge existing conjuncts with new allowed filters + let mut all_conjuncts: Vec> = source + .predicate_conjuncts + .as_ref() + .map(|c| c.iter().map(|(_, e)| Arc::clone(e)).collect()) + .unwrap_or_default(); + all_conjuncts.extend(allowed_filters); + // Re-assign FilterIds by index + source.predicate_conjuncts = + Some(all_conjuncts.into_iter().enumerate().collect()); source = source.with_pushdown_filters(pushdown_filters); let source = Arc::new(source); // If pushdown_filters is false we tell our parents that they still have to handle the filters, @@ -835,8 +860,10 @@ impl FileSource for ParquetSource { ) -> datafusion_common::Result { // Visit predicate (filter) expression if present let mut tnr = TreeNodeRecursion::Continue; - if let Some(predicate) = &self.predicate { - tnr = tnr.visit_sibling(|| f(predicate.as_ref()))?; + if let Some(ref conjuncts) = self.predicate_conjuncts { + for (_, expr) in conjuncts { + tnr = tnr.visit_sibling(|| f(expr.as_ref()))?; + } } // Visit projection expressions @@ -861,8 +888,9 @@ mod tests { let parquet_source = ParquetSource::new(Arc::new(Schema::empty())).with_predicate(predicate); - // same value. 
but filter() call Arc::clone internally - assert_eq!(parquet_source.predicate(), parquet_source.filter().as_ref()); + // Both should return equivalent predicates + assert!(parquet_source.predicate().is_some()); + assert!(parquet_source.filter().is_some()); } #[test] diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index a42a1560cb769..9faff1b12320a 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -687,6 +687,156 @@ pub fn is_volatile(expr: &Arc) -> bool { is_volatile } +/// A transparent wrapper that marks a [`PhysicalExpr`] as *optional* — i.e., +/// droppable without affecting query correctness. +/// +/// This is used for filters that are performance hints (e.g., dynamic join +/// filters) as opposed to mandatory predicates. The selectivity tracker can +/// detect this wrapper via `expr.as_any().downcast_ref::()` +/// and choose to drop the filter entirely when it is not cost-effective. +/// +/// All [`PhysicalExpr`] methods are delegated to the wrapped inner expression. +/// +/// Currently used by `HashJoinExec` for dynamic join filters. When the +/// selectivity tracker drops such a filter, the join still enforces +/// correctness independently — "dropped" simply means the filter is never +/// applied as a scan-time optimization. +#[derive(Debug)] +pub struct OptionalFilterPhysicalExpr { + inner: Arc, +} + +impl OptionalFilterPhysicalExpr { + /// Create a new optional filter wrapping the given expression. + pub fn new(inner: Arc) -> Self { + Self { inner } + } + + /// Returns a clone of the inner (unwrapped) expression. 
+ pub fn inner(&self) -> Arc { + Arc::clone(&self.inner) + } +} + +impl Display for OptionalFilterPhysicalExpr { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "Optional({})", self.inner) + } +} + +impl PartialEq for OptionalFilterPhysicalExpr { + fn eq(&self, other: &Self) -> bool { + self.inner.as_ref() == other.inner.as_ref() + } +} + +impl Eq for OptionalFilterPhysicalExpr {} + +impl Hash for OptionalFilterPhysicalExpr { + fn hash(&self, state: &mut H) { + self.inner.as_ref().hash(state); + } +} + +impl PhysicalExpr for OptionalFilterPhysicalExpr { + fn data_type(&self, input_schema: &Schema) -> Result { + self.inner.data_type(input_schema) + } + + fn nullable(&self, input_schema: &Schema) -> Result { + self.inner.nullable(input_schema) + } + + fn evaluate(&self, batch: &RecordBatch) -> Result { + self.inner.evaluate(batch) + } + + fn return_field(&self, input_schema: &Schema) -> Result { + self.inner.return_field(input_schema) + } + + fn evaluate_selection( + &self, + batch: &RecordBatch, + selection: &BooleanArray, + ) -> Result { + self.inner.evaluate_selection(batch, selection) + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.inner] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + assert_eq_or_internal_err!( + children.len(), + 1, + "OptionalFilterPhysicalExpr: expected 1 child" + ); + Ok(Arc::new(OptionalFilterPhysicalExpr::new(Arc::clone( + &children[0], + )))) + } + + fn evaluate_bounds(&self, children: &[&Interval]) -> Result { + self.inner.evaluate_bounds(children) + } + + fn propagate_constraints( + &self, + interval: &Interval, + children: &[&Interval], + ) -> Result>> { + self.inner.propagate_constraints(interval, children) + } + + fn evaluate_statistics(&self, children: &[&Distribution]) -> Result { + self.inner.evaluate_statistics(children) + } + + fn propagate_statistics( + &self, + parent: &Distribution, + children: &[&Distribution], + ) -> Result>> { + 
self.inner.propagate_statistics(parent, children) + } + + fn get_properties(&self, children: &[ExprProperties]) -> Result { + self.inner.get_properties(children) + } + + fn fmt_sql(&self, f: &mut Formatter<'_>) -> fmt::Result { + self.inner.fmt_sql(f) + } + + fn snapshot(&self) -> Result>> { + // Always unwrap the Optional wrapper for snapshot consumers (e.g. PruningPredicate). + // If inner has a snapshot, use it; otherwise return the inner directly. + Ok(Some(match self.inner.snapshot()? { + Some(snap) => snap, + None => Arc::clone(&self.inner), + })) + } + + fn snapshot_generation(&self) -> u64 { + // The wrapper itself is not dynamic; tree-walking picks up + // inner's generation via children(). + 0 + } + + fn is_volatile_node(&self) -> bool { + self.inner.is_volatile_node() + } + + fn placement(&self) -> ExpressionPlacement { + self.inner.placement() + } +} + #[cfg(test)] mod test { use crate::physical_expr::PhysicalExpr; @@ -694,6 +844,7 @@ mod test { use arrow::datatypes::{DataType, Schema}; use datafusion_expr_common::columnar_value::ColumnarValue; use std::fmt::{Display, Formatter}; + use std::hash::{Hash, Hasher}; use std::sync::Arc; #[derive(Debug, PartialEq, Eq, Hash)] @@ -868,4 +1019,104 @@ mod test { &BooleanArray::from(vec![true; 5]), ); } + + #[test] + fn test_optional_filter_downcast() { + use super::OptionalFilterPhysicalExpr; + + let inner: Arc = Arc::new(TestExpr {}); + let optional = Arc::new(OptionalFilterPhysicalExpr::new(Arc::clone(&inner))); + + // Can downcast to detect the wrapper + let as_physical: Arc = optional; + assert!( + as_physical + .downcast_ref::() + .is_some() + ); + + // Inner expr is NOT detectable as optional + assert!(inner.downcast_ref::().is_none()); + } + + #[test] + fn test_optional_filter_delegates_evaluate() { + use super::OptionalFilterPhysicalExpr; + + let inner: Arc = Arc::new(TestExpr {}); + let optional = OptionalFilterPhysicalExpr::new(Arc::clone(&inner)); + + let batch = + unsafe { 
RecordBatch::new_unchecked(Arc::new(Schema::empty()), vec![], 5) }; + let result = optional.evaluate(&batch).unwrap(); + let array = result.to_array(5).unwrap(); + assert_eq!(array.len(), 5); + } + + #[test] + fn test_optional_filter_children_and_with_new_children() { + use super::OptionalFilterPhysicalExpr; + + let inner: Arc = Arc::new(TestExpr {}); + let optional = Arc::new(OptionalFilterPhysicalExpr::new(Arc::clone(&inner))); + + // children() returns the inner + let children = optional.children(); + assert_eq!(children.len(), 1); + + // with_new_children preserves the wrapper + let new_inner: Arc = Arc::new(TestExpr {}); + let rewrapped = Arc::clone(&optional) + .with_new_children(vec![new_inner]) + .unwrap(); + assert!( + rewrapped + .downcast_ref::() + .is_some() + ); + } + + #[test] + fn test_optional_filter_inner() { + use super::OptionalFilterPhysicalExpr; + + let inner: Arc = Arc::new(TestExpr {}); + let optional = OptionalFilterPhysicalExpr::new(Arc::clone(&inner)); + + // inner() returns a clone of the wrapped expression + let unwrapped = optional.inner(); + assert!(unwrapped.downcast_ref::().is_some()); + } + + #[test] + fn test_optional_filter_snapshot_generation_zero() { + use super::OptionalFilterPhysicalExpr; + + let inner: Arc = Arc::new(TestExpr {}); + let optional = OptionalFilterPhysicalExpr::new(inner); + + assert_eq!(optional.snapshot_generation(), 0); + } + + #[test] + fn test_optional_filter_eq_hash() { + use super::OptionalFilterPhysicalExpr; + use std::collections::hash_map::DefaultHasher; + + let inner1: Arc = Arc::new(TestExpr {}); + let inner2: Arc = Arc::new(TestExpr {}); + + let opt1 = OptionalFilterPhysicalExpr::new(inner1); + let opt2 = OptionalFilterPhysicalExpr::new(inner2); + + // Same inner type → equal + assert_eq!(opt1, opt2); + + // Same hash + let mut h1 = DefaultHasher::new(); + let mut h2 = DefaultHasher::new(); + opt1.hash(&mut h1); + opt2.hash(&mut h2); + assert_eq!(h1.finish(), h2.finish()); + } } diff --git 
a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs index 735375441f549..2a45ca26ee9dc 100644 --- a/datafusion/physical-plan/src/joins/hash_join/exec.rs +++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs @@ -90,7 +90,9 @@ use datafusion_physical_expr::projection::{ProjectionRef, combine_projections}; use datafusion_physical_expr::{PhysicalExpr, PhysicalExprRef}; use datafusion_common::hash_utils::RandomState; -use datafusion_physical_expr_common::physical_expr::fmt_sql; +use datafusion_physical_expr_common::physical_expr::{ + OptionalFilterPhysicalExpr, fmt_sql, +}; use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays; use futures::TryStreamExt; use parking_lot::Mutex; @@ -1645,9 +1647,12 @@ impl ExecutionPlan for HashJoinExec { if phase == FilterPushdownPhase::Post && self.allow_join_dynamic_filter_pushdown(config) { - // Add actual dynamic filter to right side (probe side) + // Add actual dynamic filter to right side (probe side), + // wrapped as optional so it can be dropped if ineffective. 
let dynamic_filter = Self::create_dynamic_filter(&self.on); - right_child = right_child.with_self_filter(dynamic_filter); + let wrapped: Arc = + Arc::new(OptionalFilterPhysicalExpr::new(dynamic_filter)); + right_child = right_child.with_self_filter(wrapped); } Ok(FilterDescription::new() @@ -1669,8 +1674,12 @@ impl ExecutionPlan for HashJoinExec { // Note that we don't check PushdDownPredicate::discrimnant because even if nothing said // "yes, I can fully evaluate this filter" things might still use it for statistics -> it's worth updating let predicate = Arc::clone(&filter.predicate); - if let Ok(dynamic_filter) = - Arc::downcast::(predicate) + // Unwrap OptionalFilterPhysicalExpr if present to get the inner DynamicFilterPhysicalExpr + let inner = predicate + .downcast_ref::() + .map(|opt| opt.inner()) + .unwrap_or(predicate); + if let Ok(dynamic_filter) = Arc::downcast::(inner) { // We successfully pushed down our self filter - we need to make a new node with the dynamic filter let new_node = self diff --git a/datafusion/proto-common/proto/datafusion_common.proto b/datafusion/proto-common/proto/datafusion_common.proto index 31ece63577b4f..b159742f3697d 100644 --- a/datafusion/proto-common/proto/datafusion_common.proto +++ b/datafusion/proto-common/proto/datafusion_common.proto @@ -535,7 +535,7 @@ message ParquetOptions { bool pruning = 2; // default = true bool skip_metadata = 3; // default = true bool pushdown_filters = 5; // default = false - bool reorder_filters = 6; // default = false + reserved 6; // was reorder_filters bool force_filter_selections = 34; // default = false uint64 data_pagesize_limit = 7; // default = 1024 * 1024 uint64 write_batch_size = 8; // default = 1024 @@ -605,6 +605,18 @@ message ParquetOptions { } CdcOptions content_defined_chunking = 35; + + oneof filter_collecting_byte_ratio_threshold_opt { + double filter_collecting_byte_ratio_threshold = 40; + } + + oneof filter_confidence_z_opt { + double filter_confidence_z = 41; + } + + oneof 
filter_pushdown_min_bytes_per_sec_opt { + double filter_pushdown_min_bytes_per_sec = 42; + } } message CdcOptions { diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index 4b7a91f38c201..4804a9605bd44 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -1023,7 +1023,7 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions { }) .unwrap_or(None), pushdown_filters: value.pushdown_filters, - reorder_filters: value.reorder_filters, + force_filter_selections: value.force_filter_selections, data_pagesize_limit: value.data_pagesize_limit as usize, write_batch_size: value.write_batch_size as usize, @@ -1092,14 +1092,20 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions { use_content_defined_chunking: value.content_defined_chunking.map(|cdc| { let defaults = CdcOptions::default(); CdcOptions { - // proto3 uses 0 as the wire default for uint64; a zero chunk size is - // invalid, so treat it as "field not set" and fall back to the default. min_chunk_size: if cdc.min_chunk_size != 0 { cdc.min_chunk_size as usize } else { defaults.min_chunk_size }, max_chunk_size: if cdc.max_chunk_size != 0 { cdc.max_chunk_size as usize } else { defaults.max_chunk_size }, - // norm_level = 0 is a valid value (and the default), so pass it through directly. 
norm_level: cdc.norm_level, } }), + filter_pushdown_min_bytes_per_sec: value.filter_pushdown_min_bytes_per_sec_opt.map(|opt| match opt { + protobuf::parquet_options::FilterPushdownMinBytesPerSecOpt::FilterPushdownMinBytesPerSec(v) => v, + }).unwrap_or(f64::INFINITY), + filter_collecting_byte_ratio_threshold: value.filter_collecting_byte_ratio_threshold_opt.map(|opt| match opt { + protobuf::parquet_options::FilterCollectingByteRatioThresholdOpt::FilterCollectingByteRatioThreshold(v) => v, + }).unwrap_or(0.2), + filter_confidence_z: value.filter_confidence_z_opt.map(|opt| match opt { + protobuf::parquet_options::FilterConfidenceZOpt::FilterConfidenceZ(v) => v, + }).unwrap_or(2.0), }) } } diff --git a/datafusion/proto-common/src/generated/pbjson.rs b/datafusion/proto-common/src/generated/pbjson.rs index 77a3b71488ece..5ded52808966d 100644 --- a/datafusion/proto-common/src/generated/pbjson.rs +++ b/datafusion/proto-common/src/generated/pbjson.rs @@ -5782,9 +5782,6 @@ impl serde::Serialize for ParquetOptions { if self.pushdown_filters { len += 1; } - if self.reorder_filters { - len += 1; - } if self.force_filter_selections { len += 1; } @@ -5869,6 +5866,15 @@ impl serde::Serialize for ParquetOptions { if self.max_predicate_cache_size_opt.is_some() { len += 1; } + if self.filter_collecting_byte_ratio_threshold_opt.is_some() { + len += 1; + } + if self.filter_confidence_z_opt.is_some() { + len += 1; + } + if self.filter_pushdown_min_bytes_per_sec_opt.is_some() { + len += 1; + } let mut struct_ser = serializer.serialize_struct("datafusion_common.ParquetOptions", len)?; if self.enable_page_index { struct_ser.serialize_field("enablePageIndex", &self.enable_page_index)?; @@ -5882,9 +5888,6 @@ impl serde::Serialize for ParquetOptions { if self.pushdown_filters { struct_ser.serialize_field("pushdownFilters", &self.pushdown_filters)?; } - if self.reorder_filters { - struct_ser.serialize_field("reorderFilters", &self.reorder_filters)?; - } if self.force_filter_selections { 
struct_ser.serialize_field("forceFilterSelections", &self.force_filter_selections)?; } @@ -6037,6 +6040,27 @@ impl serde::Serialize for ParquetOptions { } } } + if let Some(v) = self.filter_collecting_byte_ratio_threshold_opt.as_ref() { + match v { + parquet_options::FilterCollectingByteRatioThresholdOpt::FilterCollectingByteRatioThreshold(v) => { + struct_ser.serialize_field("filterCollectingByteRatioThreshold", v)?; + } + } + } + if let Some(v) = self.filter_confidence_z_opt.as_ref() { + match v { + parquet_options::FilterConfidenceZOpt::FilterConfidenceZ(v) => { + struct_ser.serialize_field("filterConfidenceZ", v)?; + } + } + } + if let Some(v) = self.filter_pushdown_min_bytes_per_sec_opt.as_ref() { + match v { + parquet_options::FilterPushdownMinBytesPerSecOpt::FilterPushdownMinBytesPerSec(v) => { + struct_ser.serialize_field("filterPushdownMinBytesPerSec", v)?; + } + } + } struct_ser.end() } } @@ -6054,8 +6078,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "skipMetadata", "pushdown_filters", "pushdownFilters", - "reorder_filters", - "reorderFilters", "force_filter_selections", "forceFilterSelections", "data_pagesize_limit", @@ -6110,6 +6132,12 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "coerceInt96", "max_predicate_cache_size", "maxPredicateCacheSize", + "filter_collecting_byte_ratio_threshold", + "filterCollectingByteRatioThreshold", + "filter_confidence_z", + "filterConfidenceZ", + "filter_pushdown_min_bytes_per_sec", + "filterPushdownMinBytesPerSec", ]; #[allow(clippy::enum_variant_names)] @@ -6118,7 +6146,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { Pruning, SkipMetadata, PushdownFilters, - ReorderFilters, ForceFilterSelections, DataPagesizeLimit, WriteBatchSize, @@ -6147,6 +6174,9 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { BloomFilterNdv, CoerceInt96, MaxPredicateCacheSize, + FilterCollectingByteRatioThreshold, + FilterConfidenceZ, + FilterPushdownMinBytesPerSec, } impl<'de> serde::Deserialize<'de> 
for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -6172,7 +6202,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "pruning" => Ok(GeneratedField::Pruning), "skipMetadata" | "skip_metadata" => Ok(GeneratedField::SkipMetadata), "pushdownFilters" | "pushdown_filters" => Ok(GeneratedField::PushdownFilters), - "reorderFilters" | "reorder_filters" => Ok(GeneratedField::ReorderFilters), "forceFilterSelections" | "force_filter_selections" => Ok(GeneratedField::ForceFilterSelections), "dataPagesizeLimit" | "data_pagesize_limit" => Ok(GeneratedField::DataPagesizeLimit), "writeBatchSize" | "write_batch_size" => Ok(GeneratedField::WriteBatchSize), @@ -6201,6 +6230,9 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv), "coerceInt96" | "coerce_int96" => Ok(GeneratedField::CoerceInt96), "maxPredicateCacheSize" | "max_predicate_cache_size" => Ok(GeneratedField::MaxPredicateCacheSize), + "filterCollectingByteRatioThreshold" | "filter_collecting_byte_ratio_threshold" => Ok(GeneratedField::FilterCollectingByteRatioThreshold), + "filterConfidenceZ" | "filter_confidence_z" => Ok(GeneratedField::FilterConfidenceZ), + "filterPushdownMinBytesPerSec" | "filter_pushdown_min_bytes_per_sec" => Ok(GeneratedField::FilterPushdownMinBytesPerSec), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -6224,7 +6256,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { let mut pruning__ = None; let mut skip_metadata__ = None; let mut pushdown_filters__ = None; - let mut reorder_filters__ = None; let mut force_filter_selections__ = None; let mut data_pagesize_limit__ = None; let mut write_batch_size__ = None; @@ -6253,6 +6284,9 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { let mut bloom_filter_ndv_opt__ = None; let mut coerce_int96_opt__ = None; let mut max_predicate_cache_size_opt__ = None; + let mut filter_collecting_byte_ratio_threshold_opt__ = None; 
+ let mut filter_confidence_z_opt__ = None; + let mut filter_pushdown_min_bytes_per_sec_opt__ = None; while let Some(k) = map_.next_key()? { match k { GeneratedField::EnablePageIndex => { @@ -6279,12 +6313,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { } pushdown_filters__ = Some(map_.next_value()?); } - GeneratedField::ReorderFilters => { - if reorder_filters__.is_some() { - return Err(serde::de::Error::duplicate_field("reorderFilters")); - } - reorder_filters__ = Some(map_.next_value()?); - } GeneratedField::ForceFilterSelections => { if force_filter_selections__.is_some() { return Err(serde::de::Error::duplicate_field("forceFilterSelections")); @@ -6467,6 +6495,24 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { } max_predicate_cache_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(x.0)); } + GeneratedField::FilterCollectingByteRatioThreshold => { + if filter_collecting_byte_ratio_threshold_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("filterCollectingByteRatioThreshold")); + } + filter_collecting_byte_ratio_threshold_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::FilterCollectingByteRatioThresholdOpt::FilterCollectingByteRatioThreshold(x.0)); + } + GeneratedField::FilterConfidenceZ => { + if filter_confidence_z_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("filterConfidenceZ")); + } + filter_confidence_z_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::FilterConfidenceZOpt::FilterConfidenceZ(x.0)); + } + GeneratedField::FilterPushdownMinBytesPerSec => { + if filter_pushdown_min_bytes_per_sec_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("filterPushdownMinBytesPerSec")); + } + filter_pushdown_min_bytes_per_sec_opt__ = 
map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::FilterPushdownMinBytesPerSecOpt::FilterPushdownMinBytesPerSec(x.0)); + } } } Ok(ParquetOptions { @@ -6474,7 +6520,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { pruning: pruning__.unwrap_or_default(), skip_metadata: skip_metadata__.unwrap_or_default(), pushdown_filters: pushdown_filters__.unwrap_or_default(), - reorder_filters: reorder_filters__.unwrap_or_default(), force_filter_selections: force_filter_selections__.unwrap_or_default(), data_pagesize_limit: data_pagesize_limit__.unwrap_or_default(), write_batch_size: write_batch_size__.unwrap_or_default(), @@ -6503,6 +6548,9 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { bloom_filter_ndv_opt: bloom_filter_ndv_opt__, coerce_int96_opt: coerce_int96_opt__, max_predicate_cache_size_opt: max_predicate_cache_size_opt__, + filter_collecting_byte_ratio_threshold_opt: filter_collecting_byte_ratio_threshold_opt__, + filter_confidence_z_opt: filter_confidence_z_opt__, + filter_pushdown_min_bytes_per_sec_opt: filter_pushdown_min_bytes_per_sec_opt__, }) } } diff --git a/datafusion/proto-common/src/generated/prost.rs b/datafusion/proto-common/src/generated/prost.rs index 1251a51ab0983..a712486fb8438 100644 --- a/datafusion/proto-common/src/generated/prost.rs +++ b/datafusion/proto-common/src/generated/prost.rs @@ -790,9 +790,6 @@ pub struct ParquetOptions { #[prost(bool, tag = "5")] pub pushdown_filters: bool, /// default = false - #[prost(bool, tag = "6")] - pub reorder_filters: bool, - /// default = false #[prost(bool, tag = "34")] pub force_filter_selections: bool, /// default = 1024 * 1024 @@ -874,6 +871,21 @@ pub struct ParquetOptions { pub max_predicate_cache_size_opt: ::core::option::Option< parquet_options::MaxPredicateCacheSizeOpt, >, + #[prost( + oneof = "parquet_options::FilterCollectingByteRatioThresholdOpt", + tags = "40" + )] + pub filter_collecting_byte_ratio_threshold_opt: 
::core::option::Option< + parquet_options::FilterCollectingByteRatioThresholdOpt, + >, + #[prost(oneof = "parquet_options::FilterConfidenceZOpt", tags = "41")] + pub filter_confidence_z_opt: ::core::option::Option< + parquet_options::FilterConfidenceZOpt, + >, + #[prost(oneof = "parquet_options::FilterPushdownMinBytesPerSecOpt", tags = "42")] + pub filter_pushdown_min_bytes_per_sec_opt: ::core::option::Option< + parquet_options::FilterPushdownMinBytesPerSecOpt, + >, } /// Nested message and enum types in `ParquetOptions`. pub mod parquet_options { @@ -932,6 +944,21 @@ pub mod parquet_options { #[prost(uint64, tag = "33")] MaxPredicateCacheSize(u64), } + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + pub enum FilterCollectingByteRatioThresholdOpt { + #[prost(double, tag = "40")] + FilterCollectingByteRatioThreshold(f64), + } + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + pub enum FilterConfidenceZOpt { + #[prost(double, tag = "41")] + FilterConfidenceZ(f64), + } + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + pub enum FilterPushdownMinBytesPerSecOpt { + #[prost(double, tag = "42")] + FilterPushdownMinBytesPerSec(f64), + } } #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct CdcOptions { diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index 65089f029b866..6f2e5d2b4faa6 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -877,7 +877,7 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions { skip_metadata: value.skip_metadata, metadata_size_hint_opt: value.metadata_size_hint.map(|v| protobuf::parquet_options::MetadataSizeHintOpt::MetadataSizeHint(v as u64)), pushdown_filters: value.pushdown_filters, - reorder_filters: value.reorder_filters, + force_filter_selections: value.force_filter_selections, data_pagesize_limit: value.data_pagesize_limit as u64, write_batch_size: value.write_batch_size as u64, @@ -911,6 
+911,9 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions { norm_level: cdc.norm_level, } ), + filter_pushdown_min_bytes_per_sec_opt: Some(protobuf::parquet_options::FilterPushdownMinBytesPerSecOpt::FilterPushdownMinBytesPerSec(value.filter_pushdown_min_bytes_per_sec)), + filter_collecting_byte_ratio_threshold_opt: Some(protobuf::parquet_options::FilterCollectingByteRatioThresholdOpt::FilterCollectingByteRatioThreshold(value.filter_collecting_byte_ratio_threshold)), + filter_confidence_z_opt: Some(protobuf::parquet_options::FilterConfidenceZOpt::FilterConfidenceZ(value.filter_confidence_z)), }) } } diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs b/datafusion/proto/src/generated/datafusion_proto_common.rs index 1251a51ab0983..a712486fb8438 100644 --- a/datafusion/proto/src/generated/datafusion_proto_common.rs +++ b/datafusion/proto/src/generated/datafusion_proto_common.rs @@ -790,9 +790,6 @@ pub struct ParquetOptions { #[prost(bool, tag = "5")] pub pushdown_filters: bool, /// default = false - #[prost(bool, tag = "6")] - pub reorder_filters: bool, - /// default = false #[prost(bool, tag = "34")] pub force_filter_selections: bool, /// default = 1024 * 1024 @@ -874,6 +871,21 @@ pub struct ParquetOptions { pub max_predicate_cache_size_opt: ::core::option::Option< parquet_options::MaxPredicateCacheSizeOpt, >, + #[prost( + oneof = "parquet_options::FilterCollectingByteRatioThresholdOpt", + tags = "40" + )] + pub filter_collecting_byte_ratio_threshold_opt: ::core::option::Option< + parquet_options::FilterCollectingByteRatioThresholdOpt, + >, + #[prost(oneof = "parquet_options::FilterConfidenceZOpt", tags = "41")] + pub filter_confidence_z_opt: ::core::option::Option< + parquet_options::FilterConfidenceZOpt, + >, + #[prost(oneof = "parquet_options::FilterPushdownMinBytesPerSecOpt", tags = "42")] + pub filter_pushdown_min_bytes_per_sec_opt: ::core::option::Option< + parquet_options::FilterPushdownMinBytesPerSecOpt, + >, } /// Nested message 
and enum types in `ParquetOptions`. pub mod parquet_options { @@ -932,6 +944,21 @@ pub mod parquet_options { #[prost(uint64, tag = "33")] MaxPredicateCacheSize(u64), } + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + pub enum FilterCollectingByteRatioThresholdOpt { + #[prost(double, tag = "40")] + FilterCollectingByteRatioThreshold(f64), + } + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + pub enum FilterConfidenceZOpt { + #[prost(double, tag = "41")] + FilterConfidenceZ(f64), + } + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + pub enum FilterPushdownMinBytesPerSecOpt { + #[prost(double, tag = "42")] + FilterPushdownMinBytesPerSec(f64), + } } #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct CdcOptions { diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs index 4b111a49e42a0..8a31af28bfbdb 100644 --- a/datafusion/proto/src/logical_plan/file_formats.rs +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -377,7 +377,7 @@ mod parquet { parquet_options::MetadataSizeHintOpt::MetadataSizeHint(size as u64) }), pushdown_filters: global_options.global.pushdown_filters, - reorder_filters: global_options.global.reorder_filters, + force_filter_selections: global_options.global.force_filter_selections, data_pagesize_limit: global_options.global.data_pagesize_limit as u64, write_batch_size: global_options.global.write_batch_size as u64, @@ -431,6 +431,9 @@ mod parquet { norm_level: cdc.norm_level, } }), + filter_pushdown_min_bytes_per_sec_opt: Some(parquet_options::FilterPushdownMinBytesPerSecOpt::FilterPushdownMinBytesPerSec(global_options.global.filter_pushdown_min_bytes_per_sec)), + filter_collecting_byte_ratio_threshold_opt: Some(parquet_options::FilterCollectingByteRatioThresholdOpt::FilterCollectingByteRatioThreshold(global_options.global.filter_collecting_byte_ratio_threshold)), + filter_confidence_z_opt: 
Some(parquet_options::FilterConfidenceZOpt::FilterConfidenceZ(global_options.global.filter_confidence_z)), }), column_specific_options: column_specific_options.into_iter().map(|(column_name, options)| { ParquetColumnSpecificOptions { @@ -480,7 +483,7 @@ mod parquet { parquet_options::MetadataSizeHintOpt::MetadataSizeHint(size) => *size as usize, }), pushdown_filters: proto.pushdown_filters, - reorder_filters: proto.reorder_filters, + force_filter_selections: proto.force_filter_selections, data_pagesize_limit: proto.data_pagesize_limit as usize, write_batch_size: proto.write_batch_size as usize, @@ -533,14 +536,20 @@ mod parquet { use_content_defined_chunking: proto.content_defined_chunking.map(|cdc| { let defaults = CdcOptions::default(); CdcOptions { - // proto3 uses 0 as the wire default for uint64; a zero chunk size is - // invalid, so treat it as "field not set" and fall back to the default. min_chunk_size: if cdc.min_chunk_size != 0 { cdc.min_chunk_size as usize } else { defaults.min_chunk_size }, max_chunk_size: if cdc.max_chunk_size != 0 { cdc.max_chunk_size as usize } else { defaults.max_chunk_size }, - // norm_level = 0 is a valid value (and the default), so pass it through directly. 
norm_level: cdc.norm_level, } }), + filter_pushdown_min_bytes_per_sec: proto.filter_pushdown_min_bytes_per_sec_opt.as_ref().map(|opt| match opt { + parquet_options::FilterPushdownMinBytesPerSecOpt::FilterPushdownMinBytesPerSec(v) => *v, + }).unwrap_or(f64::INFINITY), + filter_collecting_byte_ratio_threshold: proto.filter_collecting_byte_ratio_threshold_opt.as_ref().map(|opt| match opt { + parquet_options::FilterCollectingByteRatioThresholdOpt::FilterCollectingByteRatioThreshold(v) => *v, + }).unwrap_or(0.2), + filter_confidence_z: proto.filter_confidence_z_opt.as_ref().map(|opt| match opt { + parquet_options::FilterConfidenceZOpt::FilterConfidenceZ(v) => *v, + }).unwrap_or(2.0), } } } diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index 206f4378d3d3b..6d8552ded1016 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -863,6 +863,12 @@ impl protobuf::PhysicalPlanNode { let reader_factory = Arc::new(CachedParquetFileReaderFactory::new(store, metadata_cache)); + options.global.filter_pushdown_min_bytes_per_sec = ctx + .session_config() + .options() + .execution + .parquet + .filter_pushdown_min_bytes_per_sec; let mut source = ParquetSource::new(table_schema) .with_parquet_file_reader_factory(reader_factory) .with_table_parquet_options(options); diff --git a/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt b/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt index 179a611d37e1f..9fa53bb5a3a4d 100644 --- a/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt +++ b/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt @@ -157,7 +157,7 @@ logical_plan physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[id@2, data@3, info@1] 02)--DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ] +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # Disable Join dynamic filter pushdown statement ok @@ -235,7 +235,7 @@ logical_plan physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Left, on=[(id@0, id@0)], projection=[id@2, data@3, info@1] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ] +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # RIGHT JOIN correctness: all right rows appear, unmatched left rows produce NULLs query ITT @@ -284,7 +284,7 @@ logical_plan physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(id@0, id@0)] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id], file_type=parquet -03)--DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ] +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # LEFT SEMI JOIN (physical LeftSemi): reverse table roles so optimizer keeps LeftSemi # (right_parquet has 3 rows < left_parquet has 5 rows, so no swap occurs). @@ -304,7 +304,7 @@ logical_plan physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(id@0, id@0)] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ] +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # LEFT SEMI (physical LeftSemi) correctness: only right rows with matching left ids query IT rowsort @@ -337,8 +337,8 @@ physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(id@0, id@0)] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id], file_type=parquet 03)--SortExec: expr=[data@1 DESC], preserve_partitioning=[false] -04)----FilterExec: DynamicFilter [ empty ] -05)------DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ] +04)----FilterExec: Optional(DynamicFilter [ empty ]) +05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) statement count 0 SET datafusion.execution.parquet.pushdown_filters = true; @@ -361,7 +361,7 @@ physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(id@0, id@0)] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id], file_type=parquet 03)--SortExec: expr=[data@1 DESC], preserve_partitioning=[false] -04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ] +04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) statement count 0 RESET datafusion.execution.parquet.pushdown_filters; @@ -383,7 +383,7 @@ logical_plan physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(id@0, id@0)] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id], file_type=parquet, 
predicate=DynamicFilter [ empty ] +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # LEFT MARK JOIN: the OR prevents decorrelation to LeftSemi, so the optimizer # uses LeftMark. Self-generated dynamic filter pushes to the probe side. @@ -407,7 +407,7 @@ physical_plan 02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)] 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet -05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ] +05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # LEFT MARK correctness: all right rows match EXISTS, so all 3 appear query IT rowsort @@ -445,7 +445,7 @@ physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(id@0, id@0)] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ] -04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, 
predicate=DynamicFilter [ empty ] AND DynamicFilter [ empty ] +04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) AND DynamicFilter [ empty ] # Correctness check query IT @@ -479,7 +479,7 @@ physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(id@0, id@0)] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ] -04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ] AND DynamicFilter [ empty ] +04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) AND DynamicFilter [ empty ] # Correctness check query IT @@ -516,7 +516,7 @@ logical_plan physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[id@2, data@3, info@1] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ] +03)--DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # Enable TopK, disable Join statement ok @@ -736,7 +736,7 @@ logical_plan physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[id@2, data@3, info@1] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ] +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # Test 6: Regression test for issue #20213 - dynamic filter applied to wrong table # when subquery join has same column names on both sides. 
diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index b04c78bd2774c..889602b7d60a4 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -245,6 +245,9 @@ datafusion.execution.parquet.dictionary_enabled true datafusion.execution.parquet.dictionary_page_size_limit 1048576 datafusion.execution.parquet.enable_page_index true datafusion.execution.parquet.encoding NULL +datafusion.execution.parquet.filter_collecting_byte_ratio_threshold 0.2 +datafusion.execution.parquet.filter_confidence_z 2 +datafusion.execution.parquet.filter_pushdown_min_bytes_per_sec 104857600 datafusion.execution.parquet.force_filter_selections false datafusion.execution.parquet.max_predicate_cache_size NULL datafusion.execution.parquet.max_row_group_size 1048576 @@ -253,7 +256,6 @@ datafusion.execution.parquet.maximum_parallel_row_group_writers 1 datafusion.execution.parquet.metadata_size_hint 524288 datafusion.execution.parquet.pruning true datafusion.execution.parquet.pushdown_filters false -datafusion.execution.parquet.reorder_filters false datafusion.execution.parquet.schema_force_view_types true datafusion.execution.parquet.skip_arrow_metadata false datafusion.execution.parquet.skip_metadata true @@ -392,6 +394,9 @@ datafusion.execution.parquet.dictionary_enabled true (writing) Sets if dictionar datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets best effort maximum dictionary page size, in bytes datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. datafusion.execution.parquet.encoding NULL (writing) Sets default encoding for any column. 
Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting +datafusion.execution.parquet.filter_collecting_byte_ratio_threshold 0.2 (reading) Byte-ratio threshold for applying filters one at a time (iterative pruning; aka row-level) vs. all at once (post-scan). The ratio is computed as: (extra filter bytes not in projection) / (projected bytes). Filters whose extra columns consume a smaller fraction than this threshold are placed as row filters. Ratio of filter column bytes to projection bytes that controls initial filter placement. Computed as `filter_compressed_bytes / projection_compressed_bytes`. Filters below this ratio start as row-level filters (enabling late materialization); those above start as post-scan filters. Default: 0.20 — filters whose columns are less than 20% of the projection bytes start at row-level. **Interaction with `pushdown_filters`:** Only takes effect when `pushdown_filters = true`. +datafusion.execution.parquet.filter_confidence_z 2 (reading) Z-score for confidence intervals on filter effectiveness. Controls how much statistical evidence is required before promoting or demoting a filter. Lower values = faster decisions with less confidence. Higher values = more conservative, requiring more data. Default: 2.0 (~95% confidence). **Interaction with `pushdown_filters`:** Only takes effect when `pushdown_filters = true`. +datafusion.execution.parquet.filter_pushdown_min_bytes_per_sec 104857600 (reading) Minimum bytes/sec throughput for adaptive filter pushdown. Filters that achieve at least this throughput (bytes_saved / eval_time) are promoted to row filters. f64::INFINITY = no filters promoted (feature disabled). 0.0 = all filters pushed as row filters (no adaptive logic). 
Default: 104,857,600 bytes/sec (100 MiB/sec), empirically chosen based on TPC-H, TPC-DS, and ClickBench benchmarks on an m4 MacBook Pro. The optimal value for this setting likely depends on the relative cost of CPU vs. IO in your environment, and to some extent the shape of your query. **Interaction with `pushdown_filters`:** This option only takes effect when `pushdown_filters = true`. When pushdown is disabled, all filters run post-scan and this threshold is ignored. datafusion.execution.parquet.force_filter_selections false (reading) Force the use of RowSelections for filter results, when pushdown_filters is enabled. If false, the reader will automatically choose between a RowSelection and a Bitmap based on the number and pattern of selected rows. datafusion.execution.parquet.max_predicate_cache_size NULL (reading) The maximum predicate cache size, in bytes. When `pushdown_filters` is enabled, sets the maximum memory used to cache the results of predicate evaluation between filter evaluation and output generation. Decreasing this value will reduce memory usage, but may increase IO and CPU usage. None means use the default parquet reader setting. 0 means no caching. datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. @@ -400,7 +405,6 @@ datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By d datafusion.execution.parquet.metadata_size_hint 524288 (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer Default setting to 512 KiB, which should be sufficient for most parquet files, it can reduce one I/O operation per parquet file. 
If the metadata is larger than the hint, two reads will still be performed. datafusion.execution.parquet.pruning true (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file datafusion.execution.parquet.pushdown_filters false (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization". -datafusion.execution.parquet.reorder_filters false (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query datafusion.execution.parquet.schema_force_view_types true (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`. datafusion.execution.parquet.skip_arrow_metadata false (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to datafusion.execution.parquet.skip_metadata true (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. 
This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index e0be63fe71525..f25c22ee88df3 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -2865,7 +2865,7 @@ physical_plan 02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)] 03)----DataSourceExec: partitions=1, partition_sizes=[1] 04)----SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true] -05)------FilterExec: DynamicFilter [ empty ] +05)------FilterExec: Optional(DynamicFilter [ empty ]) 06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 07)----------DataSourceExec: partitions=1, partition_sizes=[1] @@ -2901,7 +2901,7 @@ physical_plan 02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)] 03)----DataSourceExec: partitions=1, partition_sizes=[1] 04)----SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true] -05)------FilterExec: DynamicFilter [ empty ] +05)------FilterExec: Optional(DynamicFilter [ empty ]) 06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 07)----------DataSourceExec: partitions=1, partition_sizes=[1] @@ -2958,7 +2958,7 @@ physical_plan 02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)] 03)----DataSourceExec: partitions=1, partition_sizes=[1] 04)----SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true] -05)------FilterExec: DynamicFilter [ empty ] +05)------FilterExec: Optional(DynamicFilter [ empty ]) 06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 07)----------DataSourceExec: partitions=1, partition_sizes=[1] @@ -2994,7 +2994,7 @@ physical_plan 02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, 
t1_id@0)] 03)----DataSourceExec: partitions=1, partition_sizes=[1] 04)----SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true] -05)------FilterExec: DynamicFilter [ empty ] +05)------FilterExec: Optional(DynamicFilter [ empty ]) 06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 07)----------DataSourceExec: partitions=1, partition_sizes=[1] @@ -3052,7 +3052,7 @@ physical_plan 02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)], filter=t2_name@1 != t1_name@0 03)----DataSourceExec: partitions=1, partition_sizes=[1] 04)----SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true] -05)------FilterExec: DynamicFilter [ empty ] +05)------FilterExec: Optional(DynamicFilter [ empty ]) 06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 07)----------DataSourceExec: partitions=1, partition_sizes=[1] @@ -3069,7 +3069,7 @@ physical_plan 02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)], filter=t2_name@0 != t1_name@1 03)----DataSourceExec: partitions=1, partition_sizes=[1] 04)----SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true] -05)------FilterExec: DynamicFilter [ empty ] +05)------FilterExec: Optional(DynamicFilter [ empty ]) 06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 07)----------DataSourceExec: partitions=1, partition_sizes=[1] @@ -3124,7 +3124,7 @@ physical_plan 02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)], filter=t2_name@1 != t1_name@0 03)----DataSourceExec: partitions=1, partition_sizes=[1] 04)----SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true] -05)------FilterExec: DynamicFilter [ empty ] +05)------FilterExec: Optional(DynamicFilter [ empty ]) 06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 07)----------DataSourceExec: partitions=1, partition_sizes=[1] @@ -3141,7 +3141,7 @@ 
physical_plan 02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)], filter=t2_name@0 != t1_name@1 03)----DataSourceExec: partitions=1, partition_sizes=[1] 04)----SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true] -05)------FilterExec: DynamicFilter [ empty ] +05)------FilterExec: Optional(DynamicFilter [ empty ]) 06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 07)----------DataSourceExec: partitions=1, partition_sizes=[1] @@ -4149,7 +4149,7 @@ physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(b@1, y@1)], filter=a@0 < x@1 02)--DataSourceExec: partitions=1, partition_sizes=[0] 03)--SortExec: expr=[x@0 ASC NULLS LAST], preserve_partitioning=[false] -04)----FilterExec: DynamicFilter [ empty ] +04)----FilterExec: Optional(DynamicFilter [ empty ]) 05)------DataSourceExec: partitions=1, partition_sizes=[0] # Test full join with limit @@ -4452,7 +4452,7 @@ physical_plan 04)------FilterExec: b@1 > 3, projection=[a@0] 05)--------DataSourceExec: partitions=2, partition_sizes=[1, 1] 06)----SortExec: expr=[c@2 DESC], preserve_partitioning=[true] -07)------FilterExec: DynamicFilter [ empty ] +07)------FilterExec: Optional(DynamicFilter [ empty ]) 08)--------DataSourceExec: partitions=2, partition_sizes=[1, 1] query TT @@ -4473,7 +4473,7 @@ physical_plan 04)------FilterExec: b@1 > 3, projection=[a@0] 05)--------DataSourceExec: partitions=2, partition_sizes=[1, 1] 06)----SortExec: expr=[c@2 DESC NULLS LAST], preserve_partitioning=[true] -07)------FilterExec: DynamicFilter [ empty ] +07)------FilterExec: Optional(DynamicFilter [ empty ]) 08)--------DataSourceExec: partitions=2, partition_sizes=[1, 1] query III diff --git a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt index 85f9549357138..ec08e3917211a 100644 --- a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt +++ 
b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt @@ -902,9 +902,6 @@ DROP TABLE t_struct_filter; statement ok set datafusion.execution.parquet.pushdown_filters = true; -statement ok -set datafusion.execution.parquet.reorder_filters = true; - statement ok COPY ( SELECT @@ -935,8 +932,5 @@ A 78 statement ok set datafusion.execution.parquet.pushdown_filters = false; -statement ok -set datafusion.execution.parquet.reorder_filters = false; - statement ok DROP TABLE dict_filter_bug; diff --git a/datafusion/sqllogictest/test_files/preserve_file_partitioning.slt b/datafusion/sqllogictest/test_files/preserve_file_partitioning.slt index 175d7d90cd8ed..a841143cb10ae 100644 --- a/datafusion/sqllogictest/test_files/preserve_file_partitioning.slt +++ b/datafusion/sqllogictest/test_files/preserve_file_partitioning.slt @@ -367,7 +367,7 @@ physical_plan 08)--------------FilterExec: service@2 = log 09)----------------RepartitionExec: partitioning=RoundRobinBatch(3), input_partitions=1 10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/dimension/data.parquet]]}, projection=[d_dkey, env, service], file_type=parquet, predicate=service@2 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] -11)------------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet]]}, projection=[value, f_dkey], output_ordering=[f_dkey@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] +11)------------DataSourceExec: file_groups={3 groups: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet]]}, projection=[value, f_dkey], output_ordering=[f_dkey@1 ASC NULLS LAST], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # Verify results without optimization query TTTIR rowsort @@ -418,7 +418,7 @@ physical_plan 06)----------FilterExec: service@2 = log 07)------------RepartitionExec: partitioning=RoundRobinBatch(3), input_partitions=1 08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/dimension/data.parquet]]}, projection=[d_dkey, env, service], file_type=parquet, predicate=service@2 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] -09)--------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet]]}, projection=[value, f_dkey], output_ordering=[f_dkey@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] +09)--------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet], 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet]]}, projection=[value, f_dkey], output_ordering=[f_dkey@1 ASC NULLS LAST], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) query TTTIR rowsort SELECT f.f_dkey, MAX(d.env), MAX(d.service), count(*), sum(f.value) @@ -643,7 +643,7 @@ physical_plan 05)--------RepartitionExec: partitioning=Hash([d_dkey@1], 3), input_partitions=3 06)----------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/dimension_partitioned/d_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/dimension_partitioned/d_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/dimension_partitioned/d_dkey=C/data.parquet]]}, projection=[env, d_dkey], file_type=parquet 07)--------RepartitionExec: partitioning=Hash([f_dkey@1], 3), input_partitions=3 -08)----------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet]]}, projection=[value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] +08)----------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet]]}, projection=[value, f_dkey], file_type=parquet, predicate=Optional(DynamicFilter [ 
empty ]) query TTR rowsort SELECT f.f_dkey, d.env, sum(f.value) diff --git a/datafusion/sqllogictest/test_files/projection_pushdown.slt b/datafusion/sqllogictest/test_files/projection_pushdown.slt index a40c1b8c7e246..7b43ec651a53b 100644 --- a/datafusion/sqllogictest/test_files/projection_pushdown.slt +++ b/datafusion/sqllogictest/test_files/projection_pushdown.slt @@ -637,7 +637,7 @@ SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 1 ORDER BY id LIMIT 2; # Config reset -# The SLT runner sets `target_partitions` to 4 instead of using the default, so +# The SLT runner sets `target_partitions` to 4 instead of using the default, so # reset it explicitly. statement ok SET datafusion.execution.target_partitions = 4; @@ -1424,7 +1424,7 @@ logical_plan physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(__datafusion_extracted_1@0, __datafusion_extracted_2 * Int64(10)@2)], projection=[id@1, id@3] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id, get_field(s@1, level) * 10 as __datafusion_extracted_2 * Int64(10)], file_type=parquet, predicate=DynamicFilter [ empty ] +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id, get_field(s@1, level) * 10 as __datafusion_extracted_2 * Int64(10)], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # Verify correctness - value = level * 10 # simple_struct: (1,100), (2,200), (3,150), (4,300), (5,250) @@ -1460,7 +1460,7 @@ physical_plan 01)HashJoinExec: 
mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)] 02)--FilterExec: __datafusion_extracted_1@0 > 150, projection=[id@1] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=get_field(s@1, value) > 150 -04)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ] +04)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # Verify correctness - id matches and value > 150 query II @@ -1500,7 +1500,7 @@ physical_plan 02)--FilterExec: __datafusion_extracted_1@0 > 100, projection=[id@1] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=get_field(s@1, value) > 100 04)--FilterExec: __datafusion_extracted_2@0 > 3, projection=[id@1] -05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id], file_type=parquet, predicate=get_field(s@1, level) > 3 AND DynamicFilter [ empty ] +05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id], file_type=parquet, predicate=get_field(s@1, level) > 3 AND Optional(DynamicFilter [ empty ]) # Verify correctness - id matches, value > 100, and 
level > 3 # Matching ids where value > 100: 2(200), 3(150), 4(300), 5(250) @@ -1536,7 +1536,7 @@ physical_plan 01)ProjectionExec: expr=[id@0 as id, __datafusion_extracted_1@1 as simple_struct.s[label], __datafusion_extracted_2@2 as join_right.s[role]] 02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@1, id@1)], projection=[id@1, __datafusion_extracted_1@0, __datafusion_extracted_2@2] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, label) as __datafusion_extracted_1, id], file_type=parquet -04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, role) as __datafusion_extracted_2, id], file_type=parquet, predicate=DynamicFilter [ empty ] +04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, role) as __datafusion_extracted_2, id], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # Verify correctness query ITT @@ -1568,7 +1568,7 @@ logical_plan physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id], file_type=parquet -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ] +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # Verify correctness query II @@ -1607,7 +1607,7 @@ 
physical_plan 02)--HashJoinExec: mode=CollectLeft, join_type=Left, on=[(id@1, id@0)], projection=[id@1, __datafusion_extracted_2@0, __datafusion_extracted_3@3] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_2, id], file_type=parquet 04)----FilterExec: __datafusion_extracted_1@0 > 5, projection=[id@1, __datafusion_extracted_3@2] -05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_1, id, get_field(s@1, level) as __datafusion_extracted_3], file_type=parquet, predicate=get_field(s@1, level) > 5 AND DynamicFilter [ empty ] +05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_1, id, get_field(s@1, level) as __datafusion_extracted_3], file_type=parquet, predicate=get_field(s@1, level) > 5 AND Optional(DynamicFilter [ empty ]) # Verify correctness - left join with level > 5 condition # Only join_right rows with level > 5 are matched: id=1 (level=10), id=4 (level=8) @@ -1899,7 +1899,7 @@ physical_plan 01)ProjectionExec: expr=[__datafusion_extracted_3@0 as s.s[value], __datafusion_extracted_4@1 as j.s[role]] 02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@2, id@2)], filter=__datafusion_extracted_1@1 > __datafusion_extracted_2@0, projection=[__datafusion_extracted_3@4, __datafusion_extracted_4@1] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, get_field(s@1, role) as __datafusion_extracted_4, id], file_type=parquet 
-04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, value) as __datafusion_extracted_3, id], file_type=parquet, predicate=DynamicFilter [ empty ] +04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, value) as __datafusion_extracted_3, id], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # Verify correctness - only admin roles match (ids 1 and 4) query II @@ -1935,7 +1935,7 @@ logical_plan physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@1, id@1)], filter=__datafusion_extracted_1@0 > __datafusion_extracted_2@1, projection=[id@1, id@3] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id], file_type=parquet, predicate=DynamicFilter [ empty ] +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # Verify correctness - all rows match since value >> level for all ids # simple_struct: (1,100), (2,200), (3,150), (4,300), (5,250) diff --git a/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt b/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt index 
8469c32a17033..3800172f6bad8 100644 --- a/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt +++ b/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt @@ -158,7 +158,7 @@ physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(k@0, k@0)] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/small_table.parquet]]}, projection=[k], file_type=parquet 03)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/large_table.parquet]]}, projection=[k, v], file_type=parquet, predicate=v@1 >= 50 AND DynamicFilter [ empty ], pruning_predicate=v_null_count@1 != row_count@2 AND v_max@0 >= 50, required_guarantees=[] +04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/large_table.parquet]]}, projection=[k, v], file_type=parquet, predicate=v@1 >= 50 AND Optional(DynamicFilter [ empty ]), pruning_predicate=v_null_count@1 != row_count@2 AND v_max@0 >= 50, required_guarantees=[] statement ok drop table small_table; @@ -206,7 +206,7 @@ EXPLAIN ANALYZE SELECT t FROM topk_pushdown ORDER BY t * t LIMIT 10; ---- Plan with Metrics 01)SortExec: TopK(fetch=10), expr=[t@0 * t@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[t@0 * t@0 < 1884329474306198481], metrics=[output_rows=10, output_batches=1, row_replacements=10] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_pushdown.parquet]]}, projection=[t], output_ordering=[t@0 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ t@0 * t@0 < 1884329474306198481 ], metrics=[output_rows=128, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=782 total → 782 matched, 
row_groups_pruned_bloom_filter=782 total → 782 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=128, pushdown_rows_pruned=99.87 K, predicate_cache_inner_records=128, predicate_cache_records=128, scan_efficiency_ratio=64.87% (258.7 K/398.8 K)] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_pushdown.parquet]]}, projection=[t], output_ordering=[t@0 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ t@0 * t@0 < 1884329474306198481 ], metrics=[output_rows=128, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=782 total → 782 matched, row_groups_pruned_bloom_filter=782 total → 782 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=64.87% (258.7 K/398.8 K)] statement ok reset datafusion.explain.analyze_categories; @@ -268,7 +268,7 @@ EXPLAIN ANALYZE SELECT * FROM topk_single_col ORDER BY b DESC LIMIT 1; ---- Plan with Metrics 01)SortExec: TopK(fetch=1), expr=[b@1 DESC], preserve_partitioning=[false], filter=[b@1 IS NULL OR b@1 > bd], metrics=[output_rows=1, output_batches=1, row_replacements=1] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_single_col.parquet]]}, projection=[a, b, c], file_type=parquet, 
predicate=DynamicFilter [ b@1 IS NULL OR b@1 > bd ], pruning_predicate=b_null_count@0 > 0 OR b_null_count@0 != row_count@2 AND b_max@1 > bd, required_guarantees=[], metrics=[output_rows=4, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=4, pushdown_rows_pruned=0, predicate_cache_inner_records=4, predicate_cache_records=4, scan_efficiency_ratio=22.37% (240/1.07 K)] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_single_col.parquet]]}, projection=[a, b, c], file_type=parquet, predicate=DynamicFilter [ b@1 IS NULL OR b@1 > bd ], pruning_predicate=b_null_count@0 > 0 OR b_null_count@0 != row_count@2 AND b_max@1 > bd, required_guarantees=[], metrics=[output_rows=4, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=22.37% (240/1.07 K)] statement ok reset datafusion.explain.analyze_categories; @@ -319,7 +319,7 @@ EXPLAIN ANALYZE SELECT * FROM topk_multi_col ORDER BY b ASC NULLS LAST, a DESC L ---- Plan with Metrics 01)SortExec: 
TopK(fetch=2), expr=[b@1 ASC NULLS LAST, a@0 DESC], preserve_partitioning=[false], filter=[b@1 < bb OR b@1 = bb AND (a@0 IS NULL OR a@0 > ac)], metrics=[output_rows=2, output_batches=1, row_replacements=2] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_multi_col.parquet]]}, projection=[a, b, c], file_type=parquet, predicate=DynamicFilter [ b@1 < bb OR b@1 = bb AND (a@0 IS NULL OR a@0 > ac) ], pruning_predicate=b_null_count@1 != row_count@2 AND b_min@0 < bb OR b_null_count@1 != row_count@2 AND b_min@0 <= bb AND bb <= b_max@3 AND (a_null_count@4 > 0 OR a_null_count@4 != row_count@2 AND a_max@5 > ac), required_guarantees=[], metrics=[output_rows=4, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=4, pushdown_rows_pruned=0, predicate_cache_inner_records=8, predicate_cache_records=8, scan_efficiency_ratio=22.37% (240/1.07 K)] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_multi_col.parquet]]}, projection=[a, b, c], file_type=parquet, predicate=DynamicFilter [ b@1 < bb OR b@1 = bb AND (a@0 IS NULL OR a@0 > ac) ], pruning_predicate=b_null_count@1 != row_count@2 AND b_min@0 < bb OR b_null_count@1 != row_count@2 AND b_min@0 <= bb AND bb <= b_max@3 AND (a_null_count@4 > 0 OR a_null_count@4 != row_count@2 AND a_max@5 > ac), required_guarantees=[], metrics=[output_rows=4, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 
1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=22.37% (240/1.07 K)] statement ok reset datafusion.explain.analyze_categories; @@ -389,7 +389,7 @@ FROM join_probe p INNER JOIN join_build AS build Plan with Metrics 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)], projection=[a@3, b@4, c@2, e@5], metrics=[output_rows=2, output_batches=1, array_map_created_count=0, build_input_batches=1, build_input_rows=2, input_batches=1, input_rows=2, avg_fanout=100% (2/2), probe_hit_rate=100% (2/2)] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/join_build.parquet]]}, projection=[a, b, c], file_type=parquet, metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=20.48% (214/1.04 K)] -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/join_probe.parquet]]}, projection=[a, b, e], file_type=parquet, 
predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ], pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 >= aa AND a_null_count@1 != row_count@2 AND a_min@3 <= ab AND b_null_count@5 != row_count@2 AND b_max@4 >= ba AND b_null_count@5 != row_count@2 AND b_min@6 <= bb, required_guarantees=[], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=2, pushdown_rows_pruned=2, predicate_cache_inner_records=8, predicate_cache_records=4, scan_efficiency_ratio=22.78% (246/1.08 K)] +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/join_probe.parquet]]}, projection=[a, b, e], file_type=parquet, predicate=Optional(DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ]), pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 >= aa AND a_null_count@1 != row_count@2 AND a_min@3 <= ab AND b_null_count@5 != row_count@2 AND b_max@4 >= ba AND b_null_count@5 != row_count@2 AND b_min@6 <= bb, required_guarantees=[], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, 
file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=22.78% (246/1.08 K)] statement ok reset datafusion.explain.analyze_categories; @@ -475,8 +475,8 @@ Plan with Metrics 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(c@3, d@0)], metrics=[output_rows=2, output_batches=1, array_map_created_count=0, build_input_batches=1, build_input_rows=2, input_batches=1, input_rows=2, avg_fanout=100% (2/2), probe_hit_rate=100% (2/2)] 02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, b@0)], metrics=[output_rows=2, output_batches=1, array_map_created_count=0, build_input_batches=1, build_input_rows=2, input_batches=1, input_rows=2, avg_fanout=100% (2/2), probe_hit_rate=100% (2/2)] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/nested_t1.parquet]]}, projection=[a, x], file_type=parquet, metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=18.23% (144/790)] -04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/nested_t2.parquet]]}, projection=[b, c, y], file_type=parquet, predicate=DynamicFilter [ b@0 >= aa AND b@0 <= ab AND b@0 IN (SET) ([aa, ab]) ], 
pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 >= aa AND b_null_count@1 != row_count@2 AND b_min@3 <= ab AND (b_null_count@1 != row_count@2 AND b_min@3 <= aa AND aa <= b_max@0 OR b_null_count@1 != row_count@2 AND b_min@3 <= ab AND ab <= b_max@0), required_guarantees=[b in (aa, ab)], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=1 total → 1 matched, page_index_rows_pruned=5 total → 5 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=2, pushdown_rows_pruned=3, predicate_cache_inner_records=5, predicate_cache_records=2, scan_efficiency_ratio=23.2% (252/1.09 K)] -05)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/nested_t3.parquet]]}, projection=[d, z], file_type=parquet, predicate=DynamicFilter [ d@0 >= ca AND d@0 <= cb AND hash_lookup ], pruning_predicate=d_null_count@1 != row_count@2 AND d_max@0 >= ca AND d_null_count@1 != row_count@2 AND d_min@3 <= cb, required_guarantees=[], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=1 total → 1 matched, page_index_rows_pruned=8 total → 8 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=2, pushdown_rows_pruned=6, predicate_cache_inner_records=8, predicate_cache_records=2, scan_efficiency_ratio=22.12% (184/832)] +04)----DataSourceExec: 
file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/nested_t2.parquet]]}, projection=[b, c, y], file_type=parquet, predicate=Optional(DynamicFilter [ b@0 >= aa AND b@0 <= ab AND b@0 IN (SET) ([aa, ab]) ]), pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 >= aa AND b_null_count@1 != row_count@2 AND b_min@3 <= ab AND (b_null_count@1 != row_count@2 AND b_min@3 <= aa AND aa <= b_max@0 OR b_null_count@1 != row_count@2 AND b_min@3 <= ab AND ab <= b_max@0), required_guarantees=[b in (aa, ab)], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=1 total → 1 matched, page_index_rows_pruned=5 total → 5 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=23.2% (252/1.09 K)] +05)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/nested_t3.parquet]]}, projection=[d, z], file_type=parquet, predicate=Optional(DynamicFilter [ d@0 >= ca AND d@0 <= cb AND hash_lookup ]), pruning_predicate=d_null_count@1 != row_count@2 AND d_max@0 >= ca AND d_null_count@1 != row_count@2 AND d_min@3 <= cb, required_guarantees=[], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=1 total → 1 matched, page_index_rows_pruned=8 total → 8 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, 
files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=22.12% (184/832)] statement ok reset datafusion.explain.analyze_categories; @@ -541,7 +541,7 @@ physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, d@0)] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/parent_build.parquet]]}, projection=[a, b, c], file_type=parquet, predicate=a@0 = aa, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= aa AND aa <= a_max@1, required_guarantees=[a in (aa)] 03)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/parent_probe.parquet]]}, projection=[d, e, f], file_type=parquet, predicate=e@1 = ba AND d@0 = aa AND DynamicFilter [ empty ], pruning_predicate=e_null_count@2 != row_count@3 AND e_min@0 <= ba AND ba <= e_max@1 AND d_null_count@6 != row_count@3 AND d_min@4 <= aa AND aa <= d_max@5, required_guarantees=[d in (aa), e in (ba)] +04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/parent_probe.parquet]]}, projection=[d, e, f], file_type=parquet, predicate=e@1 = ba AND d@0 = aa AND Optional(DynamicFilter [ empty ]), pruning_predicate=e_null_count@2 != row_count@3 AND e_min@0 <= ba AND ba <= e_max@1 AND d_null_count@6 != row_count@3 AND d_min@4 <= aa AND aa <= d_max@5, required_guarantees=[d in (aa), e in (ba)] statement ok drop table parent_build; @@ -606,7 +606,7 @@ Plan with Metrics 01)SortExec: TopK(fetch=2), expr=[e@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[e@0 < bb], metrics=[output_rows=2, output_batches=1, row_replacements=2] 02)--HashJoinExec: 
mode=CollectLeft, join_type=Inner, on=[(a@0, d@0)], projection=[e@2], metrics=[output_rows=2, output_batches=1, array_map_created_count=0, build_input_batches=1, build_input_rows=2, input_batches=1, input_rows=2, avg_fanout=100% (2/2), probe_hit_rate=100% (2/2)] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_join_build.parquet]]}, projection=[a], file_type=parquet, metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=6.7% (70/1.04 K)] -04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_join_probe.parquet]]}, projection=[d, e], file_type=parquet, predicate=DynamicFilter [ d@0 >= aa AND d@0 <= ab AND d@0 IN (SET) ([aa, ab]) ] AND DynamicFilter [ e@1 < bb ], pruning_predicate=d_null_count@1 != row_count@2 AND d_max@0 >= aa AND d_null_count@1 != row_count@2 AND d_min@3 <= ab AND (d_null_count@1 != row_count@2 AND d_min@3 <= aa AND aa <= d_max@0 OR d_null_count@1 != row_count@2 AND d_min@3 <= ab AND ab <= d_max@0) AND e_null_count@5 != row_count@2 AND e_min@4 < bb, required_guarantees=[d in (aa, ab)], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=1 total → 1 matched, 
page_index_rows_pruned=4 total → 4 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=2, pushdown_rows_pruned=2, predicate_cache_inner_records=8, predicate_cache_records=4, scan_efficiency_ratio=15.37% (166/1.08 K)] +04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_join_probe.parquet]]}, projection=[d, e], file_type=parquet, predicate=Optional(DynamicFilter [ d@0 >= aa AND d@0 <= ab AND d@0 IN (SET) ([aa, ab]) ]) AND DynamicFilter [ e@1 < bb ], pruning_predicate=d_null_count@1 != row_count@2 AND d_max@0 >= aa AND d_null_count@1 != row_count@2 AND d_min@3 <= ab AND (d_null_count@1 != row_count@2 AND d_min@3 <= aa AND aa <= d_max@0 OR d_null_count@1 != row_count@2 AND d_min@3 <= ab AND ab <= d_max@0) AND e_null_count@5 != row_count@2 AND e_min@4 < bb, required_guarantees=[d in (aa, ab)], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=1 total → 1 matched, page_index_rows_pruned=4 total → 4 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=15.37% (166/1.08 K)] statement ok reset datafusion.explain.analyze_categories; @@ -655,7 +655,7 @@ EXPLAIN ANALYZE SELECT b, a FROM topk_proj ORDER BY a LIMIT 2; ---- Plan with Metrics 01)SortExec: TopK(fetch=2), expr=[a@1 ASC NULLS LAST], preserve_partitioning=[false], filter=[a@1 < 2], metrics=[output_rows=2, 
output_batches=1, row_replacements=2] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_proj.parquet]]}, projection=[b, a], file_type=parquet, predicate=DynamicFilter [ a@0 < 2 ], pruning_predicate=a_null_count@1 != row_count@2 AND a_min@0 < 2, required_guarantees=[], metrics=[output_rows=3, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=3, pushdown_rows_pruned=0, predicate_cache_inner_records=3, predicate_cache_records=3, scan_efficiency_ratio=13.72% (153/1.11 K)] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_proj.parquet]]}, projection=[b, a], file_type=parquet, predicate=DynamicFilter [ a@0 < 2 ], pruning_predicate=a_null_count@1 != row_count@2 AND a_min@0 < 2, required_guarantees=[], metrics=[output_rows=3, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=13.72% (153/1.11 K)] # Case 2: prune — `SELECT a` — filter stays as `a < 2` 
on the scan. query TT @@ -663,7 +663,7 @@ EXPLAIN ANALYZE SELECT a FROM topk_proj ORDER BY a LIMIT 2; ---- Plan with Metrics 01)SortExec: TopK(fetch=2), expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[a@0 < 2], metrics=[output_rows=2, output_batches=1, row_replacements=2] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_proj.parquet]]}, projection=[a], file_type=parquet, predicate=DynamicFilter [ a@0 < 2 ], pruning_predicate=a_null_count@1 != row_count@2 AND a_min@0 < 2, required_guarantees=[], metrics=[output_rows=3, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=3, pushdown_rows_pruned=0, predicate_cache_inner_records=3, predicate_cache_records=3, scan_efficiency_ratio=7.09% (79/1.11 K)] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_proj.parquet]]}, projection=[a], file_type=parquet, predicate=DynamicFilter [ a@0 < 2 ], pruning_predicate=a_null_count@1 != row_count@2 AND a_min@0 < 2, required_guarantees=[], metrics=[output_rows=3, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, 
num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=7.09% (79/1.11 K)] # Case 3: expression — `SELECT a+1 AS a_plus_1` — the TopK filter is on # `a_plus_1`, the scan predicate must read `a@0 + 1`. @@ -672,7 +672,7 @@ EXPLAIN ANALYZE SELECT a + 1 AS a_plus_1, b FROM topk_proj ORDER BY a_plus_1 LIM ---- Plan with Metrics 01)SortExec: TopK(fetch=2), expr=[a_plus_1@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[a_plus_1@0 < 3], metrics=[output_rows=2, output_batches=1, row_replacements=2] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_proj.parquet]]}, projection=[CAST(a@0 AS Int64) + 1 as a_plus_1, b], file_type=parquet, predicate=DynamicFilter [ CAST(a@0 AS Int64) + 1 < 3 ], metrics=[output_rows=3, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=3, pushdown_rows_pruned=0, predicate_cache_inner_records=3, predicate_cache_records=3, scan_efficiency_ratio=13.72% (153/1.11 K)] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_proj.parquet]]}, projection=[CAST(a@0 AS Int64) + 1 as a_plus_1, b], file_type=parquet, predicate=DynamicFilter [ CAST(a@0 AS Int64) + 1 < 3 ], metrics=[output_rows=3, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, 
row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=13.72% (153/1.11 K)] # Case 4: alias shadowing — `SELECT a+1 AS a` — the projection renames # `a+1` to `a`, so the TopK's `a < 3` must still be rewritten to @@ -682,7 +682,7 @@ EXPLAIN ANALYZE SELECT a + 1 AS a, b FROM topk_proj ORDER BY a LIMIT 2; ---- Plan with Metrics 01)SortExec: TopK(fetch=2), expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[a@0 < 3], metrics=[output_rows=2, output_batches=1, row_replacements=2] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_proj.parquet]]}, projection=[CAST(a@0 AS Int64) + 1 as a, b], file_type=parquet, predicate=DynamicFilter [ CAST(a@0 AS Int64) + 1 < 3 ], metrics=[output_rows=3, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=3, pushdown_rows_pruned=0, predicate_cache_inner_records=3, predicate_cache_records=3, scan_efficiency_ratio=13.72% (153/1.11 K)] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/topk_proj.parquet]]}, projection=[CAST(a@0 AS 
Int64) + 1 as a, b], file_type=parquet, predicate=DynamicFilter [ CAST(a@0 AS Int64) + 1 < 3 ], metrics=[output_rows=3, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=13.72% (153/1.11 K)] statement ok reset datafusion.explain.analyze_categories; @@ -744,7 +744,7 @@ Plan with Metrics 04)----AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[min(join_agg_probe.value)], metrics=[output_rows=2, output_batches=2, spill_count=0, spilled_rows=0] 05)------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1, metrics=[output_rows=2, output_batches=2, spill_count=0, spilled_rows=0] 06)--------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[min(join_agg_probe.value)], metrics=[output_rows=2, output_batches=1, spill_count=0, spilled_rows=0, skipped_aggregation_rows=0, reduction_factor=100% (2/2)] -07)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/join_agg_probe.parquet]]}, projection=[a, value], file_type=parquet, predicate=DynamicFilter [ a@0 >= h1 AND a@0 <= h2 AND a@0 IN (SET) ([h1, h2]) ], pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 >= h1 AND a_null_count@1 != row_count@2 AND a_min@3 <= h2 AND (a_null_count@1 != row_count@2 AND a_min@3 <= h1 AND h1 <= a_max@0 OR a_null_count@1 != row_count@2 AND a_min@3 <= h2 AND h2 <= a_max@0), required_guarantees=[a in (h1, h2)], metrics=[output_rows=2, output_batches=1, 
files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=1 total → 1 matched, page_index_rows_pruned=4 total → 4 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=2, pushdown_rows_pruned=2, predicate_cache_inner_records=4, predicate_cache_records=2, scan_efficiency_ratio=19.81% (163/823)] +07)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/join_agg_probe.parquet]]}, projection=[a, value], file_type=parquet, predicate=Optional(DynamicFilter [ a@0 >= h1 AND a@0 <= h2 AND a@0 IN (SET) ([h1, h2]) ]), pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 >= h1 AND a_null_count@1 != row_count@2 AND a_min@3 <= h2 AND (a_null_count@1 != row_count@2 AND a_min@3 <= h1 AND h1 <= a_max@0 OR a_null_count@1 != row_count@2 AND a_min@3 <= h2 AND h2 <= a_max@0), required_guarantees=[a in (h1, h2)], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=1 total → 1 matched, page_index_rows_pruned=4 total → 4 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=19.81% (163/823)] statement ok reset datafusion.explain.analyze_categories; @@ -807,7 +807,7 @@ ON nulls_build.a = nulls_probe.a AND nulls_build.b = nulls_probe.b; Plan with Metrics 
01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)], metrics=[output_rows=1, output_batches=1, array_map_created_count=0, build_input_batches=1, build_input_rows=3, input_batches=1, input_rows=1, avg_fanout=100% (1/1), probe_hit_rate=100% (1/1)] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/nulls_build.parquet]]}, projection=[a, b], file_type=parquet, metrics=[output_rows=3, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=18.6% (144/774)] -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/nulls_probe.parquet]]}, projection=[a, b, c], file_type=parquet, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= 1 AND b@1 <= 2 AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:1}, {c0:,c1:2}, {c0:ab,c1:}]) ], pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 >= aa AND a_null_count@1 != row_count@2 AND a_min@3 <= ab AND b_null_count@5 != row_count@2 AND b_max@4 >= 1 AND b_null_count@5 != row_count@2 AND b_min@6 <= 2, required_guarantees=[], metrics=[output_rows=1, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 
total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=1, pushdown_rows_pruned=3, predicate_cache_inner_records=8, predicate_cache_records=2, scan_efficiency_ratio=21.1% (237/1.12 K)] +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/nulls_probe.parquet]]}, projection=[a, b, c], file_type=parquet, predicate=Optional(DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= 1 AND b@1 <= 2 AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:1}, {c0:,c1:2}, {c0:ab,c1:}]) ]), pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 >= aa AND a_null_count@1 != row_count@2 AND a_min@3 <= ab AND b_null_count@5 != row_count@2 AND b_max@4 >= 1 AND b_null_count@5 != row_count@2 AND b_min@6 <= 2, required_guarantees=[], metrics=[output_rows=1, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=21.1% (237/1.12 K)] statement ok reset datafusion.explain.analyze_categories; @@ -873,7 +873,7 @@ ON lj_build.a = lj_probe.a AND lj_build.b = lj_probe.b; Plan with Metrics 01)HashJoinExec: mode=CollectLeft, join_type=Left, on=[(a@0, a@0), (b@1, b@1)], metrics=[output_rows=2, output_batches=1, array_map_created_count=0, build_input_batches=1, build_input_rows=2, input_batches=2, input_rows=2, avg_fanout=100% (2/2), probe_hit_rate=100% (2/2)] 
02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/lj_build.parquet]]}, projection=[a, b, c], file_type=parquet, metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=20.48% (214/1.04 K)] -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/lj_probe.parquet]]}, projection=[a, b, e], file_type=parquet, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ], pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 >= aa AND a_null_count@1 != row_count@2 AND a_min@3 <= ab AND b_null_count@5 != row_count@2 AND b_max@4 >= ba AND b_null_count@5 != row_count@2 AND b_min@6 <= bb, required_guarantees=[], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=2, pushdown_rows_pruned=2, predicate_cache_inner_records=8, 
predicate_cache_records=4, scan_efficiency_ratio=22.78% (246/1.08 K)] +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/lj_probe.parquet]]}, projection=[a, b, e], file_type=parquet, predicate=Optional(DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ]), pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 >= aa AND a_null_count@1 != row_count@2 AND a_min@3 <= ab AND b_null_count@5 != row_count@2 AND b_max@4 >= ba AND b_null_count@5 != row_count@2 AND b_min@6 <= bb, required_guarantees=[], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=22.78% (246/1.08 K)] # LEFT SEMI JOIN: only matching build rows are returned; probe scan still # receives the dynamic filter. 
@@ -889,7 +889,7 @@ WHERE EXISTS ( Plan with Metrics 01)HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(a@0, a@0), (b@1, b@1)], metrics=[output_rows=2, output_batches=1, array_map_created_count=0, build_input_batches=1, build_input_rows=2, input_batches=2, input_rows=4, avg_fanout=100% (2/2), probe_hit_rate=100% (2/2)] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/lj_build.parquet]]}, projection=[a, b, c], file_type=parquet, metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=20.48% (214/1.04 K)] -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/lj_probe.parquet]]}, projection=[a, b], file_type=parquet, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ], pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 >= aa AND a_null_count@1 != row_count@2 AND a_min@3 <= ab AND b_null_count@5 != row_count@2 AND b_max@4 >= ba AND b_null_count@5 != row_count@2 AND b_min@6 <= bb, required_guarantees=[], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, 
page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=2, pushdown_rows_pruned=2, predicate_cache_inner_records=8, predicate_cache_records=4, scan_efficiency_ratio=15.37% (166/1.08 K)] +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/lj_probe.parquet]]}, projection=[a, b], file_type=parquet, predicate=Optional(DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ]), pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 >= aa AND a_null_count@1 != row_count@2 AND a_min@3 <= ab AND b_null_count@5 != row_count@2 AND b_max@4 >= ba AND b_null_count@5 != row_count@2 AND b_min@6 <= bb, required_guarantees=[], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=15.37% (166/1.08 K)] statement ok reset datafusion.explain.analyze_categories; @@ -959,7 +959,7 @@ FROM hl_probe p INNER JOIN hl_build AS build Plan with Metrics 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)], projection=[a@3, b@4, c@2, e@5], metrics=[output_rows=2, output_batches=1, array_map_created_count=0, build_input_batches=1, build_input_rows=2, 
input_batches=1, input_rows=2, avg_fanout=100% (2/2), probe_hit_rate=100% (2/2)] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/hl_build.parquet]]}, projection=[a, b, c], file_type=parquet, metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=20.48% (214/1.04 K)] -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/hl_probe.parquet]]}, projection=[a, b, e], file_type=parquet, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND hash_lookup ], pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 >= aa AND a_null_count@1 != row_count@2 AND a_min@3 <= ab AND b_null_count@5 != row_count@2 AND b_max@4 >= ba AND b_null_count@5 != row_count@2 AND b_min@6 <= bb, required_guarantees=[], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=2, pushdown_rows_pruned=2, 
predicate_cache_inner_records=8, predicate_cache_records=4, scan_efficiency_ratio=22.78% (246/1.08 K)] +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/hl_probe.parquet]]}, projection=[a, b, e], file_type=parquet, predicate=Optional(DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND hash_lookup ]), pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 >= aa AND a_null_count@1 != row_count@2 AND a_min@3 <= ab AND b_null_count@5 != row_count@2 AND b_max@4 >= ba AND b_null_count@5 != row_count@2 AND b_min@6 <= bb, required_guarantees=[], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=22.78% (246/1.08 K)] statement ok drop table hl_build; @@ -1008,7 +1008,7 @@ FROM int_build b INNER JOIN int_probe p Plan with Metrics 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id1@0, id1@0), (id2@1, id2@1)], projection=[id1@0, id2@1, value@2, data@5], metrics=[output_rows=2, output_batches=1, array_map_created_count=0, build_input_batches=1, build_input_rows=2, input_batches=1, input_rows=2, avg_fanout=100% (2/2), probe_hit_rate=100% (2/2)] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/int_build.parquet]]}, projection=[id1, id2, value], file_type=parquet, metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 
total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=19.02% (222/1.17 K)] -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/int_probe.parquet]]}, projection=[id1, id2, data], file_type=parquet, predicate=DynamicFilter [ id1@0 >= 1 AND id1@0 <= 2 AND id2@1 >= 10 AND id2@1 <= 20 AND hash_lookup ], pruning_predicate=id1_null_count@1 != row_count@2 AND id1_max@0 >= 1 AND id1_null_count@1 != row_count@2 AND id1_min@3 <= 2 AND id2_null_count@5 != row_count@2 AND id2_max@4 >= 10 AND id2_null_count@5 != row_count@2 AND id2_min@6 <= 20, required_guarantees=[], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=2, pushdown_rows_pruned=2, predicate_cache_inner_records=8, predicate_cache_records=4, scan_efficiency_ratio=21.43% (239/1.11 K)] +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/int_probe.parquet]]}, projection=[id1, id2, data], file_type=parquet, predicate=Optional(DynamicFilter [ 
id1@0 >= 1 AND id1@0 <= 2 AND id2@1 >= 10 AND id2@1 <= 20 AND hash_lookup ]), pruning_predicate=id1_null_count@1 != row_count@2 AND id1_max@0 >= 1 AND id1_null_count@1 != row_count@2 AND id1_min@3 <= 2 AND id2_null_count@5 != row_count@2 AND id2_max@4 >= 10 AND id2_null_count@5 != row_count@2 AND id2_min@6 <= 20, required_guarantees=[], metrics=[output_rows=2, output_batches=1, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=0 total → 0 matched, page_index_rows_pruned=0 total → 0 matched, limit_pruned_row_groups=0 total → 0 matched, batches_split=0, file_open_errors=0, file_scan_errors=0, files_opened=1, files_processed=1, num_predicate_creation_errors=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, predicate_cache_inner_records=0, predicate_cache_records=0, scan_efficiency_ratio=21.43% (239/1.11 K)] statement ok reset datafusion.explain.analyze_categories; diff --git a/datafusion/sqllogictest/test_files/repartition_subset_satisfaction.slt b/datafusion/sqllogictest/test_files/repartition_subset_satisfaction.slt index dbf31dec5e118..a83cb1113715d 100644 --- a/datafusion/sqllogictest/test_files/repartition_subset_satisfaction.slt +++ b/datafusion/sqllogictest/test_files/repartition_subset_satisfaction.slt @@ -380,7 +380,7 @@ physical_plan 12)----------------------CoalescePartitionsExec 13)------------------------FilterExec: service@1 = log, projection=[env@0, d_dkey@2] 14)--------------------------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=A/data.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=D/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=B/data.parquet], 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=C/data.parquet]]}, projection=[env, service, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] -15)----------------------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=C/data.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] +15)----------------------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=C/data.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # Verify results without subset satisfaction query TPR rowsort @@ -475,7 +475,7 @@ physical_plan 10)------------------CoalescePartitionsExec 11)--------------------FilterExec: service@1 = log, projection=[env@0, d_dkey@2] 12)----------------------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=A/data.parquet, 
WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=D/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=C/data.parquet]]}, projection=[env, service, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] -13)------------------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=C/data.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] +13)------------------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=C/data.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # Verify results match with subset satisfaction query TPR rowsort @@ -517,7 +517,7 @@ prod 2023-01-01T09:12:30 197.7 # Config reset -# The SLT runner sets `target_partitions` to 4 instead of using the default, so 
+# The SLT runner sets `target_partitions` to 4 instead of using the default, so # reset it explicitly. statement ok set datafusion.execution.target_partitions = 4; diff --git a/datafusion/sqllogictest/test_files/statistics_registry.slt b/datafusion/sqllogictest/test_files/statistics_registry.slt index 6baa4e218ed20..596f31309882d 100644 --- a/datafusion/sqllogictest/test_files/statistics_registry.slt +++ b/datafusion/sqllogictest/test_files/statistics_registry.slt @@ -104,9 +104,9 @@ physical_plan 02)--RepartitionExec: partitioning=Hash([small_id@2], 4), input_partitions=1 03)----HashJoinExec: mode=Partitioned, join_type=Inner, on=[(customer_id@0, customer_id@1)], projection=[region_id@1, order_id@2, small_id@4] 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/statistics_registry/customers.parquet]]}, projection=[customer_id, region_id], file_type=parquet -05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/statistics_registry/orders.parquet]]}, projection=[order_id, customer_id, small_id], file_type=parquet, predicate=DynamicFilter [ empty ] +05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/statistics_registry/orders.parquet]]}, projection=[order_id, customer_id, small_id], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) 06)--RepartitionExec: partitioning=Hash([small_id@0], 4), input_partitions=1 -07)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/statistics_registry/dim_small.parquet]]}, projection=[small_id, label], file_type=parquet, predicate=DynamicFilter [ empty ] +07)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/statistics_registry/dim_small.parquet]]}, projection=[small_id, label], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) # -- With registry 
----------------------------------------------------------- # Conservative estimate 100 > 50: dim_small correctly swapped to build side @@ -127,7 +127,7 @@ physical_plan 04)--RepartitionExec: partitioning=Hash([small_id@2], 4), input_partitions=1 05)----HashJoinExec: mode=Partitioned, join_type=Inner, on=[(customer_id@0, customer_id@1)], projection=[region_id@1, order_id@2, small_id@4] 06)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/statistics_registry/customers.parquet]]}, projection=[customer_id, region_id], file_type=parquet -07)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/statistics_registry/orders.parquet]]}, projection=[order_id, customer_id, small_id], file_type=parquet, predicate=DynamicFilter [ empty ] AND DynamicFilter [ empty ] +07)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/statistics_registry/orders.parquet]]}, projection=[order_id, customer_id, small_id], file_type=parquet, predicate=Optional(DynamicFilter [ empty ]) AND Optional(DynamicFilter [ empty ]) # -- Verify results are identical regardless of join order -------------------- diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 49c9eea29ef73..4fbb403cadd47 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -85,13 +85,15 @@ The following configuration settings are available: | datafusion.execution.parquet.skip_metadata | true | (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. 
This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata | | datafusion.execution.parquet.metadata_size_hint | 524288 | (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer Default setting to 512 KiB, which should be sufficient for most parquet files, it can reduce one I/O operation per parquet file. If the metadata is larger than the hint, two reads will still be performed. | | datafusion.execution.parquet.pushdown_filters | false | (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization". | -| datafusion.execution.parquet.reorder_filters | false | (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query | | datafusion.execution.parquet.force_filter_selections | false | (reading) Force the use of RowSelections for filter results, when pushdown_filters is enabled. If false, the reader will automatically choose between a RowSelection and a Bitmap based on the number and pattern of selected rows. | | datafusion.execution.parquet.schema_force_view_types | true | (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`. | | datafusion.execution.parquet.binary_as_string | false | (reading) If true, parquet reader will read columns of `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`. 
Parquet files generated by some legacy writers do not correctly set the UTF8 flag for strings, causing string columns to be loaded as BLOB instead. | | datafusion.execution.parquet.coerce_int96 | NULL | (reading) If true, parquet reader will read columns of physical type int96 as originating from a different resolution than nanosecond. This is useful for reading data from systems like Spark which stores microsecond resolution timestamps in an int96 allowing it to write values with a larger date range than 64-bit timestamps with nanosecond resolution. | | datafusion.execution.parquet.bloom_filter_on_read | true | (reading) Use any available bloom filters when reading parquet files | | datafusion.execution.parquet.max_predicate_cache_size | NULL | (reading) The maximum predicate cache size, in bytes. When `pushdown_filters` is enabled, sets the maximum memory used to cache the results of predicate evaluation between filter evaluation and output generation. Decreasing this value will reduce memory usage, but may increase IO and CPU usage. None means use the default parquet reader setting. 0 means no caching. | +| datafusion.execution.parquet.filter_pushdown_min_bytes_per_sec | 104857600 | (reading) Minimum bytes/sec throughput for adaptive filter pushdown. Filters that achieve at least this throughput (bytes_saved / eval_time) are promoted to row filters. f64::INFINITY = no filters promoted (feature disabled). 0.0 = all filters pushed as row filters (no adaptive logic). Default: 104,857,600 bytes/sec (100 MiB/sec), empirically chosen based on TPC-H, TPC-DS, and ClickBench benchmarks on an m4 MacBook Pro. The optimal value for this setting likely depends on the relative cost of CPU vs. IO in your environment, and to some extent the shape of your query. **Interaction with `pushdown_filters`:** This option only takes effect when `pushdown_filters = true`. When pushdown is disabled, all filters run post-scan and this threshold is ignored. 
| +| datafusion.execution.parquet.filter_collecting_byte_ratio_threshold | 0.2 | (reading) Byte-ratio threshold for applying filters one at a time (iterative pruning; aka row-level) vs. all at once (post-scan). The ratio is computed as the extra filter-column bytes not already in the projection divided by the projected bytes (`filter_compressed_bytes / projection_compressed_bytes`). Filters below this ratio start as row-level filters (enabling late materialization); those above start as post-scan filters. Default: 0.20 — filters whose columns are less than 20% of the projection bytes start at row-level. **Interaction with `pushdown_filters`:** Only takes effect when `pushdown_filters = true`. | +| datafusion.execution.parquet.filter_confidence_z | 2 | (reading) Z-score for confidence intervals on filter effectiveness. Controls how much statistical evidence is required before promoting or demoting a filter. Lower values = faster decisions with less confidence. Higher values = more conservative, requiring more data. Default: 2.0 (~95% confidence). **Interaction with `pushdown_filters`:** Only takes effect when `pushdown_filters = true`.
| | datafusion.execution.parquet.data_pagesize_limit | 1048576 | (writing) Sets best effort maximum size of data page in bytes | | datafusion.execution.parquet.write_batch_size | 1024 | (writing) Sets write_batch_size in rows | | datafusion.execution.parquet.writer_version | 1.0 | (writing) Sets parquet writer version valid values are "1.0" and "2.0" | diff --git a/docs/source/user-guide/sql/format_options.md b/docs/source/user-guide/sql/format_options.md index 338508031413c..3f4b8bc3f184b 100644 --- a/docs/source/user-guide/sql/format_options.md +++ b/docs/source/user-guide/sql/format_options.md @@ -147,7 +147,6 @@ The following options are available when reading or writing Parquet files. If an | SKIP_METADATA | No | If true, skips optional embedded metadata in the file schema. | `'skip_metadata'` | true | | METADATA_SIZE_HINT | No | Sets the size hint (in bytes) for fetching Parquet file metadata. | `'metadata_size_hint'` | None | | PUSHDOWN_FILTERS | No | If true, enables filter pushdown during Parquet decoding. | `'pushdown_filters'` | false | -| REORDER_FILTERS | No | If true, enables heuristic reordering of filters during Parquet decoding. | `'reorder_filters'` | false | | SCHEMA_FORCE_VIEW_TYPES | No | If true, reads Utf8/Binary columns as view types. | `'schema_force_view_types'` | true | | BINARY_AS_STRING | No | If true, reads Binary columns as strings. | `'binary_as_string'` | false | | DATA_PAGESIZE_LIMIT | No | Sets best effort maximum size of data page in bytes. | `'data_pagesize_limit'` | 1048576 |