From 2ec4dc32166dd38d796b3b4e1a9d4ce8ad9a749c Mon Sep 17 00:00:00 2001 From: lvlongxiang Date: Mon, 10 Feb 2025 00:58:10 +0000 Subject: [PATCH 1/4] Implement predicate pruning for not like expressions --- datafusion/core/tests/fuzz_cases/pruning.rs | 14 ++ datafusion/physical-optimizer/src/pruning.rs | 213 +++++++++++++++++-- 2 files changed, 214 insertions(+), 13 deletions(-) diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs index fef009fa911c6..32999007e2399 100644 --- a/datafusion/core/tests/fuzz_cases/pruning.rs +++ b/datafusion/core/tests/fuzz_cases/pruning.rs @@ -110,6 +110,13 @@ async fn test_utf8_not_like_prefix() { .await; } +#[tokio::test] +async fn test_utf8_not_like_ecsape() { + Utf8Test::new(|value| col("a").not_like(lit(format!("\\%{}%", value)))) + .run() + .await; +} + #[tokio::test] async fn test_utf8_not_like_suffix() { Utf8Test::new(|value| col("a").not_like(lit(format!("{}%", value)))) @@ -117,6 +124,13 @@ async fn test_utf8_not_like_suffix() { .await; } +#[tokio::test] +async fn test_utf8_not_like_suffix_one() { + Utf8Test::new(|value| col("a").not_like(lit(format!("{}_", value)))) + .run() + .await; +} + /// Fuzz testing for UTF8 predicate pruning /// The basic idea is that query results should always be the same with or without stats/pruning /// If we get this right we at least guarantee that there are no incorrect results diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index 8bf0ffbd3c32f..3f6374dc34581 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -1590,6 +1590,7 @@ fn build_statistics_expr( )), )) } + Operator::NotLikeMatch => build_not_like_match(expr_builder)?, Operator::LikeMatch => build_like_match(expr_builder).ok_or_else(|| { plan_datafusion_err!( "LIKE expression with wildcard at the beginning is not supported" @@ -1638,6 +1639,19 @@ fn build_statistics_expr( 
Ok(statistics_expr) } +/// returns the string literal of the scalar value if it is a string +fn unpack_string(s: &ScalarValue) -> Option<&str> { + s.try_as_str().flatten() +} + +fn extract_string_literal(expr: &Arc) -> Option<&str> { + if let Some(lit) = expr.as_any().downcast_ref::() { + let s = unpack_string(lit.value())?; + return Some(s); + } + None +} + /// Convert `column LIKE literal` where P is a constant prefix of the literal /// to a range check on the column: `P <= column && column < P'`, where P' is the /// lowest string after all P* strings. @@ -1650,19 +1664,6 @@ fn build_like_match( // column LIKE '%foo%' => min <= '' && '' <= max => true // column LIKE 'foo' => min <= 'foo' && 'foo' <= max - /// returns the string literal of the scalar value if it is a string - fn unpack_string(s: &ScalarValue) -> Option<&str> { - s.try_as_str().flatten() - } - - fn extract_string_literal(expr: &Arc) -> Option<&str> { - if let Some(lit) = expr.as_any().downcast_ref::() { - let s = unpack_string(lit.value())?; - return Some(s); - } - None - } - // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase // this may involve building the physical expressions that call lower() and upper() let min_column_expr = expr_builder.min_column_expr().ok()?; @@ -1710,6 +1711,66 @@ fn build_like_match( Some(combined) } +// For predicate `col NOT LIKE 'foo%'`, we rewrite it as `(col_min NOT LIKE 'foo%' OR col_max NOT LIKE 'foo%')`. If both col_min and col_max have the prefix foo, we skip the entire row group (as we can be certain that all data in this row group has the prefix foo). 
+fn build_not_like_match( + expr_builder: &mut PruningExpressionBuilder<'_>, +) -> Result> { + // col NOT LIKE 'prefix%' -> !(col_min LIKE 'prefix%' && col_max LIKE 'prefix%') -> (col_min NOT LIKE 'prefix%' || col_max NOT LIKE 'prefix%') + + let min_column_expr = expr_builder.min_column_expr()?; + let max_column_expr = expr_builder.max_column_expr()?; + + let scalar_expr = expr_builder.scalar_expr(); + + let pattern = extract_string_literal(scalar_expr).ok_or_else(|| { + plan_datafusion_err!("cannot extract literal from NOT LIKE expression") + })?; + + let chars: Vec = pattern.chars().collect(); + for i in 0..chars.len() - 1 { + // Check if current char is a wildcard and is not escaped with backslash + if (chars[i] == '%' || chars[i] == '_') && (i == 0 || chars[i - 1] != '\\') { + // Example: For pattern "foo%bar", the row group might include values like + // ["foobar", "food", "foodbar"], making it unsafe to prune. + // Even if the min/max values in the group (e.g., "foobar" and "foodbar") + // match the pattern, intermediate values like "food" may not + // match the full pattern "foo%bar", making pruning unsafe. + // (truncate foo%bar to foo% have same problem) + return Err(plan_datafusion_err!( + "NOT LIKE expressions with unescaped wildcards ('%' or '_') at the beginning or middle of the pattern are not supported" + )); + } + } + + if chars.last() == Some(&'_') && (chars.len() > 1 && chars[chars.len() - 2] != '\\') { + // Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"], + // which means not every row is guaranteed to match the pattern. 
+ return Err(plan_datafusion_err!( + "NOT LIKE expressions with unescaped '_' at the end of the pattern are not supported" + )); + } + + let min_col_not_like_epxr = Arc::new(phys_expr::LikeExpr::new( + true, + false, + Arc::clone(&min_column_expr), + Arc::clone(scalar_expr), + )); + + let max_col_not_like_expr = Arc::new(phys_expr::LikeExpr::new( + true, + false, + Arc::clone(&max_column_expr), + Arc::clone(scalar_expr), + )); + + Ok(Arc::new(phys_expr::BinaryExpr::new( + min_col_not_like_epxr, + Operator::Or, + max_col_not_like_expr, + ))) +} + /// Increment a UTF8 string by one, returning `None` if it can't be incremented. /// This makes it so that the returned string will always compare greater than the input string /// or any other string with the same prefix. @@ -4061,6 +4122,132 @@ mod tests { prune_with_expr(expr, &schema, &statistics, expected_ret); } + #[test] + fn prune_utf8_not_like_one() { + let (schema, statistics) = utf8_setup(); + + let expr = col("s1").not_like(lit("A\u{10ffff}_")); + #[rustfmt::skip] + let expected_ret = &[ + // s1 ["A", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["A", "L"] ==> some rows could pass (must keep) + true, + // s1 ["N", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["M", "M"] ==> some rows could pass (must keep) + true, + // s1 [NULL, NULL] ==> unknown (must keep) + true, + // s1 ["A", NULL] ==> some rows could pass (must keep) + true, + // s1 ["", "A"] ==> some rows could pass (must keep) + true, + // s1 ["", ""] ==> some rows could pass (must keep) + true, + // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) + true, + // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row match. 
(min, max) may be truncated + // original (min, max) may be ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}") + true, + ]; + prune_with_expr(expr, &schema, &statistics, expected_ret); + } + + #[test] + fn prune_utf8_not_like_many() { + let (schema, statistics) = utf8_setup(); + + let expr = col("s1").not_like(lit("A\u{10ffff}%")); + #[rustfmt::skip] + let expected_ret = &[ + // s1 ["A", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["A", "L"] ==> some rows could pass (must keep) + true, + // s1 ["N", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["M", "M"] ==> some rows could pass (must keep) + true, + // s1 [NULL, NULL] ==> unknown (must keep) + true, + // s1 ["A", NULL] ==> some rows could pass (must keep) + true, + // s1 ["", "A"] ==> some rows could pass (must keep) + true, + // s1 ["", ""] ==> some rows could pass (must keep) + true, + // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) + true, + // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row match + false, + ]; + prune_with_expr(expr, &schema, &statistics, expected_ret); + + let expr = col("s1").not_like(lit("A\u{10ffff}%\u{10ffff}")); + #[rustfmt::skip] + let expected_ret = &[ + // s1 ["A", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["A", "L"] ==> some rows could pass (must keep) + true, + // s1 ["N", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["M", "M"] ==> some rows could pass (must keep) + true, + // s1 [NULL, NULL] ==> unknown (must keep) + true, + // s1 ["A", NULL] ==> some rows could pass (must keep) + true, + // s1 ["", "A"] ==> some rows could pass (must keep) + true, + // s1 ["", ""] ==> some rows could pass (must keep) + true, + // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) + true, + // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) + true, + ]; + prune_with_expr(expr, &schema, 
&statistics, expected_ret); + + let expr = col("s1").not_like(lit("M")); + #[rustfmt::skip] + let expected_ret = &[ + // s1 ["A", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["A", "L"] ==> some rows could pass (must keep) + true, + // s1 ["N", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["M", "M"] ==> no row match + false, + // s1 [NULL, NULL] ==> unknown (must keep) + true, + // s1 ["A", NULL] ==> some rows could pass (must keep) + true, + // s1 ["", "A"] ==> some rows could pass (must keep) + true, + // s1 ["", ""] ==> some rows could pass (must keep) + true, + // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) + true, + // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) + true, + ]; + prune_with_expr(expr, &schema, &statistics, expected_ret); + + let expr = col("s1").not_like(lit("A\\%%")); + let statistics = TestStatistics::new().with( + "s1", + ContainerStats::new_utf8( + vec![Some("A%a"), Some("A")], + vec![Some("A%c"), Some("A")], + ), + ); + let expected_ret = &[false, true]; + prune_with_expr(expr, &schema, &statistics, expected_ret); + } + #[test] fn test_rewrite_expr_to_prunable() { let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); From 44e1468a49669f8f9cb32d590231d0ca593565ec Mon Sep 17 00:00:00 2001 From: lvlongxiang Date: Wed, 12 Feb 2025 18:39:00 +0800 Subject: [PATCH 2/4] add split_constant_prefix --- datafusion/physical-optimizer/src/pruning.rs | 76 ++++++++------------ 1 file changed, 30 insertions(+), 46 deletions(-) diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index 3f6374dc34581..1c7c62e4ad300 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -1711,11 +1711,11 @@ fn build_like_match( Some(combined) } -// For predicate `col NOT LIKE 'foo%'`, we rewrite it as `(col_min NOT LIKE 'foo%' OR col_max NOT 
LIKE 'foo%')`. If both col_min and col_max have the prefix foo, we skip the entire row group (as we can be certain that all data in this row group has the prefix foo). +// For predicate `col NOT LIKE 'const_prefix%'`, we rewrite it as `(col_min NOT LIKE 'const_prefix%' OR col_max NOT LIKE 'const_prefix%')`. If both col_min and col_max have the prefix const_prefix, we skip the entire row group (as we can be certain that all data in this row group has the prefix const_prefix). fn build_not_like_match( expr_builder: &mut PruningExpressionBuilder<'_>, ) -> Result> { - // col NOT LIKE 'prefix%' -> !(col_min LIKE 'prefix%' && col_max LIKE 'prefix%') -> (col_min NOT LIKE 'prefix%' || col_max NOT LIKE 'prefix%') + // col NOT LIKE 'const_prefix%' -> !(col_min LIKE 'const_prefix%' && col_max LIKE 'const_prefix%') -> (col_min NOT LIKE 'const_prefix%' || col_max NOT LIKE 'const_prefix%') let min_column_expr = expr_builder.min_column_expr()?; let max_column_expr = expr_builder.max_column_expr()?; @@ -1726,27 +1726,21 @@ fn build_not_like_match( plan_datafusion_err!("cannot extract literal from NOT LIKE expression") })?; - let chars: Vec = pattern.chars().collect(); - for i in 0..chars.len() - 1 { - // Check if current char is a wildcard and is not escaped with backslash - if (chars[i] == '%' || chars[i] == '_') && (i == 0 || chars[i - 1] != '\\') { - // Example: For pattern "foo%bar", the row group might include values like - // ["foobar", "food", "foodbar"], making it unsafe to prune. - // Even if the min/max values in the group (e.g., "foobar" and "foodbar") - // match the pattern, intermediate values like "food" may not - // match the full pattern "foo%bar", making pruning unsafe. 
- // (truncate foo%bar to foo% have same problem) - return Err(plan_datafusion_err!( - "NOT LIKE expressions with unescaped wildcards ('%' or '_') at the beginning or middle of the pattern are not supported" - )); - } - } - - if chars.last() == Some(&'_') && (chars.len() > 1 && chars[chars.len() - 2] != '\\') { + let (const_prefix, remaining) = split_constant_prefix(pattern); + if const_prefix.is_empty() || remaining != "%" { + // we cannot handle `%` at the beginning or in the middle of the pattern + // Example: For pattern "foo%bar", the row group might include values like + // ["foobar", "food", "foodbar"], making it unsafe to prune. + // Even if the min/max values in the group (e.g., "foobar" and "foodbar") + // match the pattern, intermediate values like "food" may not + // match the full pattern "foo%bar", making pruning unsafe. + // (truncating foo%bar to foo% has the same problem) + + // we cannot handle patterns containing `_` // Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"], // which means not every row is guaranteed to match the pattern. return Err(plan_datafusion_err!( - "NOT LIKE expressions with unescaped '_' at the end of the pattern are not supported" + "NOT LIKE expressions only support constant_prefix+wildcard`%`" )); } @@ -1771,6 +1765,22 @@ fn build_not_like_match( ))) } +/// Returns unescaped constant prefix of a LIKE pattern (possibly empty) and the remaining pattern (possibly empty) +fn split_constant_prefix(pattern: &str) -> (&str, &str) { + let char_indices = pattern.char_indices().collect::>(); + for i in 0..char_indices.len() { + let (idx, char) = char_indices[i]; + if char == '%' || char == '_' { + if i != 0 && char_indices[i - 1].1 == '\\' { + // escaped by `\` + continue; + } + return (&pattern[..idx], &pattern[idx..]); + } + } + (pattern, "") +} + /// Increment a UTF8 string by one, returning `None` if it can't be incremented. 
/// This makes it so that the returned string will always compare greater than the input string /// or any other string with the same prefix. @@ -4210,32 +4220,6 @@ mod tests { ]; prune_with_expr(expr, &schema, &statistics, expected_ret); - let expr = col("s1").not_like(lit("M")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["A", "L"] ==> some rows could pass (must keep) - true, - // s1 ["N", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["M", "M"] ==> no row match - false, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> some rows could pass (must keep) - true, - // s1 ["", "A"] ==> some rows could pass (must keep) - true, - // s1 ["", ""] ==> some rows could pass (must keep) - true, - // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) - true, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) - true, - ]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - let expr = col("s1").not_like(lit("A\\%%")); let statistics = TestStatistics::new().with( "s1", From e8914c078687061c51d32717d415c33b09aa440b Mon Sep 17 00:00:00 2001 From: UBarney Date: Thu, 13 Feb 2025 21:53:47 +0800 Subject: [PATCH 3/4] Update datafusion/physical-optimizer/src/pruning.rs Co-authored-by: Andrew Lamb --- datafusion/physical-optimizer/src/pruning.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index 1c7c62e4ad300..ac6a9600a67a8 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -1711,7 +1711,11 @@ fn build_like_match( Some(combined) } -// For predicate `col NOT LIKE 'const_prefix%'`, we rewrite it as `(col_min NOT LIKE 'const_prefix%' OR col_max NOT LIKE 'const_prefix%')`. 
If both col_min and col_max have the prefix const_prefix, we skip the entire row group (as we can be certain that all data in this row group has the prefix const_prefix). +// For predicate `col NOT LIKE 'const_prefix%'`, we rewrite it as `(col_min NOT LIKE 'const_prefix%' OR col_max NOT LIKE 'const_prefix%')`. +// +// The intuition is that if both `col_min` and `col_max` begin with `const_prefix` that means +// **all** data in this row group begins with `const_prefix` as well (and therefore the predicate +// looking for rows that don't begin with `const_prefix` can never be true) fn build_not_like_match( expr_builder: &mut PruningExpressionBuilder<'_>, ) -> Result> { From c620d30d3c404206e840d189cebe0078e45df67f Mon Sep 17 00:00:00 2001 From: lvlongxiang Date: Mon, 10 Feb 2025 00:58:10 +0000 Subject: [PATCH 4/4] add more testcase --- datafusion/physical-optimizer/src/pruning.rs | 28 +++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index ac6a9600a67a8..2004aeafb8933 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -1711,7 +1711,7 @@ fn build_like_match( Some(combined) } -// For predicate `col NOT LIKE 'const_prefix%'`, we rewrite it as `(col_min NOT LIKE 'const_prefix%' OR col_max NOT LIKE 'const_prefix%')`. +// For predicate `col NOT LIKE 'const_prefix%'`, we rewrite it as `(col_min NOT LIKE 'const_prefix%' OR col_max NOT LIKE 'const_prefix%')`. 
// // The intuition is that if both `col_min` and `col_max` begin with `const_prefix` that means // **all** data in this row group begins with `const_prefix` as well (and therefore the predicate @@ -4224,6 +4224,32 @@ mod tests { ]; prune_with_expr(expr, &schema, &statistics, expected_ret); + let expr = col("s1").not_like(lit("A\u{10ffff}%\u{10ffff}_")); + #[rustfmt::skip] + let expected_ret = &[ + // s1 ["A", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["A", "L"] ==> some rows could pass (must keep) + true, + // s1 ["N", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["M", "M"] ==> some rows could pass (must keep) + true, + // s1 [NULL, NULL] ==> unknown (must keep) + true, + // s1 ["A", NULL] ==> some rows could pass (must keep) + true, + // s1 ["", "A"] ==> some rows could pass (must keep) + true, + // s1 ["", ""] ==> some rows could pass (must keep) + true, + // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) + true, + // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) + true, + ]; + prune_with_expr(expr, &schema, &statistics, expected_ret); + let expr = col("s1").not_like(lit("A\\%%")); let statistics = TestStatistics::new().with( "s1",