From 2ec4dc32166dd38d796b3b4e1a9d4ce8ad9a749c Mon Sep 17 00:00:00 2001
From: lvlongxiang <lvlongxiang1994@gmail.com>
Date: Mon, 10 Feb 2025 00:58:10 +0000
Subject: [PATCH 1/4] Implement predicate pruning for not like expressions

---
 datafusion/core/tests/fuzz_cases/pruning.rs  |  14 ++
 datafusion/physical-optimizer/src/pruning.rs | 213 +++++++++++++++++--
 2 files changed, 214 insertions(+), 13 deletions(-)
diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs
index fef009fa911c6..32999007e2399 100644
--- a/datafusion/core/tests/fuzz_cases/pruning.rs
+++ b/datafusion/core/tests/fuzz_cases/pruning.rs
@@ -110,6 +110,13 @@ async fn test_utf8_not_like_prefix() {
         .await;
 }
 
+#[tokio::test]
+async fn test_utf8_not_like_ecsape() {
+    Utf8Test::new(|value| col("a").not_like(lit(format!("\\%{}%", value))))
+        .run()
+        .await;
+}
+
 #[tokio::test]
 async fn test_utf8_not_like_suffix() {
     Utf8Test::new(|value| col("a").not_like(lit(format!("{}%", value))))
@@ -117,6 +124,13 @@ async fn test_utf8_not_like_suffix() {
         .await;
 }
 
+#[tokio::test]
+async fn test_utf8_not_like_suffix_one() {
+    Utf8Test::new(|value| col("a").not_like(lit(format!("{}_", value))))
+        .run()
+        .await;
+}
+
 /// Fuzz testing for UTF8 predicate pruning
 /// The basic idea is that query results should always be the same with or without stats/pruning
 /// If we get this right we at least guarantee that there are no incorrect results
diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs
index 8bf0ffbd3c32f..3f6374dc34581 100644
--- a/datafusion/physical-optimizer/src/pruning.rs
+++ b/datafusion/physical-optimizer/src/pruning.rs
@@ -1590,6 +1590,7 @@ fn build_statistics_expr(
                 )),
             ))
         }
+        Operator::NotLikeMatch => build_not_like_match(expr_builder)?,
         Operator::LikeMatch => build_like_match(expr_builder).ok_or_else(|| {
             plan_datafusion_err!(
                 "LIKE expression with wildcard at the beginning is not supported"
@@ -1638,6 +1639,19 @@ fn build_statistics_expr(
     Ok(statistics_expr)
 }
 
+/// returns the string literal of the scalar value if it is a string
+fn unpack_string(s: &ScalarValue) -> Option<&str> {
+    s.try_as_str().flatten()
+}
+
+fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&str> {
+    if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+        let s = unpack_string(lit.value())?;
+        return Some(s);
+    }
+    None
+}
+
 /// Convert `column LIKE literal` where P is a constant prefix of the literal
 /// to a range check on the column: `P <= column && column < P'`, where P' is the
 /// lowest string after all P* strings.
@@ -1650,19 +1664,6 @@ fn build_like_match(
     // column LIKE '%foo%' => min <= '' && '' <= max => true
     // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
 
-    /// returns the string literal of the scalar value if it is a string
-    fn unpack_string(s: &ScalarValue) -> Option<&str> {
-        s.try_as_str().flatten()
-    }
-
-    fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&str> {
-        if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
-            let s = unpack_string(lit.value())?;
-            return Some(s);
-        }
-        None
-    }
-
     // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
     //  this may involve building the physical expressions that call lower() and upper()
     let min_column_expr = expr_builder.min_column_expr().ok()?;
@@ -1710,6 +1711,66 @@ fn build_like_match(
     Some(combined)
 }
 
+// For predicate `col NOT LIKE 'foo%'`, we rewrite it as `(col_min NOT LIKE 'foo%' OR col_max NOT LIKE 'foo%')`. If both col_min and col_max have the prefix foo, we skip the entire row group (as we can be certain that all data in this row group has the prefix foo).
+fn build_not_like_match(
+    expr_builder: &mut PruningExpressionBuilder<'_>,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    // col NOT LIKE 'prefix%' ->  !(col_min LIKE 'prefix%' && col_max LIKE 'prefix%') -> (col_min NOT LIKE 'prefix%' || col_max NOT LIKE 'prefix%')
+
+    let min_column_expr = expr_builder.min_column_expr()?;
+    let max_column_expr = expr_builder.max_column_expr()?;
+
+    let scalar_expr = expr_builder.scalar_expr();
+
+    let pattern = extract_string_literal(scalar_expr).ok_or_else(|| {
+        plan_datafusion_err!("cannot extract literal from NOT LIKE expression")
+    })?;
+
+    let chars: Vec<char> = pattern.chars().collect();
+    for i in 0..chars.len() - 1 {
+        // Check if current char is a wildcard and is not escaped with backslash
+        if (chars[i] == '%' || chars[i] == '_') && (i == 0 || chars[i - 1] != '\\') {
+            // Example: For pattern "foo%bar", the row group might include values like
+            // ["foobar", "food", "foodbar"], making it unsafe to prune.
+            // Even if the min/max values in the group (e.g., "foobar" and "foodbar")
+            // match the pattern, intermediate values like "food" may not
+            // match the full pattern "foo%bar", making pruning unsafe.
+            // (truncate foo%bar to foo% have same problem)
+            return Err(plan_datafusion_err!(
+                "NOT LIKE expressions with unescaped wildcards ('%' or '_') at the beginning or middle of the pattern are not supported"
+            ));
+        }
+    }
+
+    if chars.last() == Some(&'_') && (chars.len() > 1 && chars[chars.len() - 2] != '\\') {
+        // Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"],
+        // which means not every row is guaranteed to match the pattern.
+        return Err(plan_datafusion_err!(
+            "NOT LIKE expressions with unescaped '_' at the end of the pattern are not supported"
+        ));
+    }
+
+    let min_col_not_like_epxr = Arc::new(phys_expr::LikeExpr::new(
+        true,
+        false,
+        Arc::clone(&min_column_expr),
+        Arc::clone(scalar_expr),
+    ));
+
+    let max_col_not_like_expr = Arc::new(phys_expr::LikeExpr::new(
+        true,
+        false,
+        Arc::clone(&max_column_expr),
+        Arc::clone(scalar_expr),
+    ));
+
+    Ok(Arc::new(phys_expr::BinaryExpr::new(
+        min_col_not_like_epxr,
+        Operator::Or,
+        max_col_not_like_expr,
+    )))
+}
+
 /// Increment a UTF8 string by one, returning `None` if it can't be incremented.
 /// This makes it so that the returned string will always compare greater than the input string
 /// or any other string with the same prefix.
@@ -4061,6 +4122,132 @@ mod tests {
         prune_with_expr(expr, &schema, &statistics, expected_ret);
     }
 
+    #[test]
+    fn prune_utf8_not_like_one() {
+        let (schema, statistics) = utf8_setup();
+
+        let expr = col("s1").not_like(lit("A\u{10ffff}_"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["M", "M"] ==> some rows could pass (must keep)
+            true,
+            // s1 [NULL, NULL]  ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["", "A"]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]  ==> no row match, but (min, max) may be truncated (must keep):
+            // the original (min, max) may be ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}")
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+    }
+
+    #[test]
+    fn prune_utf8_not_like_many() {
+        let (schema, statistics) = utf8_setup();
+
+        let expr = col("s1").not_like(lit("A\u{10ffff}%"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["M", "M"] ==> some rows could pass (must keep)
+            true,
+            // s1 [NULL, NULL]  ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["", "A"]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]  ==> no row match
+            false,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        let expr = col("s1").not_like(lit("A\u{10ffff}%\u{10ffff}"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["M", "M"] ==> some rows could pass (must keep)
+            true,
+            // s1 [NULL, NULL]  ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["", "A"]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]  ==> some rows could pass (must keep)
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        let expr = col("s1").not_like(lit("M"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["M", "M"] ==> no row match
+            false,
+            // s1 [NULL, NULL]  ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["", "A"]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]  ==> some rows could pass (must keep)
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        let expr = col("s1").not_like(lit("A\\%%"));
+        let statistics = TestStatistics::new().with(
+            "s1",
+            ContainerStats::new_utf8(
+                vec![Some("A%a"), Some("A")],
+                vec![Some("A%c"), Some("A")],
+            ),
+        );
+        let expected_ret = &[false, true];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+    }
+
     #[test]
     fn test_rewrite_expr_to_prunable() {
         let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);

From 44e1468a49669f8f9cb32d590231d0ca593565ec Mon Sep 17 00:00:00 2001
From: lvlongxiang <lvlongxiang1994@gmail.com>
Date: Wed, 12 Feb 2025 18:39:00 +0800
Subject: [PATCH 2/4] add split_constant_prefix

---
 datafusion/physical-optimizer/src/pruning.rs | 76 ++++++++------------
 1 file changed, 30 insertions(+), 46 deletions(-)

diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs
index 3f6374dc34581..1c7c62e4ad300 100644
--- a/datafusion/physical-optimizer/src/pruning.rs
+++ b/datafusion/physical-optimizer/src/pruning.rs
@@ -1711,11 +1711,11 @@ fn build_like_match(
     Some(combined)
 }
 
-// For predicate `col NOT LIKE 'foo%'`, we rewrite it as `(col_min NOT LIKE 'foo%' OR col_max NOT LIKE 'foo%')`. If both col_min and col_max have the prefix foo, we skip the entire row group (as we can be certain that all data in this row group has the prefix foo).
+// For predicate `col NOT LIKE 'const_prefix%'`, we rewrite it as `(col_min NOT LIKE 'const_prefix%' OR col_max NOT LIKE 'const_prefix%')`. If both col_min and col_max have the prefix const_prefix, we skip the entire row group (as we can be certain that all data in this row group has the prefix const_prefix).
 fn build_not_like_match(
     expr_builder: &mut PruningExpressionBuilder<'_>,
 ) -> Result<Arc<dyn PhysicalExpr>> {
-    // col NOT LIKE 'prefix%' ->  !(col_min LIKE 'prefix%' && col_max LIKE 'prefix%') -> (col_min NOT LIKE 'prefix%' || col_max NOT LIKE 'prefix%')
+    // col NOT LIKE 'const_prefix%' -> !(col_min LIKE 'const_prefix%' && col_max LIKE 'const_prefix%') -> (col_min NOT LIKE 'const_prefix%' || col_max NOT LIKE 'const_prefix%')
 
     let min_column_expr = expr_builder.min_column_expr()?;
     let max_column_expr = expr_builder.max_column_expr()?;
@@ -1726,27 +1726,21 @@ fn build_not_like_match(
         plan_datafusion_err!("cannot extract literal from NOT LIKE expression")
     })?;
 
-    let chars: Vec<char> = pattern.chars().collect();
-    for i in 0..chars.len() - 1 {
-        // Check if current char is a wildcard and is not escaped with backslash
-        if (chars[i] == '%' || chars[i] == '_') && (i == 0 || chars[i - 1] != '\\') {
-            // Example: For pattern "foo%bar", the row group might include values like
-            // ["foobar", "food", "foodbar"], making it unsafe to prune.
-            // Even if the min/max values in the group (e.g., "foobar" and "foodbar")
-            // match the pattern, intermediate values like "food" may not
-            // match the full pattern "foo%bar", making pruning unsafe.
-            // (truncate foo%bar to foo% have same problem)
-            return Err(plan_datafusion_err!(
-                "NOT LIKE expressions with unescaped wildcards ('%' or '_') at the beginning or middle of the pattern are not supported"
-            ));
-        }
-    }
-
-    if chars.last() == Some(&'_') && (chars.len() > 1 && chars[chars.len() - 2] != '\\') {
+    let (const_prefix, remaining) = split_constant_prefix(pattern);
+    if const_prefix.is_empty() || remaining != "%" {
+        // We cannot handle `%` at the beginning or in the middle of the pattern.
+        // Example: For pattern "foo%bar", the row group might include values like
+        // ["foobar", "food", "foodbar"], making it unsafe to prune.
+        // Even if the min/max values in the group (e.g., "foobar" and "foodbar")
+        // match the pattern, intermediate values like "food" may not
+        // match the full pattern "foo%bar", making pruning unsafe.
+        // (Truncating "foo%bar" to "foo%" has the same problem.)
+
+        // We cannot handle patterns containing `_` either.
         // Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"],
         // which means not every row is guaranteed to match the pattern.
         return Err(plan_datafusion_err!(
-            "NOT LIKE expressions with unescaped '_' at the end of the pattern are not supported"
+            "NOT LIKE expressions only support constant_prefix+wildcard`%`"
         ));
     }
 
@@ -1771,6 +1765,22 @@ fn build_not_like_match(
     )))
 }
 
+/// Splits a LIKE pattern at its first unescaped wildcard (`%` or `_`): returns the constant prefix (possibly empty, escapes left as written) and the remaining pattern (possibly empty)
+fn split_constant_prefix(pattern: &str) -> (&str, &str) {
+    let char_indices = pattern.char_indices().collect::<Vec<_>>();
+    for i in 0..char_indices.len() {
+        let (idx, char) = char_indices[i];
+        if char == '%' || char == '_' {
+            if i != 0 && char_indices[i - 1].1 == '\\' {
+                // escaped by the preceding `\` (NOTE(review): `\\%` is also treated as an escaped `%` here)
+                continue;
+            }
+            return (&pattern[..idx], &pattern[idx..]);
+        }
+    }
+    (pattern, "")
+}
+
 /// Increment a UTF8 string by one, returning `None` if it can't be incremented.
 /// This makes it so that the returned string will always compare greater than the input string
 /// or any other string with the same prefix.
@@ -4210,32 +4220,6 @@ mod tests {
         ];
         prune_with_expr(expr, &schema, &statistics, expected_ret);
 
-        let expr = col("s1").not_like(lit("M"));
-        #[rustfmt::skip]
-        let expected_ret = &[
-            // s1 ["A", "Z"] ==> some rows could pass (must keep)
-            true,
-            // s1 ["A", "L"] ==> some rows could pass (must keep)
-            true,
-            // s1 ["N", "Z"] ==> some rows could pass (must keep)
-            true,
-            // s1 ["M", "M"] ==> no row match
-            false,
-            // s1 [NULL, NULL]  ==> unknown (must keep)
-            true,
-            // s1 ["A", NULL]  ==> some rows could pass (must keep)
-            true,
-            // s1 ["", "A"]  ==> some rows could pass (must keep)
-            true,
-            // s1 ["", ""]  ==> some rows could pass (must keep)
-            true,
-            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"]  ==> some rows could pass (must keep)
-            true,
-            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]  ==> some rows could pass (must keep)
-            true,
-        ];
-        prune_with_expr(expr, &schema, &statistics, expected_ret);
-
         let expr = col("s1").not_like(lit("A\\%%"));
         let statistics = TestStatistics::new().with(
             "s1",

From e8914c078687061c51d32717d415c33b09aa440b Mon Sep 17 00:00:00 2001
From: UBarney <UBarney@users.noreply.github.com>
Date: Thu, 13 Feb 2025 21:53:47 +0800
Subject: [PATCH 3/4] Update datafusion/physical-optimizer/src/pruning.rs

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
---
 datafusion/physical-optimizer/src/pruning.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs
index 1c7c62e4ad300..ac6a9600a67a8 100644
--- a/datafusion/physical-optimizer/src/pruning.rs
+++ b/datafusion/physical-optimizer/src/pruning.rs
@@ -1711,7 +1711,11 @@ fn build_like_match(
     Some(combined)
 }
 
-// For predicate `col NOT LIKE 'const_prefix%'`, we rewrite it as `(col_min NOT LIKE 'const_prefix%' OR col_max NOT LIKE 'const_prefix%')`. If both col_min and col_max have the prefix const_prefix, we skip the entire row group (as we can be certain that all data in this row group has the prefix const_prefix).
+// For predicate `col NOT LIKE 'const_prefix%'`, we rewrite it as `(col_min NOT LIKE 'const_prefix%' OR col_max NOT LIKE 'const_prefix%')`. 
+//
+// The intuition is that if both `col_min` and `col_max` begin with `const_prefix` that means
+// **all** data in this row group begins with `const_prefix` as well (and therefore the predicate
+// looking for rows that don't begin with `const_prefix` can never be true)
 fn build_not_like_match(
     expr_builder: &mut PruningExpressionBuilder<'_>,
 ) -> Result<Arc<dyn PhysicalExpr>> {

From c620d30d3c404206e840d189cebe0078e45df67f Mon Sep 17 00:00:00 2001
From: lvlongxiang <lvlongxiang1994@gmail.com>
Date: Mon, 10 Feb 2025 00:58:10 +0000
Subject: [PATCH 4/4] add more testcase

---
 datafusion/physical-optimizer/src/pruning.rs | 28 +++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs
index ac6a9600a67a8..2004aeafb8933 100644
--- a/datafusion/physical-optimizer/src/pruning.rs
+++ b/datafusion/physical-optimizer/src/pruning.rs
@@ -1711,7 +1711,7 @@ fn build_like_match(
     Some(combined)
 }
 
-// For predicate `col NOT LIKE 'const_prefix%'`, we rewrite it as `(col_min NOT LIKE 'const_prefix%' OR col_max NOT LIKE 'const_prefix%')`. 
+// For predicate `col NOT LIKE 'const_prefix%'`, we rewrite it as `(col_min NOT LIKE 'const_prefix%' OR col_max NOT LIKE 'const_prefix%')`.
 //
 // The intuition is that if both `col_min` and `col_max` begin with `const_prefix` that means
 // **all** data in this row group begins with `const_prefix` as well (and therefore the predicate
@@ -4224,6 +4224,32 @@ mod tests {
         ];
         prune_with_expr(expr, &schema, &statistics, expected_ret);
 
+        let expr = col("s1").not_like(lit("A\u{10ffff}%\u{10ffff}_"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["M", "M"] ==> some rows could pass (must keep)
+            true,
+            // s1 [NULL, NULL]  ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["", "A"]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"]  ==> some rows could pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]  ==> some rows could pass (must keep)
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
         let expr = col("s1").not_like(lit("A\\%%"));
         let statistics = TestStatistics::new().with(
             "s1",