From c94e03c63ed7524b53798ea7e64703086be470d2 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Thu, 26 Feb 2026 11:21:26 -0800 Subject: [PATCH 01/10] Support utf8view in regex simplify optimization --- .../src/simplify_expressions/regex.rs | 61 +++++++++++-------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index d388aaf74cdac..b28ffde5c8194 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -46,10 +46,16 @@ pub fn simplify_regex_expr( ) -> Result { let mode = OperatorMode::new(&op); - if let Expr::Literal(ScalarValue::Utf8(Some(pattern)), _) = right.as_ref() { - // Handle the special case for ".*" pattern - if pattern == ANY_CHAR_REGEX_PATTERN { - let new_expr = if mode.not { + let (pattern, is_utf8) = match right.as_ref() { + Expr::Literal(ScalarValue::Utf8(Some(p)), _) => (p.as_str(), true), + Expr::Literal(ScalarValue::Utf8View(Some(p)), _) => (p.as_str(), false), + _ => return Ok(Expr::BinaryExpr(BinaryExpr { left, op, right })), + }; + + // Handle the special case for ".*" pattern + if pattern == ANY_CHAR_REGEX_PATTERN { + let new_expr = if mode.not { + if is_utf8 { // not empty let empty_lit = Box::new(lit("")); Expr::BinaryExpr(BinaryExpr { @@ -58,32 +64,35 @@ pub fn simplify_regex_expr( right: empty_lit, }) } else { - // not null - left.is_not_null() - }; - return Ok(new_expr); - } + // Leave untouched because optimization doesn't work for Utf8View + Expr::BinaryExpr(BinaryExpr { left, op, right }) + } + } else { + // not null + left.is_not_null() + }; + return Ok(new_expr); + } - match regex_syntax::Parser::new().parse(pattern) { - Ok(hir) => { - let kind = hir.kind(); - if let HirKind::Alternation(alts) = kind { - if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION - && let Some(expr) = lower_alt(&mode, &left, alts) - { - return Ok(expr); - } - } else if let Some(expr) = lower_simple(&mode, &left, &hir) { + match regex_syntax::Parser::new().parse(pattern) { + Ok(hir) => { + let kind = hir.kind(); + if let HirKind::Alternation(alts) = kind { + if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION + && let Some(expr) = lower_alt(&mode, &left, alts) + { return Ok(expr); } + } else if let Some(expr) = lower_simple(&mode, &left, &hir) { + return Ok(expr); } - Err(e) => { - // error out early since the execution may fail anyways - return Err(DataFusionError::Context( - "Invalid regex".to_owned(), - Box::new(DataFusionError::External(Box::new(e))), - )); - } + } + Err(e) => { + // error out early since the execution may fail anyways + return Err(DataFusionError::Context( + "Invalid regex".to_owned(), + Box::new(DataFusionError::External(Box::new(e))), + )); } } From 295d9c88ff94bb5b4f5684914502497087dcef41 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Thu, 26 Feb 2026 11:22:58 -0800 Subject: [PATCH 02/10] Fix sqllogictest expected plans --- datafusion/sqllogictest/test_files/simplify_expr.slt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt b/datafusion/sqllogictest/test_files/simplify_expr.slt index 99fc9900ef619..dc55e836f7ed6 100644 --- a/datafusion/sqllogictest/test_files/simplify_expr.slt +++ b/datafusion/sqllogictest/test_files/simplify_expr.slt @@ -34,10 +34,10 @@ query TT explain select b from t where b ~ '.*' ---- logical_plan -01)Filter: t.b ~ Utf8View(".*") +01)Filter: t.b IS NOT NULL 02)--TableScan: t projection=[b] physical_plan -01)FilterExec: b@0 ~ .* +01)FilterExec: b@0 IS NOT NULL 02)--DataSourceExec: partitions=1, partition_sizes=[1] query TT From 23737823d94f2d418fa0900a6607394dd0c75a1a Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Thu, 26 Feb 2026 13:40:59 -0800 Subject: [PATCH 03/10] Fix the core issue by checking and returning the proper str datatype (utf8view and largeutf8 support) --- .../simplify_expressions/expr_simplifier.rs | 4 +- .../src/simplify_expressions/regex.rs | 46 ++++++++++--------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index c6644e008645a..701ffc421de09 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2126,7 +2126,7 @@ fn is_literal_or_literal_cast(expr: &Expr) -> bool { } } -fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option)> { +pub fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option)> { match expr { Expr::Literal(ScalarValue::Utf8(s), _) => Some((DataType::Utf8, s)), Expr::Literal(ScalarValue::LargeUtf8(s), _) => Some((DataType::LargeUtf8, s)), @@ -2135,7 +2135,7 @@ fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option)> { } } -fn to_string_scalar(data_type: &DataType, value: Option) -> Expr { +pub fn to_string_scalar(data_type: &DataType, value: Option) -> Expr { match data_type { DataType::Utf8 => Expr::Literal(ScalarValue::Utf8(value), None), DataType::LargeUtf8 => Expr::Literal(ScalarValue::LargeUtf8(value), None), diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index b28ffde5c8194..6126646ce0ce4 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -15,10 +15,13 @@ // specific language governing permissions and limitations // under the License. -use datafusion_common::{DataFusionError, Result, ScalarValue}; +use arrow::datatypes::DataType; +use datafusion_common::{DataFusionError, Result}; use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit}; use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look}; +use crate::simplify_expressions::expr_simplifier::{as_string_scalar, to_string_scalar}; + /// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions. const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4; @@ -44,29 +47,27 @@ pub fn simplify_regex_expr( op: Operator, right: Box, ) -> Result { - let mode = OperatorMode::new(&op); - - let (pattern, is_utf8) = match right.as_ref() { - Expr::Literal(ScalarValue::Utf8(Some(p)), _) => (p.as_str(), true), - Expr::Literal(ScalarValue::Utf8View(Some(p)), _) => (p.as_str(), false), - _ => return Ok(Expr::BinaryExpr(BinaryExpr { left, op, right })), + // Check if the right operand is a string literal + let Some((datatype, pattern_opt)) = as_string_scalar(&right) else { + return Ok(Expr::BinaryExpr(BinaryExpr { left, op, right })); + }; + let Some(pattern_owned) = pattern_opt.as_ref() else { + return Ok(Expr::BinaryExpr(BinaryExpr { left, op, right })); }; + let pattern = pattern_owned.as_str(); + + let mode = OperatorMode::new(&op, datatype.clone()); // Handle the special case for ".*" pattern if pattern == ANY_CHAR_REGEX_PATTERN { let new_expr = if mode.not { - if is_utf8 { - // not empty - let empty_lit = Box::new(lit("")); - Expr::BinaryExpr(BinaryExpr { - left, - op: Operator::Eq, - right: empty_lit, - }) - } else { - // Leave untouched because optimization doesn't work for Utf8View - Expr::BinaryExpr(BinaryExpr { left, op, right }) - } + // not empty + let empty_lit = Box::new(to_string_scalar(&datatype, Some("".to_string()))); + Expr::BinaryExpr(BinaryExpr { + left, + op: Operator::Eq, + right: empty_lit, + }) } else { // not null left.is_not_null() @@ -106,10 +107,11 @@ struct OperatorMode { not: bool, /// Ignore case (`true` for case-insensitive). i: bool, + datatype: DataType, } impl OperatorMode { - fn new(op: &Operator) -> Self { + fn new(op: &Operator, datatype: DataType) -> Self { let not = match op { Operator::RegexMatch | Operator::RegexIMatch => false, Operator::RegexNotMatch | Operator::RegexNotIMatch => true, @@ -122,7 +124,7 @@ impl OperatorMode { _ => unreachable!(), }; - Self { not, i } + Self { not, i, datatype } } /// Creates an [`LIKE`](Expr::Like) from the given `LIKE` pattern. @@ -130,7 +132,7 @@ impl OperatorMode { let like = Like { negated: self.not, expr, - pattern: Box::new(Expr::Literal(ScalarValue::from(pattern), None)), + pattern: Box::new(to_string_scalar(&self.datatype, Some(pattern))), escape_char: None, case_insensitive: self.i, }; From e1f661bc11675e938b8106016bfa2e58bf560bb0 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Thu, 26 Feb 2026 13:41:12 -0800 Subject: [PATCH 04/10] Update sqllogictests --- datafusion/sqllogictest/test_files/simplify_expr.slt | 4 ++-- datafusion/sqllogictest/test_files/string/string_view.slt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt b/datafusion/sqllogictest/test_files/simplify_expr.slt index dc55e836f7ed6..f8c219e052f80 100644 --- a/datafusion/sqllogictest/test_files/simplify_expr.slt +++ b/datafusion/sqllogictest/test_files/simplify_expr.slt @@ -44,10 +44,10 @@ query TT explain select b from t where b !~ '.*' ---- logical_plan -01)Filter: t.b !~ Utf8View(".*") +01)Filter: t.b = Utf8View("") 02)--TableScan: t projection=[b] physical_plan -01)FilterExec: b@0 !~ .* +01)FilterExec: b@0 = 02)--DataSourceExec: partitions=1, partition_sizes=[1] query T diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 13b0aba653efb..4dcc2f663a830 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -1100,7 +1100,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: test.column1_utf8view ~ Utf8View("an") AS c1 +01)Projection: test.column1_utf8view LIKE Utf8View("%an%") AS c1 02)--TableScan: test projection=[column1_utf8view] # `~*` operator (regex match case-insensitive) From 24387995427e3c04fdbb740ee8913de80299396a Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Thu, 26 Feb 2026 14:10:04 -0800 Subject: [PATCH 05/10] Properly indicate Transformed::no() when appropriate --- .../src/simplify_expressions/expr_simplifier.rs | 3 ++- .../optimizer/src/simplify_expressions/regex.rs | 16 +++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 701ffc421de09..f1023a31209ef 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1660,7 +1660,8 @@ impl TreeNodeRewriter for Simplifier<'_> { left, op: op @ (RegexMatch | RegexNotMatch | RegexIMatch | RegexNotIMatch), right, - }) => Transformed::yes(simplify_regex_expr(left, op, right)?), + // }) => Transformed::yes(simplify_regex_expr(left, op, right)?), + }) => simplify_regex_expr(left, op, right)?, // Rules for Like Expr::Like(like) => { diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 6126646ce0ce4..dfa7461557075 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -17,6 +17,7 @@ use arrow::datatypes::DataType; use datafusion_common::{DataFusionError, Result}; +use datafusion_common::tree_node::Transformed; use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit}; use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look}; @@ -46,13 +47,13 @@ pub fn simplify_regex_expr( left: Box, op: Operator, right: Box, -) -> Result { +) -> Result> { // Check if the right operand is a string literal let Some((datatype, pattern_opt)) = as_string_scalar(&right) else { - return Ok(Expr::BinaryExpr(BinaryExpr { left, op, right })); + return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))); }; let Some(pattern_owned) = pattern_opt.as_ref() else { - return Ok(Expr::BinaryExpr(BinaryExpr { left, op, right })); + return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))); }; let pattern = pattern_owned.as_str(); @@ -72,7 +73,7 @@ pub fn simplify_regex_expr( // not null left.is_not_null() }; - return Ok(new_expr); + return Ok(Transformed::yes(new_expr)); } match regex_syntax::Parser::new().parse(pattern) { @@ -82,10 +83,10 @@ pub fn simplify_regex_expr( if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION && let Some(expr) = lower_alt(&mode, &left, alts) { - return Ok(expr); + return Ok(Transformed::yes(expr)); } } else if let Some(expr) = lower_simple(&mode, &left, &hir) { - return Ok(expr); + return Ok(Transformed::yes(expr)); } } Err(e) => { @@ -98,7 +99,7 @@ pub fn simplify_regex_expr( } // Leave untouched if optimization didn't work - Ok(Expr::BinaryExpr(BinaryExpr { left, op, right })) + Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))) } #[derive(Debug)] @@ -107,6 +108,7 @@ struct OperatorMode { not: bool, /// Ignore case (`true` for case-insensitive). i: bool, + /// Data type of the pattern (e.g. Utf8, Utf8View, LargeUtf8) datatype: DataType, } From 155f8d428f23e9a0adc4e1584aa58c52fd5281ed Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Thu, 26 Feb 2026 14:24:12 -0800 Subject: [PATCH 06/10] Clean up and fmt --- .../simplify_expressions/expr_simplifier.rs | 3 +-- .../src/simplify_expressions/regex.rs | 20 +++++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index f1023a31209ef..56d3529248e9d 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1660,8 +1660,7 @@ impl TreeNodeRewriter for Simplifier<'_> { left, op: op @ (RegexMatch | RegexNotMatch | RegexIMatch | RegexNotIMatch), right, - // }) => Transformed::yes(simplify_regex_expr(left, op, right)?), - }) => simplify_regex_expr(left, op, right)?, + }) => simplify_regex_expr(left, op, right)?, // Rules for Like Expr::Like(like) => { diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index dfa7461557075..e6d4feac42c39 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -16,8 +16,8 @@ // under the License. use arrow::datatypes::DataType; -use datafusion_common::{DataFusionError, Result}; use datafusion_common::tree_node::Transformed; +use datafusion_common::{DataFusionError, Result}; use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit}; use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look}; @@ -50,10 +50,18 @@ pub fn simplify_regex_expr( ) -> Result> { // Check if the right operand is a string literal let Some((datatype, pattern_opt)) = as_string_scalar(&right) else { - return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))); + return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { + left, + op, + right, + }))); }; let Some(pattern_owned) = pattern_opt.as_ref() else { - return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))); + return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { + left, + op, + right, + }))); }; let pattern = pattern_owned.as_str(); @@ -99,7 +107,11 @@ pub fn simplify_regex_expr( } // Leave untouched if optimization didn't work - Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))) + Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { + left, + op, + right, + }))) } #[derive(Debug)] From edcdba723aeffed7624d8f8e10ad3d8aace34f7e Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Thu, 26 Feb 2026 17:23:51 -0800 Subject: [PATCH 07/10] empty commit to retrigger ci From 11b7cbac301c471baeea0130d99f6e318d417359 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 27 Feb 2026 10:51:01 -0800 Subject: [PATCH 08/10] Review Feedback: Replace as_string_scalar and to_string_scalar with StringScalar enum --- .../simplify_expressions/expr_simplifier.rs | 67 +++++++++++++----- .../src/simplify_expressions/regex.rs | 69 +++++++++++++------ 2 files changed, 95 insertions(+), 41 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 56d3529248e9d..4fe9d6ae55638 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1666,8 +1666,14 @@ impl TreeNodeRewriter for Simplifier<'_> { Expr::Like(like) => { // `\` is implicit escape, see https://github.com/apache/datafusion/issues/13291 let escape_char = like.escape_char.unwrap_or('\\'); - match as_string_scalar(&like.pattern) { - Some((data_type, pattern_str)) => { + + let pattern_scalar = match like.pattern.as_ref() { + Expr::Literal(scalar, _) => scalar, + _ => return Ok(Transformed::no(Expr::Like(like))), + }; + match StringScalar::try_from_scalar(pattern_scalar) { + Some(string_scalar) => { + let pattern_str = string_scalar.as_str(); match pattern_str { None => return Ok(Transformed::yes(lit_bool_null())), Some(pattern_str) if pattern_str == "%" => { @@ -1702,10 +1708,9 @@ impl TreeNodeRewriter for Simplifier<'_> { .replace_all(pattern_str, "%") .to_string(); Transformed::yes(Expr::Like(Like { - pattern: Box::new(to_string_scalar( - &data_type, - Some(simplified_pattern), - )), + pattern: Box::new( + string_scalar.to_scalar(&simplified_pattern), + ), ..like })) } @@ -2126,21 +2131,45 @@ fn is_literal_or_literal_cast(expr: &Expr) -> bool { } } -pub fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option)> { - match expr { - Expr::Literal(ScalarValue::Utf8(s), _) => Some((DataType::Utf8, s)), - Expr::Literal(ScalarValue::LargeUtf8(s), _) => Some((DataType::LargeUtf8, s)), - Expr::Literal(ScalarValue::Utf8View(s), _) => Some((DataType::Utf8View, s)), - _ => None, - } +/// Helper for working with string scalar values (Utf8, LargeUtf8, Utf8View) +pub(crate) enum StringScalar<'a> { + Utf8(&'a ScalarValue), + LargeUtf8(&'a ScalarValue), + Utf8View(&'a ScalarValue), } -pub fn to_string_scalar(data_type: &DataType, value: Option) -> Expr { - match data_type { - DataType::Utf8 => Expr::Literal(ScalarValue::Utf8(value), None), - DataType::LargeUtf8 => Expr::Literal(ScalarValue::LargeUtf8(value), None), - DataType::Utf8View => Expr::Literal(ScalarValue::Utf8View(value), None), - _ => unreachable!(), +impl<'a> StringScalar<'a> { + /// Create a `StringScalar` view from a `ScalarValue` if it is a supported string type. + /// Returns `None` if the scalar value is not a supported string type. + pub(crate) fn try_from_scalar(scalar: &'a ScalarValue) -> Option { + match scalar { + ScalarValue::Utf8(_) => Some(Self::Utf8(scalar)), + ScalarValue::LargeUtf8(_) => Some(Self::LargeUtf8(scalar)), + ScalarValue::Utf8View(_) => Some(Self::Utf8View(scalar)), + _ => None, + } + } + + /// Returns the underlying string slice. + pub(crate) fn as_str(&self) -> Option<&'a str> { + match self { + Self::Utf8(scalar) | Self::LargeUtf8(scalar) | Self::Utf8View(scalar) => { + scalar.try_as_str().flatten() + } + } + } + + /// Build a new `Expr` of the same string type with the given value. + pub(crate) fn to_scalar(&self, val: &str) -> Expr { + match self { + Self::Utf8(_) => Expr::Literal(ScalarValue::Utf8(Some(val.to_owned())), None), + Self::LargeUtf8(_) => { + Expr::Literal(ScalarValue::LargeUtf8(Some(val.to_owned())), None) + } + Self::Utf8View(_) => { + Expr::Literal(ScalarValue::Utf8View(Some(val.to_owned())), None) + } + } } } diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index e6d4feac42c39..714f387f607a9 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -15,13 +15,12 @@ // specific language governing permissions and limitations // under the License. -use arrow::datatypes::DataType; use datafusion_common::tree_node::Transformed; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit}; use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look}; -use crate::simplify_expressions::expr_simplifier::{as_string_scalar, to_string_scalar}; +use crate::simplify_expressions::expr_simplifier::StringScalar; /// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions. const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4; @@ -48,30 +47,39 @@ pub fn simplify_regex_expr( op: Operator, right: Box, ) -> Result> { + let right_scalar = match right.as_ref() { + Expr::Literal(scalar, _) => scalar, + _ => { + return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { + left, + op, + right, + }))); + } + }; // Check if the right operand is a string literal - let Some((datatype, pattern_opt)) = as_string_scalar(&right) else { + let Some(string_scalar) = StringScalar::try_from_scalar(right_scalar) else { return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right, }))); }; - let Some(pattern_owned) = pattern_opt.as_ref() else { + let pattern = string_scalar.as_str(); + let Some(pattern) = pattern else { return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right, }))); }; - let pattern = pattern_owned.as_str(); - - let mode = OperatorMode::new(&op, datatype.clone()); + let mode = OperatorMode::new(&op); // Handle the special case for ".*" pattern if pattern == ANY_CHAR_REGEX_PATTERN { let new_expr = if mode.not { // not empty - let empty_lit = Box::new(to_string_scalar(&datatype, Some("".to_string()))); + let empty_lit = Box::new(string_scalar.to_scalar("")); Expr::BinaryExpr(BinaryExpr { left, op: Operator::Eq, @@ -89,11 +97,11 @@ pub fn simplify_regex_expr( let kind = hir.kind(); if let HirKind::Alternation(alts) = kind { if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION - && let Some(expr) = lower_alt(&mode, &left, alts) + && let Some(expr) = lower_alt(&mode, &left, alts, &string_scalar) { return Ok(Transformed::yes(expr)); } - } else if let Some(expr) = lower_simple(&mode, &left, &hir) { + } else if let Some(expr) = lower_simple(&mode, &left, &hir, &string_scalar) { return Ok(Transformed::yes(expr)); } } @@ -120,12 +128,10 @@ struct OperatorMode { not: bool, /// Ignore case (`true` for case-insensitive). i: bool, - /// Data type of the pattern (e.g. Utf8, Utf8View, LargeUtf8) - datatype: DataType, } impl OperatorMode { - fn new(op: &Operator, datatype: DataType) -> Self { + fn new(op: &Operator) -> Self { let not = match op { Operator::RegexMatch | Operator::RegexIMatch => false, Operator::RegexNotMatch | Operator::RegexNotIMatch => true, @@ -138,15 +144,15 @@ impl OperatorMode { _ => unreachable!(), }; - Self { not, i, datatype } + Self { not, i } } /// Creates an [`LIKE`](Expr::Like) from the given `LIKE` pattern. - fn expr(&self, expr: Box, pattern: String) -> Expr { + fn expr(&self, expr: Box, pattern: Box) -> Expr { let like = Like { negated: self.not, expr, - pattern: Box::new(to_string_scalar(&self.datatype, Some(pattern))), + pattern, escape_char: None, case_insensitive: self.i, }; @@ -336,14 +342,25 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option> { } /// Tries to lower (transform) a simple regex pattern to a LIKE expression. -fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option { +fn lower_simple( + mode: &OperatorMode, + left: &Expr, + hir: &Hir, + string_scalar: &StringScalar, +) -> Option { match hir.kind() { HirKind::Empty => { - return Some(mode.expr(Box::new(left.clone()), "%".to_owned())); + return Some(mode.expr( + Box::new(left.clone()), + Box::new(string_scalar.to_scalar("%")), + )); } HirKind::Literal(l) => { let s = like_str_from_literal(l)?; - return Some(mode.expr(Box::new(left.clone()), format!("%{s}%"))); + return Some(mode.expr( + Box::new(left.clone()), + Box::new(string_scalar.to_scalar(&format!("%{s}%"))), + )); } HirKind::Concat(inner) if is_anchored_literal(inner) => { return anchored_literal_to_expr(inner).map(|right| { @@ -358,7 +375,10 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option { if let Some(pattern) = partial_anchored_literal_to_like(inner) .or_else(|| collect_concat_to_like_string(inner)) { - return Some(mode.expr(Box::new(left.clone()), pattern)); + return Some(mode.expr( + Box::new(left.clone()), + Box::new(string_scalar.to_scalar(&pattern)), + )); } } _ => {} @@ -369,11 +389,16 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option { /// Calls [`lower_simple`] for each alternative and combine the results with `or` or `and` /// based on [`OperatorMode`]. Any fail attempt to lower an alternative will makes this /// function to return `None`. -fn lower_alt(mode: &OperatorMode, left: &Expr, alts: &[Hir]) -> Option { +fn lower_alt( + mode: &OperatorMode, + left: &Expr, + alts: &[Hir], + string_scalar: &StringScalar, +) -> Option { let mut accu: Option = None; for part in alts { - if let Some(expr) = lower_simple(mode, left, part) { + if let Some(expr) = lower_simple(mode, left, part, string_scalar) { accu = match accu { Some(accu) => { if mode.not { From 94cdcc4b65c5d2dec074b1eefb22d6f0b248cda4 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 27 Feb 2026 11:04:23 -0800 Subject: [PATCH 09/10] Use new try_from_expr() constructor and rename to_scalar() to to_expr() --- .../simplify_expressions/expr_simplifier.rs | 21 +++++++++------ .../src/simplify_expressions/regex.rs | 27 ++++++------------- 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 4fe9d6ae55638..b6e7cb3994f3e 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1667,11 +1667,7 @@ impl TreeNodeRewriter for Simplifier<'_> { // `\` is implicit escape, see https://github.com/apache/datafusion/issues/13291 let escape_char = like.escape_char.unwrap_or('\\'); - let pattern_scalar = match like.pattern.as_ref() { - Expr::Literal(scalar, _) => scalar, - _ => return Ok(Transformed::no(Expr::Like(like))), - }; - match StringScalar::try_from_scalar(pattern_scalar) { + match StringScalar::try_from_expr(&like.pattern) { Some(string_scalar) => { let pattern_str = string_scalar.as_str(); match pattern_str { @@ -1709,7 +1705,7 @@ impl TreeNodeRewriter for Simplifier<'_> { .to_string(); Transformed::yes(Expr::Like(Like { pattern: Box::new( - string_scalar.to_scalar(&simplified_pattern), + string_scalar.to_expr(&simplified_pattern), ), ..like })) @@ -2139,9 +2135,18 @@ pub(crate) enum StringScalar<'a> { } impl<'a> StringScalar<'a> { + /// Create a `StringScalar` view from an `Expr` if it is a supported string literal. + /// Returns `None` if the expression is not a string literal. + pub(crate) fn try_from_expr(expr: &'a Expr) -> Option { + match expr { + Expr::Literal(scalar, _) => Self::try_from_scalar(scalar), + _ => None, + } + } + /// Create a `StringScalar` view from a `ScalarValue` if it is a supported string type. /// Returns `None` if the scalar value is not a supported string type. - pub(crate) fn try_from_scalar(scalar: &'a ScalarValue) -> Option { + fn try_from_scalar(scalar: &'a ScalarValue) -> Option { match scalar { ScalarValue::Utf8(_) => Some(Self::Utf8(scalar)), ScalarValue::LargeUtf8(_) => Some(Self::LargeUtf8(scalar)), @@ -2160,7 +2165,7 @@ impl<'a> StringScalar<'a> { } /// Build a new `Expr` of the same string type with the given value. - pub(crate) fn to_scalar(&self, val: &str) -> Expr { + pub(crate) fn to_expr(&self, val: &str) -> Expr { match self { Self::Utf8(_) => Expr::Literal(ScalarValue::Utf8(Some(val.to_owned())), None), Self::LargeUtf8(_) => { diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 714f387f607a9..6c2492d05404d 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -47,18 +47,8 @@ pub fn simplify_regex_expr( op: Operator, right: Box, ) -> Result> { - let right_scalar = match right.as_ref() { - Expr::Literal(scalar, _) => scalar, - _ => { - return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { - left, - op, - right, - }))); - } - }; - // Check if the right operand is a string literal - let Some(string_scalar) = StringScalar::try_from_scalar(right_scalar) else { + // Check if the right operand is a supported string literal + let Some(string_scalar) = StringScalar::try_from_expr(right.as_ref()) else { return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, @@ -79,7 +69,7 @@ pub fn simplify_regex_expr( if pattern == ANY_CHAR_REGEX_PATTERN { let new_expr = if mode.not { // not empty - let empty_lit = Box::new(string_scalar.to_scalar("")); + let empty_lit = Box::new(string_scalar.to_expr("")); Expr::BinaryExpr(BinaryExpr { left, op: Operator::Eq, @@ -350,16 +340,15 @@ fn lower_simple( ) -> Option { match hir.kind() { HirKind::Empty => { - return Some(mode.expr( - Box::new(left.clone()), - Box::new(string_scalar.to_scalar("%")), - )); + return Some( + mode.expr(Box::new(left.clone()), Box::new(string_scalar.to_expr("%"))), + ); } HirKind::Literal(l) => { let s = like_str_from_literal(l)?; return Some(mode.expr( Box::new(left.clone()), - Box::new(string_scalar.to_scalar(&format!("%{s}%"))), + Box::new(string_scalar.to_expr(&format!("%{s}%"))), )); } HirKind::Concat(inner) if is_anchored_literal(inner) => { @@ -377,7 +366,7 @@ fn lower_simple( { return Some(mode.expr( Box::new(left.clone()), - Box::new(string_scalar.to_scalar(&pattern)), + Box::new(string_scalar.to_expr(&pattern)), )); } } From ab79321626f2ad0d25617070318a149058fac43c Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 27 Feb 2026 11:16:31 -0800 Subject: [PATCH 10/10] Fix clippy --- .../optimizer/src/simplify_expressions/expr_simplifier.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index b6e7cb3994f3e..fe2e1a3b0408a 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1672,7 +1672,7 @@ impl TreeNodeRewriter for Simplifier<'_> { let pattern_str = string_scalar.as_str(); match pattern_str { None => return Ok(Transformed::yes(lit_bool_null())), - Some(pattern_str) if pattern_str == "%" => { + Some("%") => { // exp LIKE '%' is // - when exp is not NULL, it's true // - when exp is NULL, it's NULL