diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 6c99f18ab0f64..0b47cdee212f2 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -22,6 +22,8 @@ use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look}; /// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions. const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4; +const ANY_CHAR_REGEX_PATTERN: &str = ".*"; + /// Tries to convert a regexp expression to a `LIKE` or `Eq`/`NotEq` expression. /// /// This function also validates the regex pattern. And will return error if the @@ -33,6 +35,8 @@ const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4; /// - full anchored regex patterns (e.g. `^foo$`) to `= 'foo'` /// - partial anchored regex patterns (e.g. `^foo`) to `LIKE 'foo%'` /// - combinations (alternatives) of the above, will be concatenated with `OR` or `AND` +/// - a regex match against the match-anything pattern `.*` to `IS NOT NULL` +/// - a regex not-match against `.*` to `= ''` /// /// Dev note: unit tests of this function are in `expr_simplifier.rs`, case `test_simplify_regex`. 
pub fn simplify_regex_expr( @@ -43,6 +47,23 @@ pub fn simplify_regex_expr( let mode = OperatorMode::new(&op); if let Expr::Literal(ScalarValue::Utf8(Some(pattern))) = right.as_ref() { + // Handle the special case for ".*" pattern + if pattern == ANY_CHAR_REGEX_PATTERN { + let new_expr = if mode.not { + // not-match: rewritten to `left = ''` (NOTE(review): '' itself matches ".*" - confirm this rewrite is equivalent) + let empty_lit = Box::new(lit("")); + Expr::BinaryExpr(BinaryExpr { + left, + op: Operator::Eq, + right: empty_lit, + }) + } else { + // match: rewritten to `left IS NOT NULL` + left.is_not_null() + }; + return Ok(new_expr); + } + match regex_syntax::Parser::new().parse(pattern) { Ok(hir) => { let kind = hir.kind(); diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index 709d8f79c3d98..e33869ca2b636 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -765,4 +765,51 @@ mod tests { assert_optimized_plan_eq(plan, expected) } + + #[test] + fn test_simplify_regex_special_cases() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Utf8, false), + ]); + let table_scan = table_scan(Some("test"), &schema, None)?.build()?; + + // Test `~ ".*"` on the nullable column `a` transforms to `IS NOT NULL` + let plan = LogicalPlanBuilder::from(table_scan.clone()) + .filter(binary_expr(col("a"), Operator::RegexMatch, lit(".*")))? + .build()?; + let expected = "Filter: test.a IS NOT NULL\ + \n TableScan: test"; + + assert_optimized_plan_eq(plan, expected)?; + + // Test `!~ ".*"` transforms to checking if the column is empty + let plan = LogicalPlanBuilder::from(table_scan.clone()) + .filter(binary_expr(col("a"), Operator::RegexNotMatch, lit(".*")))? 
+ .build()?; + let expected = "Filter: test.a = Utf8(\"\")\ + \n TableScan: test"; + + assert_optimized_plan_eq(plan, expected)?; + + // Test case-insensitive versions + + // Test `RegexIMatch` (case-insensitive) with ".*" on the non-nullable column `b` simplifies to `true` + let plan = LogicalPlanBuilder::from(table_scan.clone()) + .filter(binary_expr(col("b"), Operator::RegexIMatch, lit(".*")))? + .build()?; + let expected = "Filter: Boolean(true)\ + \n TableScan: test"; + + assert_optimized_plan_eq(plan, expected)?; + + // Test `RegexNotIMatch` (case-insensitive) with ".*" transforms to checking if the column is empty + let plan = LogicalPlanBuilder::from(table_scan.clone()) + .filter(binary_expr(col("a"), Operator::RegexNotIMatch, lit(".*")))? + .build()?; + let expected = "Filter: test.a = Utf8(\"\")\ + \n TableScan: test"; + + assert_optimized_plan_eq(plan, expected) + } } diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index a2f1657e4085a..aa14faf984e40 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -1869,5 +1869,5 @@ select *, count(*) over() as ta from t; 3 4 1 4 -query -drop table t; \ No newline at end of file +statement count 0 +drop table t; diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt b/datafusion/sqllogictest/test_files/simplify_expr.slt index d10e603ea5f33..43193fb41cfad 100644 --- a/datafusion/sqllogictest/test_files/simplify_expr.slt +++ b/datafusion/sqllogictest/test_files/simplify_expr.slt @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. 
-statement count 0 -create table t(a int) as values (1); +statement ok +create table t(a int, b string) as values (1, 'a'), (2, NULL), (NULL, 'c'); # test between simplification query TT @@ -30,5 +30,38 @@ physical_plan 02)--FilterExec: a@0 = 3 03)----DataSourceExec: partitions=1, partition_sizes=[1] -statement count 0 +# test regex exprs +query TT +explain select b from t where b ~ '.*' +---- +logical_plan +01)Filter: t.b IS NOT NULL +02)--TableScan: t projection=[b] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--FilterExec: b@0 IS NOT NULL +03)----DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select b from t where b !~ '.*' +---- +logical_plan +01)Filter: t.b = Utf8("") +02)--TableScan: t projection=[b] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--FilterExec: b@0 = +03)----DataSourceExec: partitions=1, partition_sizes=[1] + +query T +select b from t where b ~ '.*' +---- +a +c + +query T +select b from t where b !~ '.*' +---- + +statement ok drop table t; diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index 1cf648510bd55..9e9a40c510efb 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -898,13 +898,13 @@ statement ok CREATE VIEW v1 AS SELECT y FROM u1 UNION ALL SELECT y FROM u2 ORDER BY y; -query I +query I rowsort SELECT * FROM (SELECT y FROM u1 UNION ALL SELECT y FROM u2) ORDER BY y; ---- 1 +20 3 3 -20 40 query TT