From d3b1618ef69e9957db6687c5756d6bd213dc7648 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 30 May 2023 17:46:02 +0800 Subject: [PATCH 01/14] draft --- datafusion-examples/examples/rewrite_expr.rs | 6 +- .../simplify_expressions/expr_simplifier.rs | 6 +- .../src/simplify_expressions/regex.rs | 78 ++++++++++++++++++- .../simplify_expressions/simplify_exprs.rs | 7 +- 4 files changed, 89 insertions(+), 8 deletions(-) diff --git a/datafusion-examples/examples/rewrite_expr.rs b/datafusion-examples/examples/rewrite_expr.rs index 2777781eb98db..9a7e73abf6027 100644 --- a/datafusion-examples/examples/rewrite_expr.rs +++ b/datafusion-examples/examples/rewrite_expr.rs @@ -35,7 +35,8 @@ use std::sync::Arc; pub fn main() -> Result<()> { // produce a logical plan using the datafusion-sql crate let dialect = PostgreSqlDialect {}; - let sql = "SELECT * FROM person WHERE age BETWEEN 21 AND 32"; + // let sql = "SELECT * FROM person WHERE age BETWEEN 21 AND 32"; + let sql = "select * from person where name ~ '1|2';"; let statements = Parser::parse_sql(&dialect, sql)?; // produce a logical plan using the datafusion-sql crate @@ -136,8 +137,10 @@ impl OptimizerRule for MyOptimizerRule { ) -> Result> { // recurse down and optimize children first let optimized_plan = utils::optimize_children(self, plan, config)?; + dbg!(&optimized_plan); match optimized_plan { Some(LogicalPlan::Filter(filter)) => { + dbg!(&filter.predicate); let predicate = my_rewrite(filter.predicate.clone())?; Ok(Some(LogicalPlan::Filter(Filter::try_new( predicate, @@ -147,6 +150,7 @@ impl OptimizerRule for MyOptimizerRule { Some(optimized_plan) => Ok(Some(optimized_plan)), None => match plan { LogicalPlan::Filter(filter) => { + println!("{:?}", filter.predicate); let predicate = my_rewrite(filter.predicate.clone())?; Ok(Some(LogicalPlan::Filter(Filter::try_new( predicate, diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 75f50aa3c576d..8a440bd52470e 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2401,7 +2401,11 @@ mod tests { let expected = concat(&[col("c0"), lit("hello rust"), col("c1")]); assert_eq!(simplify(expr), expected) } - + #[test] + fn test_my_simplify_regex() { + assert_no_change(regex_match(col("c1"), lit("^(foo|bar)$"))); + // assert_no_change(regex_match(col("c1"), lit("^$"))); + } #[test] fn test_simplify_regex() { // malformed regex diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index ca298abcfa00d..04693e95d9a81 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -16,8 +16,10 @@ // under the License. use datafusion_common::{DataFusionError, Result, ScalarValue}; -use datafusion_expr::{lit, BinaryExpr, Expr, Like, Operator}; -use regex_syntax::hir::{Hir, HirKind, Literal, Look}; +use datafusion_expr::{lit, BinaryExpr, Expr, Like, Operator, or}; +use regex_syntax::hir::{Hir, HirKind, Literal, Look, Capture}; + +use crate::utils::disjunction; /// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions. const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4; @@ -33,7 +35,7 @@ pub fn simplify_regex_expr( match regex_syntax::Parser::new().parse(pattern) { Ok(hir) => { let kind = hir.kind(); - + println!("{:?}", kind); if let HirKind::Alternation(alts) = kind { if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION { if let Some(expr) = lower_alt(&mode, &left, alts) { @@ -166,6 +168,32 @@ fn is_anchored_literal(v: &[Hir]) -> bool { .all(|h| matches!(h.kind(), HirKind::Literal(_))) } +/// returns true if the elements in a `Concat` pattern are: +/// - `[Look::Start, Capture(Alternation), Look::End]` +fn is_anchored_capture(v: &[Hir]) -> bool { + match v.len() { + 2..=3 => (), + _ => return false, + }; + + let first_last = ( + v.first().expect("length checked"), + v.last().expect("length checked"), + ); + if !matches!(first_last, + (s, e) if s.kind() == &HirKind::Look(Look::Start) + && e.kind() == &HirKind::Look(Look::End) + ) + { + return false; + } + + v.iter() + .skip(1) + .take(v.len() - 2) + .all(|h| matches!(h.kind(), HirKind::Capture(_))) +} + /// extracts a string literal expression assuming that [`is_anchored_literal`] /// returned true. fn anchored_literal_to_expr(v: &[Hir]) -> Option { @@ -179,6 +207,41 @@ fn anchored_literal_to_expr(v: &[Hir]) -> Option { } } +fn anchored_alternation_to_expr(v: &[Hir]) -> Option { + match v.len() { + 2 => Some(lit("")), + 3 => { + println!("{:?}", v); + if let HirKind::Capture(cap,..) = v[1].kind(){ + if let Capture { sub,.. } = cap { + if let HirKind::Alternation(alters) = sub.kind() { + let literals : Vec<_> = alters.iter().map(|l| if let HirKind::Literal(l) = l.kind(){ + str_from_literal(l).map(lit) + } else { + None + }).collect(); + + if literals.iter().any(|l| l.is_none()) { + return None; + }; + + return disjunction(literals.into_iter().map(|v|v.unwrap())); + + + } else { + return None; + } + }; + + return None; + }else { + return None; + } + } + _ => None, + } +} + fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option { match hir.kind() { HirKind::Empty => { @@ -194,6 +257,12 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option { mode.expr_matches_literal(Box::new(left.clone()), Box::new(right)), ); } + HirKind::Concat(inner) if is_anchored_capture(inner) => { + let right = anchored_alternation_to_expr(inner)?; + return Some( + mode.expr_matches_literal(Box::new(left.clone()), Box::new(right)), + ); + } HirKind::Concat(inner) => { if let Some(pattern) = collect_concat_to_like_string(inner) { return Some(mode.expr(Box::new(left.clone()), pattern)); @@ -202,7 +271,8 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option { _ => {} } - None + + left.in_list(right, false); None } fn lower_alt(mode: &OperatorMode, left: &Expr, alts: &[Hir]) -> Option { diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index 42850178136e5..ae0d8f187c6cd 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -52,6 +52,7 @@ impl OptimizerRule for SimplifyExpressions { plan: &LogicalPlan, config: &dyn OptimizerConfig, ) -> Result> { + println!("try_optimize: {:?}",plan); let mut execution_props = ExecutionProps::new(); execution_props.query_execution_start_time = config.query_execution_start_time(); Ok(Some(Self::optimize_internal(plan, &execution_props)?)) @@ -63,6 +64,7 @@ impl SimplifyExpressions { plan: &LogicalPlan, execution_props: &ExecutionProps, ) -> Result { + println!("now plan: {:?}", plan); let schema = if !plan.inputs().is_empty() { DFSchemaRef::new(merge_schema(plan.inputs())) } else if let LogicalPlan::TableScan(_) = plan { @@ -91,8 +93,9 @@ impl SimplifyExpressions { let name = &e.display_name(); // Apply the actual simplification logic + dbg!(&e); let new_e = simplifier.simplify(e)?; - + dbg!(&new_e); let new_name = &new_e.display_name(); if let (Ok(expr_name), Ok(new_expr_name)) = (name, new_name) { @@ -193,7 +196,7 @@ mod tests { .project(vec![col("a")])? .filter(and(col("b").gt(lit(1)), col("b").gt(lit(1))))? .build()?; - + println!("{:?}", plan); assert_optimized_plan_eq( &plan, "\ From 2ed7e500de7821f7b6b5b4506ffcf25f653477df Mon Sep 17 00:00:00 2001 From: jiacai2050 Date: Tue, 30 May 2023 19:24:56 +0800 Subject: [PATCH 02/14] finish anchored capture match --- .../simplify_expressions/expr_simplifier.rs | 10 +-- .../src/simplify_expressions/regex.rs | 86 +++++++++---------- 2 files changed, 46 insertions(+), 50 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 8a440bd52470e..9aaf27420f457 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2401,11 +2401,7 @@ mod tests { let expected = concat(&[col("c0"), lit("hello rust"), col("c1")]); assert_eq!(simplify(expr), expected) } - #[test] - fn test_my_simplify_regex() { - assert_no_change(regex_match(col("c1"), lit("^(foo|bar)$"))); - // assert_no_change(regex_match(col("c1"), lit("^$"))); - } + #[test] fn test_simplify_regex() { // malformed regex @@ -2452,6 +2448,10 @@ mod tests { regex_not_match(col("c1"), lit("^foo$")), col("c1").not_eq(lit("foo")), ); + assert_change( + regex_match(col("c1"), lit("^(foo|bar)$")), + col("c1").eq(lit("bar")).or(col("c1").eq(lit("foo"))), + ); assert_no_change(regex_match(col("c1"), lit("^foo|bar$"))); assert_no_change(regex_match(col("c1"), lit("^(foo)(bar)$"))); assert_no_change(regex_match(col("c1"), lit("^"))); diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 04693e95d9a81..977b6ff59c119 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -16,10 +16,8 @@ // under the License. use datafusion_common::{DataFusionError, Result, ScalarValue}; -use datafusion_expr::{lit, BinaryExpr, Expr, Like, Operator, or}; -use regex_syntax::hir::{Hir, HirKind, Literal, Look, Capture}; - -use crate::utils::disjunction; +use datafusion_expr::{lit, BinaryExpr, Expr, Like, Operator}; +use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look}; /// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions. const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4; @@ -35,7 +33,6 @@ pub fn simplify_regex_expr( match regex_syntax::Parser::new().parse(pattern) { Ok(hir) => { let kind = hir.kind(); - println!("{:?}", kind); if let HirKind::Alternation(alts) = kind { if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION { if let Some(expr) = lower_alt(&mode, &left, alts) { @@ -171,10 +168,9 @@ fn is_anchored_literal(v: &[Hir]) -> bool { /// returns true if the elements in a `Concat` pattern are: /// - `[Look::Start, Capture(Alternation), Look::End]` fn is_anchored_capture(v: &[Hir]) -> bool { - match v.len() { - 2..=3 => (), - _ => return false, - }; + if 3 != v.len() { + return false; + } let first_last = ( v.first().expect("length checked"), @@ -188,10 +184,19 @@ fn is_anchored_capture(v: &[Hir]) -> bool { return false; } - v.iter() - .skip(1) - .take(v.len() - 2) - .all(|h| matches!(h.kind(), HirKind::Capture(_))) + if let HirKind::Capture(cap, ..) = v[1].kind() { + let Capture { sub, .. } = cap; + if let HirKind::Alternation(alters) = sub.kind() { + let has_non_literal = alters + .iter() + .any(|v| !matches!(v.kind(), &HirKind::Literal(_))); + if has_non_literal { + return false; + } + } + } + + true } /// extracts a string literal expression assuming that [`is_anchored_literal`] @@ -207,39 +212,34 @@ fn anchored_literal_to_expr(v: &[Hir]) -> Option { } } -fn anchored_alternation_to_expr(v: &[Hir]) -> Option { - match v.len() { - 2 => Some(lit("")), - 3 => { - println!("{:?}", v); - if let HirKind::Capture(cap,..) = v[1].kind(){ - if let Capture { sub,.. } = cap { - if let HirKind::Alternation(alters) = sub.kind() { - let literals : Vec<_> = alters.iter().map(|l| if let HirKind::Literal(l) = l.kind(){ - str_from_literal(l).map(lit) - } else { - None - }).collect(); - - if literals.iter().any(|l| l.is_none()) { - return None; - }; - - return disjunction(literals.into_iter().map(|v|v.unwrap())); - +fn anchored_alternation_to_expr(v: &[Hir]) -> Option> { + if 3 != v.len() { + return None; + } + if let HirKind::Capture(cap, ..) = v[1].kind() { + let Capture { sub, .. } = cap; + if let HirKind::Alternation(alters) = sub.kind() { + let literals: Vec<_> = alters + .iter() + .map(|l| { + if let HirKind::Literal(l) = l.kind() { + str_from_literal(l).map(lit) } else { - return None; + None } - }; + }) + .collect(); + if literals.iter().any(|l| l.is_none()) { return None; - }else { - return None; - } + }; + + return Some(literals.into_iter().map(|v| v.unwrap()).collect()); } - _ => None, } + + return None; } fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option { @@ -259,9 +259,7 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option { } HirKind::Concat(inner) if is_anchored_capture(inner) => { let right = anchored_alternation_to_expr(inner)?; - return Some( - mode.expr_matches_literal(Box::new(left.clone()), Box::new(right)), - ); + return Some(left.clone().in_list(right, false)); } HirKind::Concat(inner) => { if let Some(pattern) = collect_concat_to_like_string(inner) { @@ -270,9 +268,7 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option { } _ => {} } - - - left.in_list(right, false); None + None } fn lower_alt(mode: &OperatorMode, left: &Expr, alts: &[Hir]) -> Option { From f0c4ec4e8a5912467e2dd822ffdf5112ecd9eeb1 Mon Sep 17 00:00:00 2001 From: jiacai2050 Date: Tue, 30 May 2023 21:05:30 +0800 Subject: [PATCH 03/14] add testcase for regex_not_match --- .../simplify_expressions/expr_simplifier.rs | 6 +++ .../src/simplify_expressions/regex.rs | 41 +++++++++---------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 9aaf27420f457..2ecd4706a54e4 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2452,6 +2452,12 @@ mod tests { regex_match(col("c1"), lit("^(foo|bar)$")), col("c1").eq(lit("bar")).or(col("c1").eq(lit("foo"))), ); + assert_change( + regex_not_match(col("c1"), lit("^(foo|bar)$")), + col("c1") + .not_eq(lit("bar")) + .and(col("c1").not_eq(lit("foo"))), + ); assert_no_change(regex_match(col("c1"), lit("^foo|bar$"))); assert_no_change(regex_match(col("c1"), lit("^(foo)(bar)$"))); assert_no_change(regex_match(col("c1"), lit("^"))); diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 977b6ff59c119..299e48d3ad6c7 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -166,7 +166,7 @@ fn is_anchored_literal(v: &[Hir]) -> bool { } /// returns true if the elements in a `Concat` pattern are: -/// - `[Look::Start, Capture(Alternation), Look::End]` +/// - `[Look::Start, Capture(Alternation(Literals...)), Look::End]` fn is_anchored_capture(v: &[Hir]) -> bool { if 3 != v.len() { return false; @@ -212,7 +212,7 @@ fn anchored_literal_to_expr(v: &[Hir]) -> Option { } } -fn anchored_alternation_to_expr(v: &[Hir]) -> Option> { +fn anchored_alternation_to_exprs(v: &[Hir]) -> Option> { if 3 != v.len() { return None; } @@ -220,22 +220,22 @@ fn anchored_alternation_to_expr(v: &[Hir]) -> Option> { if let HirKind::Capture(cap, ..) = v[1].kind() { let Capture { sub, .. } = cap; if let HirKind::Alternation(alters) = sub.kind() { - let literals: Vec<_> = alters - .iter() - .map(|l| { - if let HirKind::Literal(l) = l.kind() { - str_from_literal(l).map(lit) - } else { - None + let mut literals = Vec::with_capacity(alters.len()); + for hir in alters { + let mut is_safe = false; + if let HirKind::Literal(l) = hir.kind() { + if let Some(safe_literal) = str_from_literal(l).map(lit) { + literals.push(safe_literal); + is_safe = true; } - }) - .collect(); + } - if literals.iter().any(|l| l.is_none()) { - return None; - }; + if !is_safe { + return None; + } + } - return Some(literals.into_iter().map(|v| v.unwrap()).collect()); + return Some(literals); } } @@ -252,14 +252,13 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option { return Some(mode.expr(Box::new(left.clone()), format!("%{s}%"))); } HirKind::Concat(inner) if is_anchored_literal(inner) => { - let right = anchored_literal_to_expr(inner)?; - return Some( - mode.expr_matches_literal(Box::new(left.clone()), Box::new(right)), - ); + return anchored_literal_to_expr(inner).map(|right| { + mode.expr_matches_literal(Box::new(left.clone()), Box::new(right)) + }); } HirKind::Concat(inner) if is_anchored_capture(inner) => { - let right = anchored_alternation_to_expr(inner)?; - return Some(left.clone().in_list(right, false)); + return anchored_alternation_to_exprs(inner) + .map(|right| left.clone().in_list(right, mode.not)); } HirKind::Concat(inner) => { if let Some(pattern) = collect_concat_to_like_string(inner) { From c0787e18b2935d99b2e8d047c2775c7b59829dde Mon Sep 17 00:00:00 2001 From: tanruixiang <819464715@qq.com> Date: Tue, 30 May 2023 21:28:15 +0800 Subject: [PATCH 04/14] resotre --- datafusion-examples/examples/rewrite_expr.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion-examples/examples/rewrite_expr.rs b/datafusion-examples/examples/rewrite_expr.rs index 9a7e73abf6027..a20bcf74c5439 100644 --- a/datafusion-examples/examples/rewrite_expr.rs +++ b/datafusion-examples/examples/rewrite_expr.rs @@ -35,8 +35,7 @@ use std::sync::Arc; pub fn main() -> Result<()> { // produce a logical plan using the datafusion-sql crate let dialect = PostgreSqlDialect {}; - // let sql = "SELECT * FROM person WHERE age BETWEEN 21 AND 32"; - let sql = "select * from person where name ~ '1|2';"; + let sql = "SELECT * FROM person WHERE age BETWEEN 21 AND 32"; let statements = Parser::parse_sql(&dialect, sql)?; // produce a logical plan using the datafusion-sql crate From 05b530e6f0e0c9f455a0b1f632a3110339d9e948 Mon Sep 17 00:00:00 2001 From: tanruixiang <819464715@qq.com> Date: Tue, 30 May 2023 21:31:06 +0800 Subject: [PATCH 05/14] resotre --- datafusion-examples/examples/rewrite_expr.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/datafusion-examples/examples/rewrite_expr.rs b/datafusion-examples/examples/rewrite_expr.rs index a20bcf74c5439..2777781eb98db 100644 --- a/datafusion-examples/examples/rewrite_expr.rs +++ b/datafusion-examples/examples/rewrite_expr.rs @@ -136,10 +136,8 @@ impl OptimizerRule for MyOptimizerRule { ) -> Result> { // recurse down and optimize children first let optimized_plan = utils::optimize_children(self, plan, config)?; - dbg!(&optimized_plan); match optimized_plan { Some(LogicalPlan::Filter(filter)) => { - dbg!(&filter.predicate); let predicate = my_rewrite(filter.predicate.clone())?; Ok(Some(LogicalPlan::Filter(Filter::try_new( predicate, @@ -149,7 +147,6 @@ impl OptimizerRule for MyOptimizerRule { Some(optimized_plan) => Ok(Some(optimized_plan)), None => match plan { LogicalPlan::Filter(filter) => { - println!("{:?}", filter.predicate); let predicate = my_rewrite(filter.predicate.clone())?; Ok(Some(LogicalPlan::Filter(Filter::try_new( predicate, From 9e0d562b2abcd6a047223c65c9e362d0c8abd9f6 Mon Sep 17 00:00:00 2001 From: tanruixiang <819464715@qq.com> Date: Tue, 30 May 2023 21:33:49 +0800 Subject: [PATCH 06/14] remove debug print --- .../optimizer/src/simplify_expressions/simplify_exprs.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index ae0d8f187c6cd..895708e74dae8 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -52,7 +52,6 @@ impl OptimizerRule for SimplifyExpressions { plan: &LogicalPlan, config: &dyn OptimizerConfig, ) -> Result> { - println!("try_optimize: {:?}",plan); let mut execution_props = ExecutionProps::new(); execution_props.query_execution_start_time = config.query_execution_start_time(); Ok(Some(Self::optimize_internal(plan, &execution_props)?)) @@ -64,7 +63,6 @@ impl SimplifyExpressions { plan: &LogicalPlan, execution_props: &ExecutionProps, ) -> Result { - println!("now plan: {:?}", plan); let schema = if !plan.inputs().is_empty() { DFSchemaRef::new(merge_schema(plan.inputs())) } else if let LogicalPlan::TableScan(_) = plan { @@ -93,9 +91,7 @@ impl SimplifyExpressions { let name = &e.display_name(); // Apply the actual simplification logic - dbg!(&e); let new_e = simplifier.simplify(e)?; - dbg!(&new_e); let new_name = &new_e.display_name(); if let (Ok(expr_name), Ok(new_expr_name)) = (name, new_name) { @@ -196,7 +192,6 @@ mod tests { .project(vec![col("a")])? .filter(and(col("b").gt(lit(1)), col("b").gt(lit(1))))? .build()?; - println!("{:?}", plan); assert_optimized_plan_eq( &plan, "\ From 290364e9a362d303b05214745ddf801720a6f8b1 Mon Sep 17 00:00:00 2001 From: tanruixiang <819464715@qq.com> Date: Tue, 30 May 2023 21:42:35 +0800 Subject: [PATCH 07/14] delete debug print --- datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index 895708e74dae8..42850178136e5 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -92,6 +92,7 @@ impl SimplifyExpressions { // Apply the actual simplification logic let new_e = simplifier.simplify(e)?; + let new_name = &new_e.display_name(); if let (Ok(expr_name), Ok(new_expr_name)) = (name, new_name) { @@ -192,6 +193,7 @@ mod tests { .project(vec![col("a")])? .filter(and(col("b").gt(lit(1)), col("b").gt(lit(1))))? .build()?; + assert_optimized_plan_eq( &plan, "\ From b58afec2849668f53ab61c2dc352616f054a3f9b Mon Sep 17 00:00:00 2001 From: tanruixiang <819464715@qq.com> Date: Wed, 31 May 2023 00:40:02 +0800 Subject: [PATCH 08/14] add testcase --- .../src/simplify_expressions/expr_simplifier.rs | 14 ++++++++++++++ .../optimizer/src/simplify_expressions/regex.rs | 8 ++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 2ecd4706a54e4..39cdb7258f58d 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2458,6 +2458,20 @@ mod tests { .not_eq(lit("bar")) .and(col("c1").not_eq(lit("foo"))), ); + assert_change( + regex_match(col("c1"), lit("^(foo)$")), + col("c1").eq(lit("foo")), + ); + assert_change( + regex_match(col("c1"), lit("^(foo|bar|baz)$")), + col("c1") + .eq(lit("baz")) + .or(col("c1").eq(lit("bar"))) + .or(col("c1").eq(lit("foo"))), + ); + assert_no_change(regex_match(col("c1"), lit("(foo|bar)"))); + assert_no_change(regex_match(col("c1"), lit("(foo|bar)*"))); + assert_no_change(regex_match(col("c1"), lit("^(foo|bar)*"))); assert_no_change(regex_match(col("c1"), lit("^foo|bar$"))); assert_no_change(regex_match(col("c1"), lit("^(foo)(bar)$"))); assert_no_change(regex_match(col("c1"), lit("^"))); diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 299e48d3ad6c7..e9c4497b07ee6 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -236,10 +236,14 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option> { } return Some(literals); + } else if let HirKind::Literal(l) = sub.kind() { + if let Some(safe_literal) = str_from_literal(l).map(lit) { + return Some(vec![safe_literal]); + } + return None; } } - - return None; + None } fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option { From dd342d02fc15137c36569b5e09c787fc7443b590 Mon Sep 17 00:00:00 2001 From: jiacai2050 Date: Wed, 31 May 2023 10:51:28 +0800 Subject: [PATCH 09/14] fix test --- .../optimizer/src/simplify_expressions/expr_simplifier.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 39cdb7258f58d..b1eaf0a77d05e 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2448,6 +2448,8 @@ mod tests { regex_not_match(col("c1"), lit("^foo$")), col("c1").not_eq(lit("foo")), ); + + // regular expressions that match exact captured literals assert_change( regex_match(col("c1"), lit("^(foo|bar)$")), col("c1").eq(lit("bar")).or(col("c1").eq(lit("foo"))), @@ -2466,9 +2468,10 @@ mod tests { regex_match(col("c1"), lit("^(foo|bar|baz)$")), col("c1") .eq(lit("baz")) - .or(col("c1").eq(lit("bar"))) - .or(col("c1").eq(lit("foo"))), + .or((col("c1").eq(lit("bar"))).or(col("c1").eq(lit("foo")))), ); + + // regular expressions that mismatch captured literals assert_no_change(regex_match(col("c1"), lit("(foo|bar)"))); assert_no_change(regex_match(col("c1"), lit("(foo|bar)*"))); assert_no_change(regex_match(col("c1"), lit("^(foo|bar)*"))); From 8a4ad78c5a8e66aa16e8fa01478b068ca0e57bee Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 31 May 2023 11:06:04 +0800 Subject: [PATCH 10/14] add testcase --- .../optimizer/src/simplify_expressions/expr_simplifier.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index b1eaf0a77d05e..26160d3b927cf 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2470,6 +2470,11 @@ mod tests { .eq(lit("baz")) .or((col("c1").eq(lit("bar"))).or(col("c1").eq(lit("foo")))), ); + assert_change( + regex_match(col("c1"), lit("^(foo|bar|baz|qux)$")), + col("c1") + .in_list(vec![lit("foo"), lit("bar"), lit("baz"), lit("qux")], false), + ); // regular expressions that mismatch captured literals assert_no_change(regex_match(col("c1"), lit("(foo|bar)"))); From 29f433383934110178b2a2902785233dbc590ffe Mon Sep 17 00:00:00 2001 From: jakevin Date: Thu, 1 Jun 2023 15:28:38 +0800 Subject: [PATCH 11/14] simplify code --- .../optimizer/src/simplify_expressions/regex.rs | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index e9c4497b07ee6..77c41c12858b2 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -168,18 +168,9 @@ fn is_anchored_literal(v: &[Hir]) -> bool { /// returns true if the elements in a `Concat` pattern are: /// - `[Look::Start, Capture(Alternation(Literals...)), Look::End]` fn is_anchored_capture(v: &[Hir]) -> bool { - if 3 != v.len() { - return false; - } - - let first_last = ( - v.first().expect("length checked"), - v.last().expect("length checked"), - ); - if !matches!(first_last, - (s, e) if s.kind() == &HirKind::Look(Look::Start) - && e.kind() == &HirKind::Look(Look::End) - ) + if v.len() != 3 + || !matches!((v.first().unwrap().kind(), v.last().unwrap().kind()), + (&HirKind::Look(Look::Start), &HirKind::Look(Look::End))) { return false; } From 2756bc6815fb7a5627d473a35a555167b8396085 Mon Sep 17 00:00:00 2001 From: jakevin Date: Thu, 1 Jun 2023 15:30:23 +0800 Subject: [PATCH 12/14] simplify code --- datafusion/optimizer/src/simplify_expressions/regex.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 77c41c12858b2..108f1774b42c0 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -169,8 +169,10 @@ fn is_anchored_literal(v: &[Hir]) -> bool { /// - `[Look::Start, Capture(Alternation(Literals...)), Look::End]` fn is_anchored_capture(v: &[Hir]) -> bool { if v.len() != 3 - || !matches!((v.first().unwrap().kind(), v.last().unwrap().kind()), - (&HirKind::Look(Look::Start), &HirKind::Look(Look::End))) + || !matches!( + (v.first().unwrap().kind(), v.last().unwrap().kind()), + (&HirKind::Look(Look::Start), &HirKind::Look(Look::End)) + ) { return false; } From 1d17854955728257caa511f42ce059bc840da0de Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Thu, 1 Jun 2023 16:21:57 +0800 Subject: [PATCH 13/14] update testcase --- .../src/simplify_expressions/expr_simplifier.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 27a2d67057f48..b25a96e7652bf 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2482,13 +2482,13 @@ mod tests { // regular expressions that match exact captured literals assert_change( regex_match(col("c1"), lit("^(foo|bar)$")), - col("c1").eq(lit("bar")).or(col("c1").eq(lit("foo"))), + col("c1").eq(lit("foo")).or(col("c1").eq(lit("bar"))), ); assert_change( regex_not_match(col("c1"), lit("^(foo|bar)$")), col("c1") - .not_eq(lit("bar")) - .and(col("c1").not_eq(lit("foo"))), + .not_eq(lit("foo")) + .and(col("c1").not_eq(lit("bar"))), ); assert_change( regex_match(col("c1"), lit("^(foo)$")), @@ -2496,9 +2496,7 @@ mod tests { ); assert_change( regex_match(col("c1"), lit("^(foo|bar|baz)$")), - col("c1") - .eq(lit("baz")) - .or((col("c1").eq(lit("bar"))).or(col("c1").eq(lit("foo")))), + ((col("c1").eq(lit("foo"))).or(col("c1").eq(lit("bar")))).or(col("c1").eq(lit("baz"))), ); assert_change( regex_match(col("c1"), lit("^(foo|bar|baz|qux)$")), From bcd79b4dd80c3fe42a797462dae8ad6da8678b1c Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Thu, 1 Jun 2023 16:24:57 +0800 Subject: [PATCH 14/14] fmt --- .../optimizer/src/simplify_expressions/expr_simplifier.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index b25a96e7652bf..8aebae18c1ae9 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2496,7 +2496,8 @@ mod tests { ); assert_change( regex_match(col("c1"), lit("^(foo|bar|baz)$")), - ((col("c1").eq(lit("foo"))).or(col("c1").eq(lit("bar")))).or(col("c1").eq(lit("baz"))), + ((col("c1").eq(lit("foo"))).or(col("c1").eq(lit("bar")))) + .or(col("c1").eq(lit("baz"))), ); assert_change( regex_match(col("c1"), lit("^(foo|bar|baz|qux)$")),