-
Notifications
You must be signed in to change notification settings - Fork 1.9k
simplify regex expressions #4646
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,7 +18,9 @@ | |
| //! Expression simplification API | ||
|
|
||
| use super::utils::*; | ||
| use crate::type_coercion::TypeCoercionRewriter; | ||
| use crate::{ | ||
| simplify_expressions::regex::simplify_regex_expr, type_coercion::TypeCoercionRewriter, | ||
| }; | ||
| use arrow::{ | ||
| array::new_null_array, | ||
| datatypes::{DataType, Field, Schema}, | ||
|
|
@@ -332,7 +334,10 @@ impl<'a, S> Simplifier<'a, S> { | |
| impl<'a, S: SimplifyInfo> ExprRewriter for Simplifier<'a, S> { | ||
| /// rewrite the expression simplifying any constant expressions | ||
| fn mutate(&mut self, expr: Expr) -> Result<Expr> { | ||
| use datafusion_expr::Operator::{And, Divide, Eq, Modulo, Multiply, NotEq, Or}; | ||
| use datafusion_expr::Operator::{ | ||
| And, Divide, Eq, Modulo, Multiply, NotEq, Or, RegexIMatch, RegexMatch, | ||
| RegexNotIMatch, RegexNotMatch, | ||
| }; | ||
|
|
||
| let info = self.info; | ||
| let new_expr = match expr { | ||
|
|
@@ -779,10 +784,18 @@ impl<'a, S: SimplifyInfo> ExprRewriter for Simplifier<'a, S> { | |
| ) | ||
| } | ||
| } | ||
| expr => { | ||
| // no additional rewrites possible | ||
| expr | ||
| } | ||
|
|
||
| // | ||
| // Rules for regexes | ||
| // | ||
| Expr::BinaryExpr(BinaryExpr { | ||
| left, | ||
| op: op @ (RegexMatch | RegexNotMatch | RegexIMatch | RegexNotIMatch), | ||
| right, | ||
| }) => simplify_regex_expr(left, op, right)?, | ||
|
|
||
| // no additional rewrites possible | ||
| expr => expr, | ||
| }; | ||
| Ok(new_expr) | ||
| } | ||
|
|
@@ -803,7 +816,7 @@ mod tests { | |
| datatypes::{DataType, Field, Schema}, | ||
| }; | ||
| use chrono::{DateTime, TimeZone, Utc}; | ||
| use datafusion_common::{cast::as_int32_array, DFField, ToDFSchema}; | ||
| use datafusion_common::{assert_contains, cast::as_int32_array, DFField, ToDFSchema}; | ||
| use datafusion_expr::*; | ||
| use datafusion_physical_expr::{ | ||
| execution_props::ExecutionProps, functions::make_scalar_function, | ||
|
|
@@ -1576,17 +1589,168 @@ mod tests { | |
| assert_eq!(simplify(expr), expected) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_simplify_regex() { | ||
| // malformed regex | ||
| assert_contains!( | ||
| try_simplify(regex_match(col("c1"), lit("foo{"))) | ||
| .unwrap_err() | ||
| .to_string(), | ||
| "regex parse error" | ||
| ); | ||
|
|
||
| // unsupported cases | ||
| assert_no_change(regex_match(col("c1"), lit("foo.*"))); | ||
| assert_no_change(regex_match(col("c1"), lit("(foo)"))); | ||
| assert_no_change(regex_match(col("c1"), lit("^foo"))); | ||
| assert_no_change(regex_match(col("c1"), lit("foo$"))); | ||
| assert_no_change(regex_match(col("c1"), lit("%"))); | ||
| assert_no_change(regex_match(col("c1"), lit("_"))); | ||
| assert_no_change(regex_match(col("c1"), lit("f%o"))); | ||
| assert_no_change(regex_match(col("c1"), lit("f_o"))); | ||
|
|
||
| // empty cases | ||
| assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%")); | ||
| assert_change( | ||
| regex_not_match(col("c1"), lit("")), | ||
| not_like(col("c1"), "%"), | ||
| ); | ||
| assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%")); | ||
| assert_change( | ||
| regex_not_imatch(col("c1"), lit("")), | ||
| not_ilike(col("c1"), "%"), | ||
| ); | ||
|
|
||
| // single character | ||
| assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), "%x%")); | ||
|
|
||
| // single word | ||
| assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), "%foo%")); | ||
|
|
||
| // OR-chain | ||
| assert_change( | ||
| regex_match(col("c1"), lit("foo|bar|baz")), | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it might be better to convert this to an I remember a discussion about
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you can express it as an IN that will be fastest, but there isn't a IN LIKE AFAIK (although we could implement such a thing)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can extend the rewrite logic at one point to spot fully anchored regex expressions (i.e. the ones that span the whole string, not any substring). In this case we could emit EQ/NEQ/IN. |
||
| like(col("c1"), "%foo%") | ||
| .or(like(col("c1"), "%bar%")) | ||
| .or(like(col("c1"), "%baz%")), | ||
| ); | ||
| assert_change( | ||
| regex_match(col("c1"), lit("foo|x|baz")), | ||
| like(col("c1"), "%foo%") | ||
| .or(like(col("c1"), "%x%")) | ||
| .or(like(col("c1"), "%baz%")), | ||
| ); | ||
| assert_change( | ||
| regex_match(col("c1"), lit("foo||baz")), | ||
| like(col("c1"), "%foo%") | ||
| .or(like(col("c1"), "%")) | ||
| .or(like(col("c1"), "%baz%")), | ||
| ); | ||
| assert_change( | ||
| regex_not_match(col("c1"), lit("foo|bar|baz")), | ||
| not_like(col("c1"), "%foo%") | ||
| .and(not_like(col("c1"), "%bar%")) | ||
| .and(not_like(col("c1"), "%baz%")), | ||
| ); | ||
| // Too many patterns (MAX_REGEX_ALTERNATIONS_EXPANSION) | ||
| assert_no_change(regex_match(col("c1"), lit("foo|bar|baz|blarg|bozo|etc"))); | ||
| } | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't see any tests here for
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added |
||
| #[track_caller] | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| fn assert_no_change(expr: Expr) { | ||
| let optimized = simplify(expr.clone()); | ||
| assert_eq!(expr, optimized); | ||
| } | ||
|
|
||
| #[track_caller] | ||
| fn assert_change(expr: Expr, expected: Expr) { | ||
| let optimized = simplify(expr); | ||
| assert_eq!(optimized, expected); | ||
| } | ||
|
|
||
| fn regex_match(left: Expr, right: Expr) -> Expr { | ||
| Expr::BinaryExpr(BinaryExpr { | ||
| left: Box::new(left), | ||
| op: Operator::RegexMatch, | ||
| right: Box::new(right), | ||
| }) | ||
| } | ||
|
|
||
| fn regex_not_match(left: Expr, right: Expr) -> Expr { | ||
| Expr::BinaryExpr(BinaryExpr { | ||
| left: Box::new(left), | ||
| op: Operator::RegexNotMatch, | ||
| right: Box::new(right), | ||
| }) | ||
| } | ||
|
|
||
| fn regex_imatch(left: Expr, right: Expr) -> Expr { | ||
| Expr::BinaryExpr(BinaryExpr { | ||
| left: Box::new(left), | ||
| op: Operator::RegexIMatch, | ||
| right: Box::new(right), | ||
| }) | ||
| } | ||
|
|
||
| fn regex_not_imatch(left: Expr, right: Expr) -> Expr { | ||
| Expr::BinaryExpr(BinaryExpr { | ||
| left: Box::new(left), | ||
| op: Operator::RegexNotIMatch, | ||
| right: Box::new(right), | ||
| }) | ||
| } | ||
|
|
||
| fn like(expr: Expr, pattern: &str) -> Expr { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think there are already functions like this : https://docs.rs/datafusion/15.0.0/datafusion/prelude/enum.Expr.html#method.like
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤔 but it turns out that those make
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Filed #4765 which I think is fairly serious |
||
| Expr::Like(Like { | ||
| negated: false, | ||
| expr: Box::new(expr), | ||
| pattern: Box::new(lit(pattern)), | ||
| escape_char: None, | ||
| }) | ||
| } | ||
|
|
||
| fn not_like(expr: Expr, pattern: &str) -> Expr { | ||
| Expr::Like(Like { | ||
| negated: true, | ||
| expr: Box::new(expr), | ||
| pattern: Box::new(lit(pattern)), | ||
| escape_char: None, | ||
| }) | ||
| } | ||
|
|
||
| fn ilike(expr: Expr, pattern: &str) -> Expr { | ||
| Expr::ILike(Like { | ||
| negated: false, | ||
| expr: Box::new(expr), | ||
| pattern: Box::new(lit(pattern)), | ||
| escape_char: None, | ||
| }) | ||
| } | ||
|
|
||
| fn not_ilike(expr: Expr, pattern: &str) -> Expr { | ||
| Expr::ILike(Like { | ||
| negated: true, | ||
| expr: Box::new(expr), | ||
| pattern: Box::new(lit(pattern)), | ||
| escape_char: None, | ||
| }) | ||
| } | ||
|
|
||
| // ------------------------------ | ||
| // ----- Simplifier tests ------- | ||
| // ------------------------------ | ||
|
|
||
| fn simplify(expr: Expr) -> Expr { | ||
| fn try_simplify(expr: Expr) -> Result<Expr> { | ||
| let schema = expr_test_schema(); | ||
| let execution_props = ExecutionProps::new(); | ||
| let simplifier = ExprSimplifier::new( | ||
| SimplifyContext::new(&execution_props).with_schema(schema), | ||
| ); | ||
| simplifier.simplify(expr).unwrap() | ||
| simplifier.simplify(expr) | ||
| } | ||
|
|
||
| fn simplify(expr: Expr) -> Expr { | ||
| try_simplify(expr).unwrap() | ||
| } | ||
|
|
||
| fn expr_test_schema() -> DFSchemaRef { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ | |
|
|
||
| pub mod context; | ||
| pub mod expr_simplifier; | ||
| mod regex; | ||
| pub mod simplify_exprs; | ||
| mod utils; | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is technically not a new dependency (as datafusion-physical-expr already depends on
regexwhich depends onregex-syntax). However, regex is an optional dependency whereas this is not optional.I think it would be better to move this dependency into
datafusion-physical-exprso regex can still be optional. I will try to do so over the next few days