Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions datafusion/optimizer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ datafusion-expr = { path = "../expr", version = "15.0.0" }
datafusion-physical-expr = { path = "../physical-expr", version = "15.0.0" }
hashbrown = { version = "0.13", features = ["raw"] }
log = "^0.4"
regex-syntax = "0.6.28"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is technically not a new dependency (as datafusion-physical-expr already depends on regex which depends on regex-syntax). However, regex is an optional dependency whereas this is not optional.

I think it would be better to move this dependency into datafusion-physical-expr so regex can still be optional. I will try to do so over the next few days


[dev-dependencies]
ctor = "0.1.22"
Expand Down
182 changes: 173 additions & 9 deletions datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
//! Expression simplification API

use super::utils::*;
use crate::type_coercion::TypeCoercionRewriter;
use crate::{
simplify_expressions::regex::simplify_regex_expr, type_coercion::TypeCoercionRewriter,
};
use arrow::{
array::new_null_array,
datatypes::{DataType, Field, Schema},
Expand Down Expand Up @@ -332,7 +334,10 @@ impl<'a, S> Simplifier<'a, S> {
impl<'a, S: SimplifyInfo> ExprRewriter for Simplifier<'a, S> {
/// rewrite the expression simplifying any constant expressions
fn mutate(&mut self, expr: Expr) -> Result<Expr> {
use datafusion_expr::Operator::{And, Divide, Eq, Modulo, Multiply, NotEq, Or};
use datafusion_expr::Operator::{
And, Divide, Eq, Modulo, Multiply, NotEq, Or, RegexIMatch, RegexMatch,
RegexNotIMatch, RegexNotMatch,
};

let info = self.info;
let new_expr = match expr {
Expand Down Expand Up @@ -779,10 +784,18 @@ impl<'a, S: SimplifyInfo> ExprRewriter for Simplifier<'a, S> {
)
}
}
expr => {
// no additional rewrites possible
expr
}

//
// Rules for regexes
//
Expr::BinaryExpr(BinaryExpr {
left,
op: op @ (RegexMatch | RegexNotMatch | RegexIMatch | RegexNotIMatch),
right,
}) => simplify_regex_expr(left, op, right)?,

// no additional rewrites possible
expr => expr,
};
Ok(new_expr)
}
Expand All @@ -803,7 +816,7 @@ mod tests {
datatypes::{DataType, Field, Schema},
};
use chrono::{DateTime, TimeZone, Utc};
use datafusion_common::{cast::as_int32_array, DFField, ToDFSchema};
use datafusion_common::{assert_contains, cast::as_int32_array, DFField, ToDFSchema};
use datafusion_expr::*;
use datafusion_physical_expr::{
execution_props::ExecutionProps, functions::make_scalar_function,
Expand Down Expand Up @@ -1576,17 +1589,168 @@ mod tests {
assert_eq!(simplify(expr), expected)
}

#[test]
fn test_simplify_regex() {
// malformed regex
assert_contains!(
try_simplify(regex_match(col("c1"), lit("foo{")))
.unwrap_err()
.to_string(),
"regex parse error"
);

// unsupported cases
assert_no_change(regex_match(col("c1"), lit("foo.*")));
assert_no_change(regex_match(col("c1"), lit("(foo)")));
assert_no_change(regex_match(col("c1"), lit("^foo")));
assert_no_change(regex_match(col("c1"), lit("foo$")));
assert_no_change(regex_match(col("c1"), lit("%")));
assert_no_change(regex_match(col("c1"), lit("_")));
assert_no_change(regex_match(col("c1"), lit("f%o")));
assert_no_change(regex_match(col("c1"), lit("f_o")));

// empty cases
assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
assert_change(
regex_not_match(col("c1"), lit("")),
not_like(col("c1"), "%"),
);
assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
assert_change(
regex_not_imatch(col("c1"), lit("")),
not_ilike(col("c1"), "%"),
);

// single character
assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), "%x%"));

// single word
assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), "%foo%"));

// OR-chain
assert_change(
regex_match(col("c1"), lit("foo|bar|baz")),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it might be better to convert this to an IN <list> which has a bunch of optimizations https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/expressions/in_list.rs

I remember a discussion about IN <list> vs OR chains -- @tustvold do you have any opinion on which is better in this case?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you can express it as an IN that will be fastest, but there isn't a IN LIKE AFAIK (although we could implement such a thing)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can extend the rewrite logic at one point to spot fully anchored regex expressions (i.e. the ones that span the whole string, not any substring). In this case we could emit EQ/NEQ/IN.

like(col("c1"), "%foo%")
.or(like(col("c1"), "%bar%"))
.or(like(col("c1"), "%baz%")),
);
assert_change(
regex_match(col("c1"), lit("foo|x|baz")),
like(col("c1"), "%foo%")
.or(like(col("c1"), "%x%"))
.or(like(col("c1"), "%baz%")),
);
assert_change(
regex_match(col("c1"), lit("foo||baz")),
like(col("c1"), "%foo%")
.or(like(col("c1"), "%"))
.or(like(col("c1"), "%baz%")),
);
assert_change(
regex_not_match(col("c1"), lit("foo|bar|baz")),
not_like(col("c1"), "%foo%")
.and(not_like(col("c1"), "%bar%"))
.and(not_like(col("c1"), "%baz%")),
);
// Too many patterns (MAX_REGEX_ALTERNATIONS_EXPANSION)
assert_no_change(regex_match(col("c1"), lit("foo|bar|baz|blarg|bozo|etc")));
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't see any tests here for MAX_REGEX_ALTERNATIONS_EXPENSION -- maybe we could add one more check for foo|bar|baz|blarg|bozo|etc or something to show it wasn't rewritten

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added

#[track_caller]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fn assert_no_change(expr: Expr) {
let optimized = simplify(expr.clone());
assert_eq!(expr, optimized);
}

#[track_caller]
fn assert_change(expr: Expr, expected: Expr) {
let optimized = simplify(expr);
assert_eq!(optimized, expected);
}

fn regex_match(left: Expr, right: Expr) -> Expr {
Expr::BinaryExpr(BinaryExpr {
left: Box::new(left),
op: Operator::RegexMatch,
right: Box::new(right),
})
}

fn regex_not_match(left: Expr, right: Expr) -> Expr {
Expr::BinaryExpr(BinaryExpr {
left: Box::new(left),
op: Operator::RegexNotMatch,
right: Box::new(right),
})
}

fn regex_imatch(left: Expr, right: Expr) -> Expr {
Expr::BinaryExpr(BinaryExpr {
left: Box::new(left),
op: Operator::RegexIMatch,
right: Box::new(right),
})
}

fn regex_not_imatch(left: Expr, right: Expr) -> Expr {
Expr::BinaryExpr(BinaryExpr {
left: Box::new(left),
op: Operator::RegexNotIMatch,
right: Box::new(right),
})
}

fn like(expr: Expr, pattern: &str) -> Expr {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤔 but it turns out that those make Expr::Like rather than BinaryOperator::Like 🤔 -- I am not sure why there is duplicate

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filed #4765 which I think is fairly serious

Expr::Like(Like {
negated: false,
expr: Box::new(expr),
pattern: Box::new(lit(pattern)),
escape_char: None,
})
}

fn not_like(expr: Expr, pattern: &str) -> Expr {
Expr::Like(Like {
negated: true,
expr: Box::new(expr),
pattern: Box::new(lit(pattern)),
escape_char: None,
})
}

fn ilike(expr: Expr, pattern: &str) -> Expr {
Expr::ILike(Like {
negated: false,
expr: Box::new(expr),
pattern: Box::new(lit(pattern)),
escape_char: None,
})
}

fn not_ilike(expr: Expr, pattern: &str) -> Expr {
Expr::ILike(Like {
negated: true,
expr: Box::new(expr),
pattern: Box::new(lit(pattern)),
escape_char: None,
})
}

// ------------------------------
// ----- Simplifier tests -------
// ------------------------------

fn simplify(expr: Expr) -> Expr {
fn try_simplify(expr: Expr) -> Result<Expr> {
let schema = expr_test_schema();
let execution_props = ExecutionProps::new();
let simplifier = ExprSimplifier::new(
SimplifyContext::new(&execution_props).with_schema(schema),
);
simplifier.simplify(expr).unwrap()
simplifier.simplify(expr)
}

fn simplify(expr: Expr) -> Expr {
try_simplify(expr).unwrap()
}

fn expr_test_schema() -> DFSchemaRef {
Expand Down
1 change: 1 addition & 0 deletions datafusion/optimizer/src/simplify_expressions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

pub mod context;
pub mod expr_simplifier;
mod regex;
pub mod simplify_exprs;
mod utils;

Expand Down
Loading