diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index baddb42261e40..a5485a0cfb30d 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -745,6 +745,7 @@ dependencies = [ "datafusion-physical-expr", "hashbrown 0.13.1", "log", + "regex-syntax", ] [[package]] diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index e3810ba300d27..d0f94a7ef5586 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -45,6 +45,7 @@ datafusion-expr = { path = "../expr", version = "15.0.0" } datafusion-physical-expr = { path = "../physical-expr", version = "15.0.0" } hashbrown = { version = "0.13", features = ["raw"] } log = "^0.4" +regex-syntax = "0.6.28" [dev-dependencies] ctor = "0.1.22" diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 3a51099fe6452..08c96168dc5df 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -18,7 +18,9 @@ //! Expression simplification API use super::utils::*; -use crate::type_coercion::TypeCoercionRewriter; +use crate::{ + simplify_expressions::regex::simplify_regex_expr, type_coercion::TypeCoercionRewriter, +}; use arrow::{ array::new_null_array, datatypes::{DataType, Field, Schema}, @@ -332,7 +334,10 @@ impl<'a, S> Simplifier<'a, S> { impl<'a, S: SimplifyInfo> ExprRewriter for Simplifier<'a, S> { /// rewrite the expression simplifying any constant expressions fn mutate(&mut self, expr: Expr) -> Result { - use datafusion_expr::Operator::{And, Divide, Eq, Modulo, Multiply, NotEq, Or}; + use datafusion_expr::Operator::{ + And, Divide, Eq, Modulo, Multiply, NotEq, Or, RegexIMatch, RegexMatch, + RegexNotIMatch, RegexNotMatch, + }; let info = self.info; let new_expr = match expr { @@ -779,10 +784,18 @@ impl<'a, S: SimplifyInfo> ExprRewriter for Simplifier<'a, S> { ) } } - expr => { - // no additional rewrites possible - expr - } + + // + // Rules for regexes + // + Expr::BinaryExpr(BinaryExpr { + left, + op: op @ (RegexMatch | RegexNotMatch | RegexIMatch | RegexNotIMatch), + right, + }) => simplify_regex_expr(left, op, right)?, + + // no additional rewrites possible + expr => expr, }; Ok(new_expr) } @@ -803,7 +816,7 @@ mod tests { datatypes::{DataType, Field, Schema}, }; use chrono::{DateTime, TimeZone, Utc}; - use datafusion_common::{cast::as_int32_array, DFField, ToDFSchema}; + use datafusion_common::{assert_contains, cast::as_int32_array, DFField, ToDFSchema}; use datafusion_expr::*; use datafusion_physical_expr::{ execution_props::ExecutionProps, functions::make_scalar_function, @@ -1576,17 +1589,168 @@ mod tests { assert_eq!(simplify(expr), expected) } + #[test] + fn test_simplify_regex() { + // malformed regex + assert_contains!( + try_simplify(regex_match(col("c1"), lit("foo{"))) + .unwrap_err() + .to_string(), + "regex parse error" + ); + + // unsupported cases + assert_no_change(regex_match(col("c1"), lit("foo.*"))); + assert_no_change(regex_match(col("c1"), lit("(foo)"))); + assert_no_change(regex_match(col("c1"), lit("^foo"))); + assert_no_change(regex_match(col("c1"), lit("foo$"))); + assert_no_change(regex_match(col("c1"), lit("%"))); + assert_no_change(regex_match(col("c1"), lit("_"))); + assert_no_change(regex_match(col("c1"), lit("f%o"))); + assert_no_change(regex_match(col("c1"), lit("f_o"))); + + // empty cases + assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%")); + assert_change( + regex_not_match(col("c1"), lit("")), + not_like(col("c1"), "%"), + ); + assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%")); + assert_change( + regex_not_imatch(col("c1"), lit("")), + not_ilike(col("c1"), "%"), + ); + + // single character + assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), "%x%")); + + // single word + assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), "%foo%")); + + // OR-chain + assert_change( + regex_match(col("c1"), lit("foo|bar|baz")), + like(col("c1"), "%foo%") + .or(like(col("c1"), "%bar%")) + .or(like(col("c1"), "%baz%")), + ); + assert_change( + regex_match(col("c1"), lit("foo|x|baz")), + like(col("c1"), "%foo%") + .or(like(col("c1"), "%x%")) + .or(like(col("c1"), "%baz%")), + ); + assert_change( + regex_match(col("c1"), lit("foo||baz")), + like(col("c1"), "%foo%") + .or(like(col("c1"), "%")) + .or(like(col("c1"), "%baz%")), + ); + assert_change( + regex_not_match(col("c1"), lit("foo|bar|baz")), + not_like(col("c1"), "%foo%") + .and(not_like(col("c1"), "%bar%")) + .and(not_like(col("c1"), "%baz%")), + ); + // Too many patterns (MAX_REGEX_ALTERNATIONS_EXPANSION) + assert_no_change(regex_match(col("c1"), lit("foo|bar|baz|blarg|bozo|etc"))); + } + + #[track_caller] + fn assert_no_change(expr: Expr) { + let optimized = simplify(expr.clone()); + assert_eq!(expr, optimized); + } + + #[track_caller] + fn assert_change(expr: Expr, expected: Expr) { + let optimized = simplify(expr); + assert_eq!(optimized, expected); + } + + fn regex_match(left: Expr, right: Expr) -> Expr { + Expr::BinaryExpr(BinaryExpr { + left: Box::new(left), + op: Operator::RegexMatch, + right: Box::new(right), + }) + } + + fn regex_not_match(left: Expr, right: Expr) -> Expr { + Expr::BinaryExpr(BinaryExpr { + left: Box::new(left), + op: Operator::RegexNotMatch, + right: Box::new(right), + }) + } + + fn regex_imatch(left: Expr, right: Expr) -> Expr { + Expr::BinaryExpr(BinaryExpr { + left: Box::new(left), + op: Operator::RegexIMatch, + right: Box::new(right), + }) + } + + fn regex_not_imatch(left: Expr, right: Expr) -> Expr { + Expr::BinaryExpr(BinaryExpr { + left: Box::new(left), + op: Operator::RegexNotIMatch, + right: Box::new(right), + }) + } + + fn like(expr: Expr, pattern: &str) -> Expr { + Expr::Like(Like { + negated: false, + expr: Box::new(expr), + pattern: Box::new(lit(pattern)), + escape_char: None, + }) + } + + fn not_like(expr: Expr, pattern: &str) -> Expr { + Expr::Like(Like { + negated: true, + expr: Box::new(expr), + pattern: Box::new(lit(pattern)), + escape_char: None, + }) + } + + fn ilike(expr: Expr, pattern: &str) -> Expr { + Expr::ILike(Like { + negated: false, + expr: Box::new(expr), + pattern: Box::new(lit(pattern)), + escape_char: None, + }) + } + + fn not_ilike(expr: Expr, pattern: &str) -> Expr { + Expr::ILike(Like { + negated: true, + expr: Box::new(expr), + pattern: Box::new(lit(pattern)), + escape_char: None, + }) + } + // ------------------------------ // ----- Simplifier tests ------- // ------------------------------ - fn simplify(expr: Expr) -> Expr { + fn try_simplify(expr: Expr) -> Result { let schema = expr_test_schema(); let execution_props = ExecutionProps::new(); let simplifier = ExprSimplifier::new( SimplifyContext::new(&execution_props).with_schema(schema), ); - simplifier.simplify(expr).unwrap() + simplifier.simplify(expr) + } + + fn simplify(expr: Expr) -> Expr { + try_simplify(expr).unwrap() } fn expr_test_schema() -> DFSchemaRef { diff --git a/datafusion/optimizer/src/simplify_expressions/mod.rs b/datafusion/optimizer/src/simplify_expressions/mod.rs index 0eb3359c692fb..975976e1f5205 100644 --- a/datafusion/optimizer/src/simplify_expressions/mod.rs +++ b/datafusion/optimizer/src/simplify_expressions/mod.rs @@ -17,6 +17,7 @@ pub mod context; pub mod expr_simplifier; +mod regex; pub mod simplify_exprs; mod utils; diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs new file mode 100644 index 0000000000000..038e1fcce9edf --- /dev/null +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion_common::{DataFusionError, ScalarValue}; +use datafusion_expr::{BinaryExpr, Expr, Like, Operator}; +use regex_syntax::hir::{Hir, HirKind, Literal}; + +/// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions. +const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4; + +pub fn simplify_regex_expr( + left: Box, + op: Operator, + right: Box, +) -> Result { + let mode = OperatorMode::new(&op); + + if let Expr::Literal(ScalarValue::Utf8(Some(pattern))) = right.as_ref() { + match regex_syntax::Parser::new().parse(pattern) { + Ok(hir) => { + let kind = hir.kind(); + + if let HirKind::Alternation(alts) = kind { + if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION { + if let Some(expr) = lower_alt(&mode, &left, alts) { + return Ok(expr); + } + } + } else if let Some(expr) = lower_simple(&mode, &left, &hir) { + return Ok(expr); + } + } + Err(e) => { + // error out early since the execution may fail anyways + return Err(DataFusionError::Context( + "Invalid regex".to_owned(), + Box::new(DataFusionError::External(Box::new(e))), + )); + } + } + } + + // leave untouched if optimization didn't work + Ok(Expr::BinaryExpr(BinaryExpr { left, op, right })) +} + +struct OperatorMode { + not: bool, + i: bool, +} + +impl OperatorMode { + fn new(op: &Operator) -> Self { + let not = match op { + Operator::RegexMatch | Operator::RegexIMatch => false, + Operator::RegexNotMatch | Operator::RegexNotIMatch => true, + _ => unreachable!(), + }; + + let i = match op { + Operator::RegexMatch | Operator::RegexNotMatch => false, + Operator::RegexIMatch | Operator::RegexNotIMatch => true, + _ => unreachable!(), + }; + + Self { not, i } + } + + fn expr(&self, expr: Box, pattern: String) -> Expr { + let like = Like { + negated: self.not, + expr, + pattern: Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern)))), + escape_char: None, + }; + + if self.i { + Expr::ILike(like) + } else { + Expr::Like(like) + } + } +} + +fn collect_concat_to_like_string(parts: &[Hir]) -> Option { + let mut s = String::with_capacity(parts.len() + 2); + s.push('%'); + + for sub in parts { + if let HirKind::Literal(Literal::Unicode(c)) = sub.kind() { + if !is_safe_for_like(*c) { + return None; + } + s.push(*c); + } else { + return None; + } + } + + s.push('%'); + Some(s) +} + +fn is_safe_for_like(c: char) -> bool { + (c != '%') && (c != '_') +} + +fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option { + match hir.kind() { + HirKind::Empty => { + return Some(mode.expr(Box::new(left.clone()), "%".to_owned())); + } + HirKind::Literal(Literal::Unicode(c)) if is_safe_for_like(*c) => { + return Some(mode.expr(Box::new(left.clone()), format!("%{c}%"))); + } + HirKind::Concat(inner) => { + if let Some(pattern) = collect_concat_to_like_string(inner) { + return Some(mode.expr(Box::new(left.clone()), pattern)); + } + } + _ => {} + } + + None +} + +fn lower_alt(mode: &OperatorMode, left: &Expr, alts: &[Hir]) -> Option { + let mut accu: Option = None; + + for part in alts { + if let Some(expr) = lower_simple(mode, left, part) { + accu = match accu { + Some(accu) => { + if mode.not { + Some(accu.and(expr)) + } else { + Some(accu.or(expr)) + } + } + None => Some(expr), + }; + } else { + return None; + } + } + + Some(accu.expect("at least two alts")) +}