diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index c65733bd75267..6d87031d7c023 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -42,6 +42,7 @@ use crate::{ logical_plan::{Expr, Operator}, optimizer::utils, physical_plan::{planner::DefaultPhysicalPlanner, ColumnarValue, PhysicalExpr}, + scalar::ScalarValue, }; /// Interface to pass statistics information to [`PruningPredicates`] @@ -548,7 +549,7 @@ fn build_predicate_expression( // allow partial failure in predicate expression generation // this can still produce a useful predicate when multiple conditions are joined using AND Err(_) => { - return Ok(logical_plan::lit(true)); + return Ok(unhandled); } }; let corrected_op = expr_builder.correct_operator(op); @@ -586,8 +587,47 @@ fn build_predicate_expression( .min_column_expr()? .lt_eq(expr_builder.scalar_expr().clone()) } + Operator::Like => { + match &**right { + // If the literal is a 'starts_with' + Expr::Literal(ScalarValue::Utf8(Some(string))) + if !string.starts_with('%') => + { + // Split the string to get the first part before '%' + let split = string.split('%').next().unwrap().to_string(); + let scalar_expr = Expr::Literal(ScalarValue::Utf8(Some(split))); + // Behaves like Eq + let min_column_expr = expr_builder.min_column_expr()?; + let max_column_expr = expr_builder.max_column_expr()?; + min_column_expr + .lt_eq(scalar_expr.clone()) + .and(scalar_expr.lt_eq(max_column_expr)) + } + _ => unhandled, + } + } + Operator::NotLike => { + match &**right { + // If the literal is a 'starts_with' + Expr::Literal(ScalarValue::Utf8(Some(string))) + if !string.starts_with('%') => + { + // Split the string to get the first part before '%' + let split = string.split('%').next().unwrap().to_string(); + let scalar_expr = Expr::Literal(ScalarValue::Utf8(Some(split))); + // Behaves like Eq + let min_column_expr = expr_builder.min_column_expr()?; + let max_column_expr = expr_builder.max_column_expr()?; + // Inverse of Like + min_column_expr + .gt_eq(scalar_expr.clone()) + .and(scalar_expr.gt_eq(max_column_expr)) + } + _ => unhandled, + } + } // other expressions are not supported - _ => logical_plan::lit(true), + _ => unhandled, }; Ok(statistics_expr) } @@ -1095,6 +1135,60 @@ mod tests { Ok(()) } + #[test] + fn row_group_predicate_starts_with() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]); + // test LIKE operator that is converted to a 'starts_with' + let expr = col("c1").like(lit("Banana%lemon")); + let expected_expr = + "#c1_min LtEq Utf8(\"Banana\") And Utf8(\"Banana\") LtEq #c1_max"; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_like() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]); + // test LIKE operator that can't be converted to a 'starts_with' + let expr = col("c1").like(lit("%Banana%")); + let expected_expr = "Boolean(true)"; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_not_starts_with() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]); + // test NOT LIKE operator that is converted to a 'starts_with' + let expr = col("c1").not_like(lit("Banana%Lemon")); + let expected_expr = + "#c1_min GtEq Utf8(\"Banana\") And Utf8(\"Banana\") GtEq #c1_max"; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_not_like() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]); + // test NOT LIKE operator that can't be converted to a 'starts_with' + let expr = col("c1").not_like(lit("%Banana%")); + let expected_expr = "Boolean(true)"; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + #[test] fn row_group_predicate_required_columns() -> Result<()> { let schema = Schema::new(vec![