From 1062d5c8e77291bd7ae2245b2f701c12d4d27310 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sat, 5 Jun 2021 11:57:56 +0200 Subject: [PATCH 1/2] add expr::like and expr::notlike to pruning logic --- datafusion/src/physical_optimizer/pruning.rs | 96 +++++++++++++++++++- 1 file changed, 94 insertions(+), 2 deletions(-) diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index c65733bd75267..0e43e4eda554b 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -42,6 +42,7 @@ use crate::{ logical_plan::{Expr, Operator}, optimizer::utils, physical_plan::{planner::DefaultPhysicalPlanner, ColumnarValue, PhysicalExpr}, + scalar::ScalarValue, }; /// Interface to pass statistics information to [`PruningPredicates`] @@ -548,7 +549,7 @@ fn build_predicate_expression( // allow partial failure in predicate expression generation // this can still produce a useful predicate when multiple conditions are joined using AND Err(_) => { - return Ok(logical_plan::lit(true)); + return Ok(unhandled); } }; let corrected_op = expr_builder.correct_operator(op); @@ -586,8 +587,45 @@ fn build_predicate_expression( .min_column_expr()? .lt_eq(expr_builder.scalar_expr().clone()) } + Operator::Like => { + match &**right { + // If the literal is a 'starts_with' + Expr::Literal(ScalarValue::Utf8(Some(string))) + if !string.starts_with('%') => + { + let scalar_expr = + Expr::Literal(ScalarValue::Utf8(Some(string.replace('%', "")))); + // Behaves like Eq + let min_column_expr = expr_builder.min_column_expr()?; + let max_column_expr = expr_builder.max_column_expr()?; + min_column_expr + .lt_eq(scalar_expr.clone()) + .and(scalar_expr.lt_eq(max_column_expr)) + } + _ => unhandled, + } + } + Operator::NotLike => { + match &**right { + // If the literal is a 'starts_with' + Expr::Literal(ScalarValue::Utf8(Some(string))) + if !string.starts_with('%') => + { + let scalar_expr = + Expr::Literal(ScalarValue::Utf8(Some(string.replace('%', "")))); + // Behaves like Eq + let min_column_expr = expr_builder.min_column_expr()?; + let max_column_expr = expr_builder.max_column_expr()?; + // Inverse of Like + min_column_expr + .gt_eq(scalar_expr.clone()) + .and(scalar_expr.gt_eq(max_column_expr)) + } + _ => unhandled, + } + } // other expressions are not supported - _ => logical_plan::lit(true), + _ => unhandled, }; Ok(statistics_expr) } @@ -1095,6 +1133,60 @@ mod tests { Ok(()) } + #[test] + fn row_group_predicate_starts_with() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]); + // test LIKE operator that is converted to a 'starts_with' + let expr = col("c1").like(lit("Banana%")); + let expected_expr = + "#c1_min LtEq Utf8(\"Banana\") And Utf8(\"Banana\") LtEq #c1_max"; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_like() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]); + // test LIKE operator that can't be converted to a 'starts_with' + let expr = col("c1").like(lit("%Banana%")); + let expected_expr = "Boolean(true)"; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_not_starts_with() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]); + // test LIKE operator that can't be converted to a 'starts_with' + let expr = col("c1").not().like(lit("Banana%")); + let expected_expr = + "NOT #c1_min LtEq Utf8(\"Banana\") And Utf8(\"Banana\") LtEq NOT #c1_max"; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_not_like() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]); + // test LIKE operator that can't be converted to a 'starts_with' + let expr = col("c1").not().like(lit("%Banana%")); + let expected_expr = "Boolean(true)"; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + #[test] fn row_group_predicate_required_columns() -> Result<()> { let schema = Schema::new(vec![ From 1ee63ddb552f96cc3fbad9a6e13b3f3191f42973 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Mon, 7 Jun 2021 05:26:18 +0200 Subject: [PATCH 2/2] address review feedback --- datafusion/src/physical_optimizer/pruning.rs | 22 +++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index 0e43e4eda554b..6d87031d7c023 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -593,8 +593,9 @@ fn build_predicate_expression( Expr::Literal(ScalarValue::Utf8(Some(string))) if !string.starts_with('%') => { - let scalar_expr = - Expr::Literal(ScalarValue::Utf8(Some(string.replace('%', "")))); + // Split the string to get the first part before '%' + let split = string.split('%').next().unwrap().to_string(); + let scalar_expr = Expr::Literal(ScalarValue::Utf8(Some(split))); // Behaves like Eq let min_column_expr = expr_builder.min_column_expr()?; let max_column_expr = expr_builder.max_column_expr()?; @@ -611,8 +612,9 @@ fn build_predicate_expression( Expr::Literal(ScalarValue::Utf8(Some(string))) if !string.starts_with('%') => { - let scalar_expr = - Expr::Literal(ScalarValue::Utf8(Some(string.replace('%', "")))); + // Split the string to get the first part before '%' + let split = string.split('%').next().unwrap().to_string(); + let scalar_expr = Expr::Literal(ScalarValue::Utf8(Some(split))); // Behaves like Eq let min_column_expr = expr_builder.min_column_expr()?; let max_column_expr = expr_builder.max_column_expr()?; @@ -1137,7 +1139,7 @@ mod tests { fn row_group_predicate_starts_with() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]); // test LIKE operator that is converted to a 'starts_with' - let expr = col("c1").like(lit("Banana%")); + let expr = col("c1").like(lit("Banana%lemon")); let expected_expr = "#c1_min LtEq Utf8(\"Banana\") And Utf8(\"Banana\") LtEq #c1_max"; let predicate_expr = @@ -1163,10 +1165,10 @@ mod tests { #[test] fn row_group_predicate_not_starts_with() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]); - // test LIKE operator that can't be converted to a 'starts_with' - let expr = col("c1").not().like(lit("Banana%")); + // test NOT LIKE operator that is converted to a 'starts_with' + let expr = col("c1").not_like(lit("Banana%Lemon")); let expected_expr = - "NOT #c1_min LtEq Utf8(\"Banana\") And Utf8(\"Banana\") LtEq NOT #c1_max"; + "#c1_min GtEq Utf8(\"Banana\") And Utf8(\"Banana\") GtEq #c1_max"; let predicate_expr = build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; assert_eq!(format!("{:?}", predicate_expr), expected_expr); @@ -1177,8 +1179,8 @@ mod tests { #[test] fn row_group_predicate_not_like() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]); - // test LIKE operator that can't be converted to a 'starts_with' - let expr = col("c1").not().like(lit("%Banana%")); + // test NOT LIKE operator that can't be converted to a 'starts_with' + let expr = col("c1").not_like(lit("%Banana%")); let expected_expr = "Boolean(true)"; let predicate_expr = build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?;