From b411c90936fba3427808277603b4649ea55e1028 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 22 Jan 2024 21:16:30 +0800 Subject: [PATCH 01/12] simplified always true or false expression Signed-off-by: jayzhan211 --- .../simplify_expressions/expr_simplifier.rs | 6 +- .../simplify_expressions/inlist_simplifier.rs | 90 +++++++++++++++++++ .../optimizer/src/simplify_expressions/mod.rs | 1 + .../sqllogictest/test_files/predicates.slt | 20 +++++ 4 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 3ba343003e33b..a0fb1b1aaac30 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -19,8 +19,10 @@ use std::ops::Not; -use super::or_in_list_simplifier::OrInListSimplifier; use super::utils::*; +use super::{ + inlist_simplifier::InListSimplifier, or_in_list_simplifier::OrInListSimplifier, +}; use crate::analyzer::type_coercion::TypeCoercionRewriter; use crate::simplify_expressions::guarantees::GuaranteeRewriter; use crate::simplify_expressions::regex::simplify_regex_expr; @@ -133,6 +135,7 @@ impl ExprSimplifier { let mut simplifier = Simplifier::new(&self.info); let mut const_evaluator = ConstEvaluator::try_new(self.info.execution_props())?; let mut or_in_list_simplifier = OrInListSimplifier::new(); + let mut boolean_simplifier = InListSimplifier::new(); let mut guarantee_rewriter = GuaranteeRewriter::new(&self.guarantees); // TODO iterate until no changes are made during rewrite @@ -142,6 +145,7 @@ impl ExprSimplifier { expr.rewrite(&mut const_evaluator)? .rewrite(&mut simplifier)? .rewrite(&mut or_in_list_simplifier)? + .rewrite(&mut boolean_simplifier)? .rewrite(&mut guarantee_rewriter)? // run both passes twice to try an minimize simplifications that we missed .rewrite(&mut const_evaluator)? diff --git a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs new file mode 100644 index 0000000000000..f4fc55eb91183 --- /dev/null +++ b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! This module implements a rule that simplifies expressions that is guaranteed to be true or false at planning time + +use std::borrow::Cow; +use std::collections::HashSet; + +use datafusion_common::tree_node::TreeNodeRewriter; +use datafusion_common::Result; +use datafusion_expr::expr::InList; +use datafusion_expr::{lit, BinaryExpr, Expr, Operator}; + +/// Simplify expressions that is guaranteed to be true or false to a literal boolean expression +/// +/// Rules: +/// 1. `a in (1,2,3) AND a in (4,5) -> Intersection` +// 2. `a in (1,2,3) OR a in (1,2,3) -> a in (1,2,3)` +pub(super) struct InListSimplifier {} + +impl InListSimplifier { + pub(super) fn new() -> Self { + Self {} + } +} + +impl TreeNodeRewriter for InListSimplifier { + type N = Expr; + + fn mutate(&mut self, expr: Expr) -> Result { + if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = &expr { + match (left.as_ref(), op, right.as_ref()) { + (Expr::InList(l1), Operator::And, Expr::InList(l2)) => { + if l1.expr == l2.expr && !l1.negated && !l2.negated { + let l1_set: HashSet = l1.list.iter().cloned().collect(); + let intersect_list: Vec = l2 + .list + .iter() + .filter(|x| l1_set.contains(x)) + .cloned() + .collect(); + if intersect_list.is_empty() { + return Ok(lit(false)); + } + let merged_inlist = InList { + expr: l1.expr.clone(), + list: intersect_list, + negated: false, + }; + return Ok(Expr::InList(merged_inlist)); + } else if l1.expr == l2.expr && l1.negated && l2.negated { + let l1_set: HashSet = l1.list.iter().cloned().collect(); + let intersect_list: Vec = l2 + .list + .iter() + .filter(|x| l1_set.contains(x)) + .cloned() + .collect(); + if intersect_list.is_empty() { + return Ok(lit(true)); + } + let merged_inlist = InList { + expr: l1.expr.clone(), + list: intersect_list, + negated: true, + }; + return Ok(Expr::InList(merged_inlist)); + } + } + _ => {} + } + } + + Ok(expr) + } +} \ No newline at end of file diff --git a/datafusion/optimizer/src/simplify_expressions/mod.rs b/datafusion/optimizer/src/simplify_expressions/mod.rs index 2cf6ed166cdde..44ba5b3e3b847 100644 --- a/datafusion/optimizer/src/simplify_expressions/mod.rs +++ b/datafusion/optimizer/src/simplify_expressions/mod.rs @@ -18,6 +18,7 @@ pub mod context; pub mod expr_simplifier; mod guarantees; +mod inlist_simplifier; mod or_in_list_simplifier; mod regex; pub mod simplify_exprs; diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index e32e415338a7a..58cbd04774e9f 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -725,3 +725,23 @@ AggregateExec: mode=SinglePartitioned, gby=[p_partkey@2 as p_partkey], aggr=[SUM --------CoalesceBatchesExec: target_batch_size=8192 ----------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=1 ------------MemoryExec: partitions=1, partition_sizes=[1] + +# Inlist simplification + +statement ok +create table t(x int) as values (1), (2), (3); + +query TT +explain select x from t where x IN (1,2,3) AND x IN (4,5); +---- +logical_plan EmptyRelation +physical_plan EmptyExec + +query TT +explain select x from t where x NOT IN (1,2,3,4) AND x NOT IN (5,6,7,8); +---- +logical_plan TableScan: t projection=[x] +physical_plan MemoryExec: partitions=1, partition_sizes=[1] + +statement ok +drop table t; From c12695458bad8cc5f4fd9c93abef9e8cc9895ee0 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 22 Jan 2024 21:16:46 +0800 Subject: [PATCH 02/12] fmt Signed-off-by: jayzhan211 --- .../optimizer/src/simplify_expressions/inlist_simplifier.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs index f4fc55eb91183..866fdc38d14ed 100644 --- a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs @@ -87,4 +87,4 @@ impl TreeNodeRewriter for InListSimplifier { Ok(expr) } -} \ No newline at end of file +} From cb19bb8cccad006a8bcefe85849874987574f955 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 22 Jan 2024 21:20:49 +0800 Subject: [PATCH 03/12] clippy Signed-off-by: jayzhan211 --- .../simplify_expressions/inlist_simplifier.rs | 74 +++++++++---------- 1 file changed, 36 insertions(+), 38 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs index 866fdc38d14ed..c6269310321f2 100644 --- a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs @@ -17,7 +17,6 @@ //! This module implements a rule that simplifies expressions that is guaranteed to be true or false at planning time -use std::borrow::Cow; use std::collections::HashSet; use datafusion_common::tree_node::TreeNodeRewriter; @@ -43,45 +42,44 @@ impl TreeNodeRewriter for InListSimplifier { fn mutate(&mut self, expr: Expr) -> Result { if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = &expr { - match (left.as_ref(), op, right.as_ref()) { - (Expr::InList(l1), Operator::And, Expr::InList(l2)) => { - if l1.expr == l2.expr && !l1.negated && !l2.negated { - let l1_set: HashSet = l1.list.iter().cloned().collect(); - let intersect_list: Vec = l2 - .list - .iter() - .filter(|x| l1_set.contains(x)) - .cloned() - .collect(); - if intersect_list.is_empty() { - return Ok(lit(false)); - } - let merged_inlist = InList { - expr: l1.expr.clone(), - list: intersect_list, - negated: false, - }; - return Ok(Expr::InList(merged_inlist)); - } else if l1.expr == l2.expr && l1.negated && l2.negated { - let l1_set: HashSet = l1.list.iter().cloned().collect(); - let intersect_list: Vec = l2 - .list - .iter() - .filter(|x| l1_set.contains(x)) - .cloned() - .collect(); - if intersect_list.is_empty() { - return Ok(lit(true)); - } - let merged_inlist = InList { - expr: l1.expr.clone(), - list: intersect_list, - negated: true, - }; - return Ok(Expr::InList(merged_inlist)); + if let (Expr::InList(l1), Operator::And, Expr::InList(l2)) = + (left.as_ref(), op, right.as_ref()) + { + if l1.expr == l2.expr && !l1.negated && !l2.negated { + let l1_set: HashSet = l1.list.iter().cloned().collect(); + let intersect_list: Vec = l2 + .list + .iter() + .filter(|x| l1_set.contains(x)) + .cloned() + .collect(); + if intersect_list.is_empty() { + return Ok(lit(false)); } + let merged_inlist = InList { + expr: l1.expr.clone(), + list: intersect_list, + negated: false, + }; + return Ok(Expr::InList(merged_inlist)); + } else if l1.expr == l2.expr && l1.negated && l2.negated { + let l1_set: HashSet = l1.list.iter().cloned().collect(); + let intersect_list: Vec = l2 + .list + .iter() + .filter(|x| l1_set.contains(x)) + .cloned() + .collect(); + if intersect_list.is_empty() { + return Ok(lit(true)); + } + let merged_inlist = InList { + expr: l1.expr.clone(), + list: intersect_list, + negated: true, + }; + return Ok(Expr::InList(merged_inlist)); } - _ => {} } } From f4c37416dc7015629852d552ade0d0141b44f059 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 22 Jan 2024 22:01:01 +0800 Subject: [PATCH 04/12] a in and b not in cases Signed-off-by: jayzhan211 --- .../simplify_expressions/expr_simplifier.rs | 4 +- .../simplify_expressions/inlist_simplifier.rs | 39 ++++++++++++++++++- .../sqllogictest/test_files/predicates.slt | 34 ++++++++++++++++ 3 files changed, 73 insertions(+), 4 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index a0fb1b1aaac30..3508e014721f7 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -135,7 +135,7 @@ impl ExprSimplifier { let mut simplifier = Simplifier::new(&self.info); let mut const_evaluator = ConstEvaluator::try_new(self.info.execution_props())?; let mut or_in_list_simplifier = OrInListSimplifier::new(); - let mut boolean_simplifier = InListSimplifier::new(); + let mut inlist_simplifier = InListSimplifier::new(); let mut guarantee_rewriter = GuaranteeRewriter::new(&self.guarantees); // TODO iterate until no changes are made during rewrite @@ -145,7 +145,7 @@ impl ExprSimplifier { expr.rewrite(&mut const_evaluator)? .rewrite(&mut simplifier)? .rewrite(&mut or_in_list_simplifier)? - .rewrite(&mut boolean_simplifier)? + .rewrite(&mut inlist_simplifier)? .rewrite(&mut guarantee_rewriter)? // run both passes twice to try an minimize simplifications that we missed .rewrite(&mut const_evaluator)? diff --git a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs index c6269310321f2..eada3561987b7 100644 --- a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs @@ -27,8 +27,9 @@ use datafusion_expr::{lit, BinaryExpr, Expr, Operator}; /// Simplify expressions that is guaranteed to be true or false to a literal boolean expression /// /// Rules: -/// 1. `a in (1,2,3) AND a in (4,5) -> Intersection` -// 2. `a in (1,2,3) OR a in (1,2,3) -> a in (1,2,3)` +/// 1. `a in (1,2,3) AND a in (4,5) -> a in (1,2,3,4,5) Intersection` +/// 2. `a not int (1,2,3) AND a not in (4,5,6) -> a not in (1,2,3,4,5,6) Intersection` +/// 3. `a in (1,2,3) AND a not in (1,3,x,y,z...) -> a in (2) Exception` pub(super) struct InListSimplifier {} impl InListSimplifier { @@ -79,6 +80,40 @@ impl TreeNodeRewriter for InListSimplifier { negated: true, }; return Ok(Expr::InList(merged_inlist)); + } else if l1.expr == l2.expr && !l1.negated && l2.negated { + let l2_set: HashSet = l2.list.iter().cloned().collect(); + let except_list: Vec = l1 + .list + .iter() + .filter(|x| !l2_set.contains(x)) + .cloned() + .collect(); + if except_list.is_empty() { + return Ok(lit(false)); + } + let merged_inlist = InList { + expr: l1.expr.clone(), + list: except_list, + negated: false, + }; + return Ok(Expr::InList(merged_inlist)); + } else if l1.expr == l2.expr && l1.negated && !l2.negated { + let l2_set: HashSet = l2.list.iter().cloned().collect(); + let except_list: Vec = l1 + .list + .iter() + .filter(|x| !l2_set.contains(x)) + .cloned() + .collect(); + if except_list.is_empty() { + return Ok(lit(true)); + } + let merged_inlist = InList { + expr: l1.expr.clone(), + list: except_list, + negated: true, + }; + return Ok(Expr::InList(merged_inlist)); } } } diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index 58cbd04774e9f..3b66c920dc40a 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -743,5 +743,39 @@ explain select x from t where x NOT IN (1,2,3,4) AND x NOT IN (5,6,7,8); logical_plan TableScan: t projection=[x] physical_plan MemoryExec: partitions=1, partition_sizes=[1] +query TT +explain select x from t where x IN (1,2,3,4) AND x NOT IN (1,2,3,4,5,6); +---- +logical_plan EmptyRelation +physical_plan EmptyExec + +query TT +explain select x from t where x IN (1,2,3,4,5) AND x NOT IN (1,2,3,4); +---- +logical_plan +Filter: t.x = Int32(5) +--TableScan: t projection=[x] +physical_plan +CoalesceBatchesExec: target_batch_size=8192 +--FilterExec: x@0 = 5 +----MemoryExec: partitions=1, partition_sizes=[1] + +query TT +explain select x from t where x NOT IN (1,2,3,4) AND x IN (1,2,3,4,5); +---- +logical_plan TableScan: t projection=[x] +physical_plan MemoryExec: partitions=1, partition_sizes=[1] + +query TT +explain select x from t where x NOT IN (1,2,3,4,5) AND x IN (1,2,3); +---- +logical_plan +Filter: t.x != Int32(4) AND t.x != Int32(5) +--TableScan: t projection=[x] +physical_plan +CoalesceBatchesExec: target_batch_size=8192 +--FilterExec: x@0 != 4 AND x@0 != 5 +----MemoryExec: partitions=1, partition_sizes=[1] + statement ok drop table t; From fd2fcba77985fce157ac5a0c6c0cbd5f5086ffe8 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 22 Jan 2024 22:15:10 +0800 Subject: [PATCH 05/12] fix except Signed-off-by: jayzhan211 --- .../simplify_expressions/inlist_simplifier.rs | 108 +++++++----------- .../sqllogictest/test_files/predicates.slt | 16 +-- 2 files changed, 52 insertions(+), 72 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs index eada3561987b7..2ba0612ea6732 100644 --- a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs @@ -47,73 +47,13 @@ impl TreeNodeRewriter for InListSimplifier { (left.as_ref(), op, right.as_ref()) { if l1.expr == l2.expr && !l1.negated && !l2.negated { - let l1_set: HashSet = l1.list.iter().cloned().collect(); - let intersect_list: Vec = l2 - .list - .iter() - .filter(|x| l1_set.contains(x)) - .cloned() - .collect(); - if intersect_list.is_empty() { - return Ok(lit(false)); - } - let merged_inlist = InList { - expr: l1.expr.clone(), - list: intersect_list, - negated: false, - }; - return Ok(Expr::InList(merged_inlist)); + return inlist_intersection(l1, l2, false); } else if l1.expr == l2.expr && l1.negated && l2.negated { - let l1_set: HashSet = l1.list.iter().cloned().collect(); - let intersect_list: Vec = l2 - .list - .iter() - .filter(|x| l1_set.contains(x)) - .cloned() - .collect(); - if intersect_list.is_empty() { - return Ok(lit(true)); - } - let merged_inlist = InList { - expr: l1.expr.clone(), - list: intersect_list, - negated: true, - }; - return Ok(Expr::InList(merged_inlist)); + return inlist_intersection(l1, l2, true); } else if l1.expr == l2.expr && !l1.negated && l2.negated { - let l2_set: HashSet = l2.list.iter().cloned().collect(); - let except_list: Vec = l1 - .list - .iter() - .filter(|x| !l2_set.contains(x)) - .cloned() - .collect(); - if except_list.is_empty() { - return Ok(lit(false)); - } - let merged_inlist = InList { - expr: l1.expr.clone(), - list: except_list, - negated: false, - }; - return Ok(Expr::InList(merged_inlist)); + return inlist_except(l1, l2); } else if l1.expr == l2.expr && l1.negated && !l2.negated { - let l2_set: HashSet = l2.list.iter().cloned().collect(); - let except_list: Vec = l1 - .list - .iter() - .filter(|x| !l2_set.contains(x)) - .cloned() - .collect(); - if except_list.is_empty() { - return Ok(lit(true)); - } - let merged_inlist = InList { - expr: l1.expr.clone(), - list: except_list, - negated: true, - }; - return Ok(Expr::InList(merged_inlist)); + return inlist_except(l2, l1); } } } @@ -121,3 +61,43 @@ impl TreeNodeRewriter for InListSimplifier { Ok(expr) } } + +fn inlist_intersection(l1: &InList, l2: &InList, negated: bool) -> Result { + let l1_set: HashSet = l1.list.iter().cloned().collect(); + let intersect_list: Vec = l2 + .list + .iter() + .filter(|x| l1_set.contains(x)) + .cloned() + .collect(); + // e in () is always false + // e not in () is always true + if intersect_list.is_empty() { + return Ok(lit(negated)); + } + let merged_inlist = InList { + expr: l1.expr.clone(), + list: intersect_list, + negated, + }; + Ok(Expr::InList(merged_inlist)) +} + +fn inlist_except(l1: &InList, l2: &InList) -> Result { + let l2_set: HashSet = l2.list.iter().cloned().collect(); + let except_list: Vec = l1 + .list + .iter() + .filter(|x| !l2_set.contains(x)) + .cloned() + .collect(); + if except_list.is_empty() { + return Ok(lit(false)); + } + let merged_inlist = InList { + expr: l1.expr.clone(), + list: except_list, + negated: false, + }; + Ok(Expr::InList(merged_inlist)) +} diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index 3b66c920dc40a..77d451a0b08da 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -763,19 +763,19 @@ CoalesceBatchesExec: target_batch_size=8192 query TT explain select x from t where x NOT IN (1,2,3,4) AND x IN (1,2,3,4,5); ---- -logical_plan TableScan: t projection=[x] -physical_plan MemoryExec: partitions=1, partition_sizes=[1] - -query TT -explain select x from t where x NOT IN (1,2,3,4,5) AND x IN (1,2,3); ----- logical_plan -Filter: t.x != Int32(4) AND t.x != Int32(5) +Filter: t.x = Int32(5) --TableScan: t projection=[x] physical_plan CoalesceBatchesExec: target_batch_size=8192 ---FilterExec: x@0 != 4 AND x@0 != 5 +--FilterExec: x@0 = 5 ----MemoryExec: partitions=1, partition_sizes=[1] +query TT +explain select x from t where x NOT IN (1,2,3,4,5) AND x IN (1,2,3); +---- +logical_plan EmptyRelation +physical_plan EmptyExec + statement ok drop table t; From 69039ec99ea055f277bffab48bf835767d32e22f Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 22 Jan 2024 22:18:30 +0800 Subject: [PATCH 06/12] update comments Signed-off-by: jayzhan211 --- .../src/simplify_expressions/inlist_simplifier.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs index 2ba0612ea6732..e3a9443072802 100644 --- a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs @@ -27,9 +27,12 @@ use datafusion_expr::{lit, BinaryExpr, Expr, Operator}; /// Simplify expressions that is guaranteed to be true or false to a literal boolean expression /// /// Rules: -/// 1. `a in (1,2,3) AND a in (4,5) -> a in (1,2,3,4,5) Intersection` -/// 2. `a not int (1,2,3) AND a not in (4,5,6) -> a not in (1,2,3,4,5,6) Intersection` -/// 3. `a in (1,2,3) AND a not in (1,3,x,y,z...) -> a in (2) Exception` +/// If both expressions are positive or negative, then we can apply intersection of both inlist expressions +/// 1. `a in (1,2,3) AND a in (4,5) -> a in (1,2,3,4,5)` +/// 2. `a not int (1,2,3) AND a not in (4,5,6) -> a not in (1,2,3,4,5,6)` +/// If one of the expressions is negated, then we apply exception on positive expression +/// 1. `a in (1,2,3) AND a not in (1,2,3,4,5) -> a in ()` +/// 2. `a not in (1,2,3) AND a in (1,2,3,4,5) -> a in (4,5)` pub(super) struct InListSimplifier {} impl InListSimplifier { From 26413b4284675d937782a87adb76a119e924434e Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 23 Jan 2024 08:52:11 +0800 Subject: [PATCH 07/12] update comments Signed-off-by: jayzhan211 --- .../src/simplify_expressions/inlist_simplifier.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs index e3a9443072802..eca348407e6fa 100644 --- a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! This module implements a rule that simplifies expressions that is guaranteed to be true or false at planning time +//! This module implements a rule that simplifies the values for `InList`s use std::collections::HashSet; @@ -27,12 +27,14 @@ use datafusion_expr::{lit, BinaryExpr, Expr, Operator}; /// Simplify expressions that is guaranteed to be true or false to a literal boolean expression /// /// Rules: -/// If both expressions are positive or negative, then we can apply intersection of both inlist expressions +/// If both expressions are `IN` or `NOT IN`, then we can apply intersection of both lists /// 1. `a in (1,2,3) AND a in (4,5) -> a in (1,2,3,4,5)` -/// 2. `a not int (1,2,3) AND a not in (4,5,6) -> a not in (1,2,3,4,5,6)` -/// If one of the expressions is negated, then we apply exception on positive expression -/// 1. `a in (1,2,3) AND a not in (1,2,3,4,5) -> a in ()` +/// 2. `a in (1,2,3) AND a in (2,3,4) -> a in (1,2,3,4)` +/// 3. `a not int (1,2,3) AND a not in (4,5,6) -> a not in (1,2,3,4,5,6)` +/// If one of the expressions is `IN` and another one is `NOT IN`, then we apply exception on `In` expression +/// 1. `a in (1,2,3) AND a not in (1,2,3,4,5) -> a in (), which is false` /// 2. `a not in (1,2,3) AND a in (1,2,3,4,5) -> a in (4,5)` +/// 3. `a in (1,2,3) AND a not in (4,5) -> a in (1,2,3)` pub(super) struct InListSimplifier {} impl InListSimplifier { From af9e186e79284c49e9c100f2cc9817ea00d18727 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 23 Jan 2024 09:28:54 +0800 Subject: [PATCH 08/12] fix bugs Signed-off-by: jayzhan211 --- .../simplify_expressions/expr_simplifier.rs | 32 ++++++++++++++--- .../simplify_expressions/inlist_simplifier.rs | 34 +++++++++++++++---- .../sqllogictest/test_files/predicates.slt | 9 +++-- 3 files changed, 63 insertions(+), 12 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 3508e014721f7..cb261cf0ced8f 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -3186,13 +3186,37 @@ mod tests { col("c1").eq(subquery1).or(col("c1").eq(subquery2)) ); - // c1 NOT IN (1, 2, 3, 4) OR c1 NOT IN (5, 6, 7, 8) -> - // c1 NOT IN (1, 2, 3, 4) OR c1 NOT IN (5, 6, 7, 8) + // c1 NOT IN (1, 2, 3, 4) OR c1 NOT IN (5, 6, 7, 8) -> true let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], true).or( in_list(col("c1"), vec![lit(5), lit(6), lit(7), lit(8)], true), ); - assert_eq!(simplify(expr.clone()), expr); - } + assert_eq!(simplify(expr.clone()), lit(true)); + + // c1 IN (1,2,3,4) AND c1 IN (5,6,7,8) -> false + let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], false).and( + in_list(col("c1"), vec![lit(5), lit(6), lit(7), lit(8)], false), + ); + assert_eq!(simplify(expr.clone()), lit(false)); + + // c1 IN (1,2,3,4) AND c1 IN (3,4,5,6) -> c1 = 3 or c1 = 4 + let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], false).and( + in_list(col("c1"), vec![lit(3), lit(4), lit(5), lit(6)], false), + ); + assert_eq!( + simplify(expr.clone()), + col("c1").eq(lit(3)).or(col("c1").eq(lit(4))) + ); + + // c1 NOT IN (1,2,3,4) AND c1 NOT IN (5,6,7,8) -> c1 NOT IN (1,2,3,4,5,6,7,8) + let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], true).and( + in_list(col("c1"), vec![lit(5), lit(6), lit(7), lit(8)], true), + ); + assert_eq!( + simplify(expr.clone()), + in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4), lit(5), lit(6), lit(7), lit(8)], true) + ); + + } #[test] fn simplify_large_or() { diff --git a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs index eca348407e6fa..92241dcc9074c 100644 --- a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs @@ -27,10 +27,14 @@ use datafusion_expr::{lit, BinaryExpr, Expr, Operator}; /// Simplify expressions that is guaranteed to be true or false to a literal boolean expression /// /// Rules: -/// If both expressions are `IN` or `NOT IN`, then we can apply intersection of both lists -/// 1. `a in (1,2,3) AND a in (4,5) -> a in (1,2,3,4,5)` -/// 2. `a in (1,2,3) AND a in (2,3,4) -> a in (1,2,3,4)` -/// 3. `a not int (1,2,3) AND a not in (4,5,6) -> a not in (1,2,3,4,5,6)` +/// If both expressions are `IN` or `NOT IN`, then we can apply intersection or union on both lists +/// Intersection: +/// 1. `a in (1,2,3) AND a in (4,5) -> a in ()` +/// 2. `a in (1,2,3) AND a in (2,3,4) -> a in (2,3)` +/// 3. `a not in (1,2,3) OR a not in (3,4,5,6) -> a not in (3)` +/// Union: +/// 1. `a not int (1,2,3) AND a not in (4,5,6) -> a not in (1,2,3,4,5,6)` +/// 2. `a in (1,2,3) OR a in (4,5,6) -> a in (1,2,3,4,5,6)` /// If one of the expressions is `IN` and another one is `NOT IN`, then we apply exception on `In` expression /// 1. `a in (1,2,3) AND a not in (1,2,3,4,5) -> a in (), which is false` /// 2. `a not in (1,2,3) AND a in (1,2,3,4,5) -> a in (4,5)` @@ -47,19 +51,25 @@ impl TreeNodeRewriter for InListSimplifier { type N = Expr; fn mutate(&mut self, expr: Expr) -> Result { - if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = &expr { + if let Expr::BinaryExpr(BinaryExpr {left, op, right }) = &expr { if let (Expr::InList(l1), Operator::And, Expr::InList(l2)) = (left.as_ref(), op, right.as_ref()) { if l1.expr == l2.expr && !l1.negated && !l2.negated { return inlist_intersection(l1, l2, false); } else if l1.expr == l2.expr && l1.negated && l2.negated { - return inlist_intersection(l1, l2, true); + return inlist_union(l1, l2, true); } else if l1.expr == l2.expr && !l1.negated && l2.negated { return inlist_except(l1, l2); } else if l1.expr == l2.expr && l1.negated && !l2.negated { return inlist_except(l2, l1); } + } else if let (Expr::InList(l1), Operator::Or, Expr::InList(l2)) = + (left.as_ref(), op, right.as_ref()) + { + if l1.expr == l2.expr && l1.negated && l2.negated { + return inlist_intersection(l1, l2, true) + } } } @@ -67,6 +77,18 @@ impl TreeNodeRewriter for InListSimplifier { } } +fn inlist_union(l1: &InList, l2: &InList, negated: bool) -> Result { + let mut list = vec![]; + list.extend(l1.list.clone()); + list.extend(l2.list.clone()); + let merged_inlist = InList { + expr: l1.expr.clone(), + list, + negated, + }; + Ok(Expr::InList(merged_inlist)) +} + fn inlist_intersection(l1: &InList, l2: &InList, negated: bool) -> Result { let l1_set: HashSet = l1.list.iter().cloned().collect(); let intersect_list: Vec = l2 diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index 77d451a0b08da..1b8777909949c 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -740,8 +740,13 @@ physical_plan EmptyExec query TT explain select x from t where x NOT IN (1,2,3,4) AND x NOT IN (5,6,7,8); ---- -logical_plan TableScan: t projection=[x] -physical_plan MemoryExec: partitions=1, partition_sizes=[1] +logical_plan +Filter: t.x NOT IN ([Int32(1), Int32(2), Int32(3), Int32(4), Int32(5), Int32(6), Int32(7), Int32(8)]) +--TableScan: t projection=[x] +physical_plan +CoalesceBatchesExec: target_batch_size=8192 +--FilterExec: x@0 NOT IN (SET) ([Literal { value: Int32(1) }, Literal { value: Int32(2) }, Literal { value: Int32(3) }, Literal { value: Int32(4) }, Literal { value: Int32(5) }, Literal { value: Int32(6) }, Literal { value: Int32(7) }, Literal { value: Int32(8) }]) +----MemoryExec: partitions=1, partition_sizes=[1] query TT explain select x from t where x IN (1,2,3,4) AND x NOT IN (1,2,3,4,5,6); From fe64b11c47014090e2c44ae6737749e4ea1c061b Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 23 Jan 2024 20:41:17 +0800 Subject: [PATCH 09/12] fix union Signed-off-by: jayzhan211 --- .../simplify_expressions/expr_simplifier.rs | 67 +++++++++++++++---- .../simplify_expressions/inlist_simplifier.rs | 30 +++++---- .../or_in_list_simplifier.rs | 14 ++-- 3 files changed, 81 insertions(+), 30 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index cb261cf0ced8f..d80f5dc8fb9aa 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -3186,37 +3186,76 @@ mod tests { col("c1").eq(subquery1).or(col("c1").eq(subquery2)) ); - // c1 NOT IN (1, 2, 3, 4) OR c1 NOT IN (5, 6, 7, 8) -> true + // 1. c1 IN (1,2,3,4) AND c1 IN (5,6,7,8) -> false + let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], false).and( + in_list(col("c1"), vec![lit(5), lit(6), lit(7), lit(8)], false), + ); + assert_eq!(simplify(expr.clone()), lit(false)); + + // 2. c1 IN (1,2,3,4) AND c1 IN (4,5,6,7) -> c1 = 4 + let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], false).and( + in_list(col("c1"), vec![lit(4), lit(5), lit(6), lit(7)], false), + ); + assert_eq!(simplify(expr.clone()), col("c1").eq(lit(4))); + + // 3. c1 NOT IN (1, 2, 3, 4) OR c1 NOT IN (5, 6, 7, 8) -> true let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], true).or( in_list(col("c1"), vec![lit(5), lit(6), lit(7), lit(8)], true), ); assert_eq!(simplify(expr.clone()), lit(true)); - // c1 IN (1,2,3,4) AND c1 IN (5,6,7,8) -> false - let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], false).and( - in_list(col("c1"), vec![lit(5), lit(6), lit(7), lit(8)], false), + // 4. c1 NOT IN (1,2,3,4) AND c1 NOT IN (4,5,6,7) -> c1 NOT IN (1,2,3,4,5,6,7) + let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], true).and( + in_list(col("c1"), vec![lit(4), lit(5), lit(6), lit(7)], true), + ); + assert_eq!( + simplify(expr.clone()), + in_list( + col("c1"), + vec![lit(1), lit(2), lit(3), lit(4), lit(5), lit(6), lit(7)], + true + ) ); - assert_eq!(simplify(expr.clone()), lit(false)); - // c1 IN (1,2,3,4) AND c1 IN (3,4,5,6) -> c1 = 3 or c1 = 4 - let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], false).and( - in_list(col("c1"), vec![lit(3), lit(4), lit(5), lit(6)], false), + // 5. c1 IN (1,2,3,4) OR c1 IN (2,3,4,5) -> c1 IN (1,2,3,4,5) + let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], false).or( + in_list(col("c1"), vec![lit(2), lit(3), lit(4), lit(5)], false), ); assert_eq!( simplify(expr.clone()), - col("c1").eq(lit(3)).or(col("c1").eq(lit(4))) + in_list( + col("c1"), + vec![lit(1), lit(2), lit(3), lit(4), lit(5)], + false + ) ); - // c1 NOT IN (1,2,3,4) AND c1 NOT IN (5,6,7,8) -> c1 NOT IN (1,2,3,4,5,6,7,8) - let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], true).and( + // 6. c1 IN (1,2,3) AND c1 NOT INT (1,2,3,4,5) -> false + let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3)], false).and(in_list( + col("c1"), + vec![lit(1), lit(2), lit(3), lit(4), lit(5)], + true, + )); + assert_eq!(simplify(expr.clone()), lit(false)); + + // 7. c1 NOT IN (1,2,3,4) AND c1 IN (1,2,3,4,5) -> c1 = 5 + let expr = + in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], true).and(in_list( + col("c1"), + vec![lit(1), lit(2), lit(3), lit(4), lit(5)], + false, + )); + assert_eq!(simplify(expr.clone()), col("c1").eq(lit(5))); + + // 8. c1 IN (1,2,3,4) AND c1 NOT IN (5,6,7,8) -> c1 IN (1,2,3,4) + let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], false).and( in_list(col("c1"), vec![lit(5), lit(6), lit(7), lit(8)], true), ); assert_eq!( simplify(expr.clone()), - in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4), lit(5), lit(6), lit(7), lit(8)], true) + in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], false) ); - - } + } #[test] fn simplify_large_or() { diff --git a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs index 92241dcc9074c..4e66821f18a86 100644 --- a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs @@ -29,16 +29,17 @@ use datafusion_expr::{lit, BinaryExpr, Expr, Operator}; /// Rules: /// If both expressions are `IN` or `NOT IN`, then we can apply intersection or union on both lists /// Intersection: -/// 1. `a in (1,2,3) AND a in (4,5) -> a in ()` +/// 1. `a in (1,2,3) AND a in (4,5) -> a in (), which is false` /// 2. `a in (1,2,3) AND a in (2,3,4) -> a in (2,3)` /// 3. `a not in (1,2,3) OR a not in (3,4,5,6) -> a not in (3)` /// Union: -/// 1. `a not int (1,2,3) AND a not in (4,5,6) -> a not in (1,2,3,4,5,6)` -/// 2. `a in (1,2,3) OR a in (4,5,6) -> a in (1,2,3,4,5,6)` +/// 4. `a not int (1,2,3) AND a not in (4,5,6) -> a not in (1,2,3,4,5,6)` +/// # This rule is handled by `or_in_list_simplifier.rs` +/// 5. `a in (1,2,3) OR a in (4,5,6) -> a in (1,2,3,4,5,6)` /// If one of the expressions is `IN` and another one is `NOT IN`, then we apply exception on `In` expression -/// 1. `a in (1,2,3) AND a not in (1,2,3,4,5) -> a in (), which is false` -/// 2. `a not in (1,2,3) AND a in (1,2,3,4,5) -> a in (4,5)` -/// 3. `a in (1,2,3) AND a not in (4,5) -> a in (1,2,3)` +/// 6. `a in (1,2,3,4) AND a not in (1,2,3,4,5) -> a in (), which is false` +/// 7. `a not in (1,2,3,4) AND a in (1,2,3,4,5) -> a = 5` +/// 8. `a in (1,2,3,4) AND a not in (5,6,7,8) -> a in (1,2,3,4)` pub(super) struct InListSimplifier {} impl InListSimplifier { @@ -51,7 +52,7 @@ impl TreeNodeRewriter for InListSimplifier { type N = Expr; fn mutate(&mut self, expr: Expr) -> Result { - if let Expr::BinaryExpr(BinaryExpr {left, op, right }) = &expr { + if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = &expr { if let (Expr::InList(l1), Operator::And, Expr::InList(l2)) = (left.as_ref(), op, right.as_ref()) { @@ -68,7 +69,7 @@ impl TreeNodeRewriter for InListSimplifier { (left.as_ref(), op, right.as_ref()) { if l1.expr == l2.expr && l1.negated && l2.negated { - return inlist_intersection(l1, l2, true) + return inlist_intersection(l1, l2, true); } } } @@ -78,12 +79,17 @@ impl TreeNodeRewriter for InListSimplifier { } fn inlist_union(l1: &InList, l2: &InList, negated: bool) -> Result { - let mut list = vec![]; - list.extend(l1.list.clone()); - list.extend(l2.list.clone()); + let mut seen: HashSet = HashSet::new(); + let list_set = l1 + .list + .iter() + .chain(l2.list.iter()) + .filter(|&e| seen.insert(e.to_owned())) + .cloned() + .collect::>(); let merged_inlist = InList { expr: l1.expr.clone(), - list, + list: list_set, negated, }; Ok(Expr::InList(merged_inlist)) diff --git a/datafusion/optimizer/src/simplify_expressions/or_in_list_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/or_in_list_simplifier.rs index cebaaccc41c7e..a1e06aed64729 100644 --- a/datafusion/optimizer/src/simplify_expressions/or_in_list_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/or_in_list_simplifier.rs @@ -18,6 +18,7 @@ //! This module implements a rule that simplifies OR expressions into IN list expressions use std::borrow::Cow; +use std::collections::HashSet; use datafusion_common::tree_node::TreeNodeRewriter; use datafusion_common::Result; @@ -52,12 +53,17 @@ impl TreeNodeRewriter for OrInListSimplifier { { let lhs = lhs.into_owned(); let rhs = rhs.into_owned(); - let mut list = vec![]; - list.extend(lhs.list); - list.extend(rhs.list); + let mut seen: HashSet = HashSet::new(); + let list_set = lhs + .list + .into_iter() + .chain(rhs.list.into_iter()) + .filter(|e| seen.insert(e.to_owned())) + .collect::>(); + let merged_inlist = InList { expr: lhs.expr, - list, + list: list_set, negated: false, }; return Ok(Expr::InList(merged_inlist)); From f106ed03824d1e2ffadaad47cdfbb8f841dc8a81 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 23 Jan 2024 20:41:33 +0800 Subject: [PATCH 10/12] clippy Signed-off-by: jayzhan211 --- .../optimizer/src/simplify_expressions/or_in_list_simplifier.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/or_in_list_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/or_in_list_simplifier.rs index a1e06aed64729..a9209252e6e54 100644 --- a/datafusion/optimizer/src/simplify_expressions/or_in_list_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/or_in_list_simplifier.rs @@ -57,7 +57,7 @@ impl TreeNodeRewriter for OrInListSimplifier { let list_set = lhs .list .into_iter() - .chain(rhs.list.into_iter()) + .chain(rhs.list) .filter(|e| seen.insert(e.to_owned())) .collect::>(); From 2c080799e1c323dff8931996ca208ec87f61053d Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 23 Jan 2024 21:16:30 +0800 Subject: [PATCH 11/12] add more test Signed-off-by: jayzhan211 --- .../simplify_expressions/expr_simplifier.rs | 35 +++++++++++++++++++ .../sqllogictest/test_files/predicates.slt | 28 ++------------- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index d80f5dc8fb9aa..0849bfcad5eb3 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -3255,6 +3255,41 @@ mod tests { simplify(expr.clone()), in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], false) ); + + // inlist with more than two expressions + // c1 IN (1,2,3,4,5,6) AND c1 IN (1,3,5,6) AND c1 IN (3,6) -> c1 = 3 OR c1 = 6 + let expr = in_list( + col("c1"), + vec![lit(1), lit(2), lit(3), lit(4), lit(5), lit(6)], + false, + ) + .and(in_list( + col("c1"), + vec![lit(1), lit(3), lit(5), lit(6)], + false, + )) + .and(in_list(col("c1"), vec![lit(3), lit(6)], false)); + assert_eq!( + simplify(expr.clone()), + col("c1").eq(lit(3)).or(col("c1").eq(lit(6))) + ); + + // c1 NOT IN (1,2,3,4) AND c1 IN (5,6,7,8) AND c1 NOT IN (3,4,5,6) AND c1 IN (8,9,10) -> c1 = 8 + let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], true).and( + in_list(col("c1"), vec![lit(5), lit(6), lit(7), lit(8)], false) + .and(in_list(col("c1"), vec![lit(3), lit(4), lit(5), lit(6)], true)) + .and(in_list(col("c1"), vec![lit(8), lit(9), lit(10)], false))); + assert_eq!(simplify(expr.clone()), col("c1").eq(lit(8))); + + // Contains non-InList expression + // c1 NOT IN (1,2,3,4) OR c1 != 5 OR c1 NOT IN (6,7,8,9) -> c1 NOT IN (1,2,3,4) OR c1 != 5 OR c1 NOT IN (6,7,8,9) + let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], true).or( + col("c1").not_eq(lit(5)) + .or(in_list(col("c1"), vec![lit(6), lit(7), lit(8), lit(9)], true))); + // TODO: Further simplify this expression + // assert_eq!(simplify(expr.clone()), lit(true)); + assert_eq!(simplify(expr.clone()), expr); + } #[test] diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index 1b8777909949c..b5347f997a5ab 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -738,21 +738,10 @@ logical_plan EmptyRelation physical_plan EmptyExec query TT -explain select x from t where x NOT IN (1,2,3,4) AND x NOT IN (5,6,7,8); +explain select x from t where x NOT IN (1,2,3,4) OR x NOT IN (5,6,7,8); ---- -logical_plan -Filter: t.x NOT IN ([Int32(1), Int32(2), Int32(3), Int32(4), Int32(5), Int32(6), Int32(7), Int32(8)]) ---TableScan: t projection=[x] -physical_plan -CoalesceBatchesExec: target_batch_size=8192 ---FilterExec: x@0 NOT IN (SET) ([Literal { value: Int32(1) }, Literal { value: Int32(2) }, Literal { value: Int32(3) }, Literal { value: Int32(4) }, Literal { value: Int32(5) }, Literal { value: Int32(6) }, Literal { value: Int32(7) }, Literal { value: Int32(8) }]) -----MemoryExec: partitions=1, partition_sizes=[1] - -query TT -explain select x from t where x IN (1,2,3,4) AND x NOT IN (1,2,3,4,5,6); ----- -logical_plan EmptyRelation -physical_plan EmptyExec +logical_plan TableScan: t projection=[x] +physical_plan MemoryExec: partitions=1, partition_sizes=[1] query TT explain select x from t where x IN (1,2,3,4,5) AND x NOT IN (1,2,3,4); @@ -765,17 +754,6 @@ CoalesceBatchesExec: target_batch_size=8192 --FilterExec: x@0 = 5 ----MemoryExec: partitions=1, partition_sizes=[1] -query TT -explain select x from t where x NOT IN (1,2,3,4) AND x IN (1,2,3,4,5); ----- -logical_plan -Filter: t.x = Int32(5) ---TableScan: t projection=[x] -physical_plan -CoalesceBatchesExec: target_batch_size=8192 ---FilterExec: x@0 = 5 -----MemoryExec: partitions=1, partition_sizes=[1] - query TT explain select x from t where x NOT IN (1,2,3,4,5) AND x IN (1,2,3); ---- From 3fb45db53c3c6acb35b3ea98b5fb28d046de92fe Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 23 Jan 2024 21:19:33 +0800 Subject: [PATCH 12/12] rename Signed-off-by: jayzhan211 --- .../simplify_expressions/expr_simplifier.rs | 21 +++++++++++++------ .../simplify_expressions/inlist_simplifier.rs | 4 ++-- .../or_in_list_simplifier.rs | 4 ++-- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 0849bfcad5eb3..431213e025017 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -3277,19 +3277,28 @@ mod tests { // c1 NOT IN (1,2,3,4) AND c1 IN (5,6,7,8) AND c1 NOT IN (3,4,5,6) AND c1 IN (8,9,10) -> c1 = 8 let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], true).and( in_list(col("c1"), vec![lit(5), lit(6), lit(7), lit(8)], false) - .and(in_list(col("c1"), vec![lit(3), lit(4), lit(5), lit(6)], true)) - .and(in_list(col("c1"), vec![lit(8), lit(9), lit(10)], false))); + .and(in_list( + col("c1"), + vec![lit(3), lit(4), lit(5), lit(6)], + true, + )) + .and(in_list(col("c1"), vec![lit(8), lit(9), lit(10)], false)), + ); assert_eq!(simplify(expr.clone()), col("c1").eq(lit(8))); // Contains non-InList expression // c1 NOT IN (1,2,3,4) OR c1 != 5 OR c1 NOT IN (6,7,8,9) -> c1 NOT IN (1,2,3,4) OR c1 != 5 OR c1 NOT IN (6,7,8,9) - let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], true).or( - col("c1").not_eq(lit(5)) - .or(in_list(col("c1"), vec![lit(6), lit(7), lit(8), lit(9)], true))); + let expr = + in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], true).or(col("c1") + .not_eq(lit(5)) + .or(in_list( + col("c1"), + vec![lit(6), lit(7), lit(8), lit(9)], + true, + ))); // TODO: Further simplify this expression // assert_eq!(simplify(expr.clone()), lit(true)); assert_eq!(simplify(expr.clone()), expr); - } #[test] diff --git a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs index 4e66821f18a86..fa95f1688e6f4 100644 --- a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs @@ -80,7 +80,7 @@ impl TreeNodeRewriter for InListSimplifier { fn inlist_union(l1: &InList, l2: &InList, negated: bool) -> Result { let mut seen: HashSet = HashSet::new(); - let list_set = l1 + let list = l1 .list .iter() .chain(l2.list.iter()) @@ -89,7 +89,7 @@ fn inlist_union(l1: &InList, l2: &InList, negated: bool) -> Result { .collect::>(); let merged_inlist = InList { expr: l1.expr.clone(), - list: list_set, + list, negated, }; Ok(Expr::InList(merged_inlist)) diff --git a/datafusion/optimizer/src/simplify_expressions/or_in_list_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/or_in_list_simplifier.rs index a9209252e6e54..fd5c9ecaf82c5 100644 --- a/datafusion/optimizer/src/simplify_expressions/or_in_list_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/or_in_list_simplifier.rs @@ -54,7 +54,7 @@ impl TreeNodeRewriter for OrInListSimplifier { let lhs = lhs.into_owned(); let rhs = rhs.into_owned(); let mut seen: HashSet = HashSet::new(); - let list_set = lhs + let list = lhs .list .into_iter() .chain(rhs.list) @@ -63,7 +63,7 @@ impl TreeNodeRewriter for OrInListSimplifier { let merged_inlist = InList { expr: lhs.expr, - list: list_set, + list, negated: false, }; return Ok(Expr::InList(merged_inlist));