move the type coercion to the beginning of the optimizer rule and support type coercion for subquery
#3636
Changes from all commits
79dad7b
fc95f62
c041b90
9a86e7a
972449e
29449b3
7bd00d7
904949d
2e9d18e
824ad99
e28cf75
@@ -336,10 +336,10 @@ order by s_name;
Projection: #part.p_partkey AS p_partkey, alias=__sq_1
Filter: #part.p_name LIKE Utf8("forest%")
TableScan: part projection=[p_partkey, p_name], partial_filters=[#part.p_name LIKE Utf8("forest%")]
Projection: #lineitem.l_partkey, #lineitem.l_suppkey, Decimal128(Some(50000000000000000),38,17) * CAST(#SUM(lineitem.l_quantity) AS Decimal128(38, 17)) AS __value, alias=__sq_3
Projection: #lineitem.l_partkey, #lineitem.l_suppkey, CAST(Float64(0.5) AS Decimal128(38, 17)) * CAST(#SUM(lineitem.l_quantity) AS Decimal128(38, 17)) AS __value, alias=__sq_3
|
Contributor
This seems like a regression somehow -- the …

Contributor (Author)
It's the desired result. If we want to do the evaluation …
If we don't want the regression in this PR, I can use the method of …

Contributor
I think we should avoid regressions -- so perhaps we can do …

Contributor (Author)
Ok, I will restore them in a follow-up PR.
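The regression discussed here comes from rule ordering: constant folding lives in SimplifyExpressions, and once TypeCoercion runs first without another SimplifyExpressions pass afterwards, the casts that coercion wraps around literals (for example CAST(Float64(0.5) AS Decimal128(38, 17))) are no longer folded into constants. Below is a minimal sketch of the ordering that would restore the folding; the module paths are assumptions about the datafusion_optimizer crate layout at the time, not code from this PR.

```rust
use std::sync::Arc;

use datafusion_optimizer::simplify_expressions::SimplifyExpressions;
use datafusion_optimizer::type_coercion::TypeCoercion;
use datafusion_optimizer::OptimizerRule;

// Run coercion first, then simplify again so the casts that coercion adds
// around literals are evaluated back into constants before the plan is used.
fn rules_with_constant_folding() -> Vec<Arc<dyn OptimizerRule + Sync + Send>> {
    vec![
        Arc::new(TypeCoercion::new()),
        Arc::new(SimplifyExpressions::new()),
    ]
}
```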
Aggregate: groupBy=[[#lineitem.l_partkey, #lineitem.l_suppkey]], aggr=[[SUM(#lineitem.l_quantity)]]
Filter: #lineitem.l_shipdate >= Date32("8766")
TableScan: lineitem projection=[l_partkey, l_suppkey, l_quantity, l_shipdate], partial_filters=[#lineitem.l_shipdate >= Date32("8766")]"#
Filter: #lineitem.l_shipdate >= CAST(Utf8("1994-01-01") AS Date32)
|
Contributor
likewise, this isn't right because it should have been evaluated to a constant
TableScan: lineitem projection=[l_partkey, l_suppkey, l_quantity, l_shipdate], partial_filters=[#lineitem.l_shipdate >= CAST(Utf8("1994-01-01") AS Date32)]"#
.to_string();
assert_eq!(actual, expected);
|
|
|
|
@@ -393,8 +393,8 @@ order by cntrycode;"#;
TableScan: orders projection=[o_custkey]
Projection: #AVG(customer.c_acctbal) AS __value, alias=__sq_1
Aggregate: groupBy=[[]], aggr=[[AVG(#customer.c_acctbal)]]
Filter: CAST(#customer.c_acctbal AS Decimal128(30, 15)) > Decimal128(Some(0),30,15) AND substr(#customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")])
TableScan: customer projection=[c_phone, c_acctbal], partial_filters=[CAST(#customer.c_acctbal AS Decimal128(30, 15)) > Decimal128(Some(0),30,15), substr(#customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")])]"#
Filter: CAST(#customer.c_acctbal AS Decimal128(30, 15)) > CAST(Float64(0) AS Decimal128(30, 15)) AND substr(#customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")])
TableScan: customer projection=[c_phone, c_acctbal], partial_filters=[CAST(#customer.c_acctbal AS Decimal128(30, 15)) > CAST(Float64(0) AS Decimal128(30, 15)), substr(#customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")])]"#
.to_string();
assert_eq!(actual, expected);
|
|
|
|
@@ -453,7 +453,7 @@ order by value desc;
TableScan: supplier projection=[s_suppkey, s_nationkey]
Filter: #nation.n_name = Utf8("GERMANY")
TableScan: nation projection=[n_nationkey, n_name], partial_filters=[#nation.n_name = Utf8("GERMANY")]
Projection: CAST(#SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS Decimal128(38, 17)) * Decimal128(Some(10000000000000),38,17) AS __value, alias=__sq_1
Projection: CAST(#SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS Decimal128(38, 17)) * CAST(Float64(0.0001) AS Decimal128(38, 17)) AS __value, alias=__sq_1
Aggregate: groupBy=[[]], aggr=[[SUM(CAST(#partsupp.ps_supplycost AS Decimal128(26, 2)) * CAST(#partsupp.ps_availqty AS Decimal128(26, 2)))]]
Inner Join: #supplier.s_nationkey = #nation.n_nationkey
Inner Join: #partsupp.ps_suppkey = #supplier.s_suppkey
|
|
@@ -22,6 +22,7 @@ use arrow::datatypes::DataType;
use datafusion_common::{DFSchema, DFSchemaRef, DataFusionError, Result};
use datafusion_expr::binary_rule::{coerce_types, comparison_coercion};
use datafusion_expr::expr_rewriter::{ExprRewritable, ExprRewriter, RewriteRecursion};
use datafusion_expr::logical_plan::Subquery;
use datafusion_expr::type_coercion::data_types;
use datafusion_expr::utils::from_plan;
use datafusion_expr::{
|
|
@@ -50,56 +51,70 @@ impl OptimizerRule for TypeCoercion {
plan: &LogicalPlan,
optimizer_config: &mut OptimizerConfig,
) -> Result<LogicalPlan> {
// optimize child plans first
let new_inputs = plan
.inputs()
.iter()
.map(|p| self.optimize(p, optimizer_config))
.collect::<Result<Vec<_>>>()?;

// get schema representing all available input fields. This is used for data type
// resolution only, so order does not matter here
let schema = new_inputs.iter().map(|input| input.schema()).fold(
DFSchema::empty(),
|mut lhs, rhs| {
lhs.merge(rhs);
lhs
},
);
optimize_internal(&DFSchema::empty(), plan, optimizer_config)
}
}

let mut expr_rewrite = TypeCoercionRewriter {
schema: Arc::new(schema),
};
fn optimize_internal(
// use the external schema to handle the correlated subqueries case
external_schema: &DFSchema,
plan: &LogicalPlan,
optimizer_config: &mut OptimizerConfig,
) -> Result<LogicalPlan> {
// optimize child plans first
let new_inputs = plan
.inputs()
.iter()
.map(|p| optimize_internal(external_schema, p, optimizer_config))
.collect::<Result<Vec<_>>>()?;

// get schema representing all available input fields. This is used for data type
// resolution only, so order does not matter here
let mut schema = new_inputs.iter().map(|input| input.schema()).fold(
DFSchema::empty(),
|mut lhs, rhs| {
lhs.merge(rhs);
lhs
},
);

// merge the outer schema for correlated subqueries
// like case:
// select t2.c2 from t1 where t1.c1 in (select t2.c1 from t2 where t2.c2=t1.c3)
schema.merge(external_schema);

let mut expr_rewrite = TypeCoercionRewriter {
schema: Arc::new(schema),
};

let original_expr_names: Vec<Option<String>> = plan
.expressions()
.iter()
.map(|expr| expr.name().ok())
.collect();

let new_expr = plan
.expressions()
.into_iter()
.zip(original_expr_names)
.map(|(expr, original_name)| {
let expr = expr.rewrite(&mut expr_rewrite)?;

// ensure aggregate names don't change:
// https://github.com/apache/arrow-datafusion/issues/3555
if matches!(expr, Expr::AggregateFunction { .. }) {
if let Some((alias, name)) = original_name.zip(expr.name().ok()) {
if alias != name {
return Ok(expr.alias(&alias));
}
}
let original_expr_names: Vec<Option<String>> = plan
.expressions()
.iter()
.map(|expr| expr.name().ok())
.collect();

let new_expr = plan
.expressions()
.into_iter()
.zip(original_expr_names)
.map(|(expr, original_name)| {
let expr = expr.rewrite(&mut expr_rewrite)?;

// ensure aggregate names don't change:
// https://github.com/apache/arrow-datafusion/issues/3555
if matches!(expr, Expr::AggregateFunction { .. }) {
if let Some((alias, name)) = original_name.zip(expr.name().ok()) {
if alias != name {
return Ok(expr.alias(&alias));
}
}
}

Ok(expr)
})
.collect::<Result<Vec<_>>>()?;
Ok(expr)
})
.collect::<Result<Vec<_>>>()?;

from_plan(plan, &new_expr, &new_inputs)
}
from_plan(plan, &new_expr, &new_inputs)
}

pub(crate) struct TypeCoercionRewriter {
|
|
@@ -119,6 +134,41 @@ impl ExprRewriter for TypeCoercionRewriter {

fn mutate(&mut self, expr: Expr) -> Result<Expr> {
match expr {
Expr::ScalarSubquery(Subquery { subquery }) => {
let mut optimizer_config = OptimizerConfig::new();
|
Contributor
I think we should pass through the same OptimizerConfig (so it has the same context needed to evaluate …

Contributor (Author)
After we move the … I think the …

Contributor (Author)
I think when we complete this issue #3582, this parameter will be dropped.
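A rough sketch of what "passing through the same OptimizerConfig" could look like, written as if inside datafusion/optimizer/src/type_coercion.rs: the rewriter keeps a mutable borrow of the config the rule was invoked with and hands it to optimize_internal for subquery plans, so they share the same context (for example the timestamp used to evaluate now()). The lifetime and field names here are illustrative assumptions, not code from this PR.

```rust
use datafusion_common::DFSchemaRef;

use crate::OptimizerConfig;

// Sketch only: thread the caller's OptimizerConfig into the rewriter instead
// of creating a fresh OptimizerConfig::new() per subquery.
pub(crate) struct TypeCoercionRewriter<'a> {
    pub(crate) schema: DFSchemaRef,
    pub(crate) optimizer_config: &'a mut OptimizerConfig,
}

// Inside ExprRewriter::mutate, the scalar-subquery arm would then become:
// Expr::ScalarSubquery(Subquery { subquery }) => {
//     let new_plan =
//         optimize_internal(&self.schema, &subquery, self.optimizer_config)?;
//     Ok(Expr::ScalarSubquery(Subquery::new(new_plan)))
// }
```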
let new_plan =
optimize_internal(&self.schema, &subquery, &mut optimizer_config)?;
Ok(Expr::ScalarSubquery(Subquery::new(new_plan)))
}
Expr::Exists { subquery, negated } => {
let mut optimizer_config = OptimizerConfig::new();
let new_plan = optimize_internal(
|
Contributor
I don't understand why the outer query's schema needs to be passed while optimizing the subquery. I would expect that we would recursively optimize the …

Contributor
As in, I think this should be more like:
let optimizer = TypeCoercion::new();
let new_plan = optimizer.optimize(&subquery.subquery, &mut optimizer_config)

Contributor (Author)
I also want this, but there are some correlated subqueries, like the comments. For the subquery … this is the way that I found; do you @andygrove @alamb have any thoughts about this?

Contributor
I think I would need to try it myself
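For context, a minimal sketch of the correlated case the author is describing (the t1/t2 tables and their column types are made up for illustration): in select t2.c2 from t1 where t1.c1 in (select t2.c1 from t2 where t2.c2 = t1.c3), the inner predicate t2.c2 = t1.c3 references t1.c3, whose type only exists in the outer plan's schema, so coercing the subquery in isolation cannot resolve it.

```rust
use std::collections::HashMap;

use arrow::datatypes::DataType;
use datafusion_common::{DFField, DFSchema, Result};

fn merged_schema_for_subquery() -> Result<DFSchema> {
    // schema visible inside the subquery (table t2 only)
    let mut inner = DFSchema::new_with_metadata(
        vec![
            DFField::new(Some("t2"), "c1", DataType::Int32, true),
            DFField::new(Some("t2"), "c2", DataType::Int64, true),
        ],
        HashMap::new(),
    )?;
    // outer (external) schema contributed by t1; without this merge,
    // t1.c3 in `t2.c2 = t1.c3` has no known type during coercion
    let outer = DFSchema::new_with_metadata(
        vec![DFField::new(Some("t1"), "c3", DataType::Int32, true)],
        HashMap::new(),
    )?;
    inner.merge(&outer);
    Ok(inner)
}
```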
&self.schema,
&subquery.subquery,
&mut optimizer_config,
)?;
Ok(Expr::Exists {
subquery: Subquery::new(new_plan),
negated,
})
}
Expr::InSubquery {
expr,
subquery,
negated,
} => {
let mut optimizer_config = OptimizerConfig::new();
let new_plan = optimize_internal(
&self.schema,
&subquery.subquery,
&mut optimizer_config,
)?;
Ok(Expr::InSubquery {
expr,
subquery: Subquery::new(new_plan),
negated,
})
}
Expr::IsTrue(expr) => {
let expr = is_true(get_casted_expr_for_bool_op(&expr, &self.schema)?);
Ok(expr)
|
|
@@ -368,11 +418,12 @@ fn coerce_arguments_for_signature(

#[cfg(test)]
mod test {
use crate::type_coercion::TypeCoercion;
use crate::type_coercion::{TypeCoercion, TypeCoercionRewriter};
use crate::{OptimizerConfig, OptimizerRule};
use arrow::datatypes::DataType;
use datafusion_common::{DFField, DFSchema, Result, ScalarValue};
use datafusion_expr::{col, ColumnarValue};
use datafusion_expr::expr_rewriter::ExprRewritable;
use datafusion_expr::{cast, col, is_true, ColumnarValue};
use datafusion_expr::{
lit,
logical_plan::{EmptyRelation, Projection},
|
|
@@ -735,4 +786,25 @@ mod test {
),
}))
}

#[test]
fn test_type_coercion_rewrite() -> Result<()> {
let schema = Arc::new(
DFSchema::new_with_metadata(
vec![DFField::new(None, "a", DataType::Int64, true)],
std::collections::HashMap::new(),
)
.unwrap(),
);
let mut rewriter = TypeCoercionRewriter::new(schema);
let expr = is_true(lit(12i32).eq(lit(13i64)));
let expected = is_true(
cast(lit(ScalarValue::Int32(Some(12))), DataType::Int64)
.eq(lit(ScalarValue::Int64(Some(13)))),
);
let result = expr.rewrite(&mut rewriter)?;
assert_eq!(expected, result);
Ok(())
// TODO add more test for this
|
Contributor
I agree some more tests would be good
}
}
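On the "more tests" point above, one additional case in the same style (a hypothetical follow-up, not part of this PR) could exercise coercion between a column from the rewriter's schema and a literal of a narrower type; the expectation sketched here assumes the rewriter casts only the side whose type differs from the coerced type.

```rust
// Hypothetical extra test for the same module: with `a` declared as Int64 in
// the schema, the Int32 literal should be cast up to Int64 while the column
// itself stays untouched.
#[test]
fn test_type_coercion_rewrite_column_vs_literal() -> Result<()> {
    let schema = Arc::new(
        DFSchema::new_with_metadata(
            vec![DFField::new(None, "a", DataType::Int64, true)],
            std::collections::HashMap::new(),
        )
        .unwrap(),
    );
    let mut rewriter = TypeCoercionRewriter::new(schema);
    let expr = col("a").eq(lit(10i32));
    let expected =
        col("a").eq(cast(lit(ScalarValue::Int32(Some(10))), DataType::Int64));
    let result = expr.rewrite(&mut rewriter)?;
    assert_eq!(expected, result);
    Ok(())
}
```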
@@ -109,10 +109,9 @@ fn test_sql(sql: &str) -> Result<LogicalPlan> {
// TODO should make align with rules in the context
// https://github.com/apache/arrow-datafusion/issues/3524
let rules: Vec<Arc<dyn OptimizerRule + Sync + Send>> = vec![
// Simplify expressions first to maximize the chance
// of applying other optimizations
Arc::new(SimplifyExpressions::new()),
Arc::new(PreCastLitInComparisonExpressions::new()),
Arc::new(TypeCoercion::new()),
|
Contributor
👍
Arc::new(SimplifyExpressions::new()),
Arc::new(DecorrelateWhereExists::new()),
Arc::new(DecorrelateWhereIn::new()),
Arc::new(ScalarSubqueryToJoin::new()),
|
|
@@ -125,9 +124,6 @@ fn test_sql(sql: &str) -> Result<LogicalPlan> {
Arc::new(RewriteDisjunctivePredicate::new()),
Arc::new(FilterNullJoinKeys::default()),
Arc::new(ReduceOuterJoin::new()),
Arc::new(TypeCoercion::new()),
// after the type coercion, can do simplify expression again
Arc::new(SimplifyExpressions::new()),
Arc::new(FilterPushDown::new()),
Arc::new(LimitPushDown::new()),
Arc::new(SingleDistinctToGroupBy::new()),
|
|
Now that we have TypeCoercion, I wonder if we still need PreCastLitInComparisonExpressions:
https://github.com/apache/arrow-datafusion/blob/d16457a0ba129b077935078e5cf89d028f598e0b/datafusion/optimizer/src/pre_cast_lit_in_comparison.rs#L31-L50
It appears to be a subset of what TypeCoercion is doing.

Though perhaps this is something similar to what you have described in #3622

The PreCastLitInComparisonExpressions rule will become invalid once we move the type coercion in front of PreCastLitInComparisonExpressions. In the next PR, I will do #3622 and move the type coercion in front of PreCastLitInComparisonExpressions.
I think type coercion does just one thing, which is to make types compatible in all operations, while PreCastLitInComparisonExpressions (and #3622) is about reducing the cast on the column expr or other exprs by putting the cast on the literal instead. This will reduce the cast effort at runtime.
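To make the distinction concrete, a small sketch using datafusion_expr builders (the column name c1 and its Int32 type are made up for illustration, and this is not the rules' actual output): type coercion widens the column side to match an Int64 literal, whereas a pre-cast-literal style rewrite (PreCastLitInComparisonExpressions / #3622) keeps the column untouched and narrows the literal, so no per-row cast is needed at runtime.

```rust
use arrow::datatypes::DataType;
use datafusion_expr::{cast, col, lit};

fn main() {
    // c1 is assumed to be Int32, the literal is Int64

    // roughly what type coercion alone would leave behind:
    // CAST(c1 AS Int64) > Int64(10)
    let coerced = cast(col("c1"), DataType::Int64).gt(lit(10i64));

    // roughly what the pre-cast-literal rewrite aims for:
    // c1 > Int32(10), with no cast on the column
    let pre_cast = col("c1").gt(lit(10i32));

    println!("{:?} vs {:?}", coerced, pre_cast);
}
```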