From 318b601a6bd1ee26d7e459bd6df6a3a75928be76 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 8 Sep 2022 16:50:54 -0600 Subject: [PATCH 1/7] Fix schema bug in TypeCoercion rule --- datafusion/optimizer/src/type_coercion.rs | 25 ++++++++++--------- .../optimizer/tests/integration-test.rs | 20 +++++++++++++++ 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/datafusion/optimizer/src/type_coercion.rs b/datafusion/optimizer/src/type_coercion.rs index 42c081af3dfe1..522eebc93c6d7 100644 --- a/datafusion/optimizer/src/type_coercion.rs +++ b/datafusion/optimizer/src/type_coercion.rs @@ -21,12 +21,11 @@ use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::{DFSchema, DFSchemaRef, Result}; use datafusion_expr::binary_rule::coerce_types; use datafusion_expr::expr_rewriter::{ExprRewritable, ExprRewriter, RewriteRecursion}; -use datafusion_expr::logical_plan::builder::build_join_schema; -use datafusion_expr::logical_plan::JoinType; use datafusion_expr::type_coercion::data_types; use datafusion_expr::utils::from_plan; use datafusion_expr::{Expr, LogicalPlan}; use datafusion_expr::{ExprSchemable, Signature}; +use std::sync::Arc; #[derive(Default)] pub struct TypeCoercion {} @@ -54,17 +53,19 @@ impl OptimizerRule for TypeCoercion { .map(|p| self.optimize(p, optimizer_config)) .collect::>>()?; - let schema = match new_inputs.len() { - 1 => new_inputs[0].schema().clone(), - 2 => DFSchemaRef::new(build_join_schema( - new_inputs[0].schema(), - new_inputs[1].schema(), - &JoinType::Inner, - )?), - _ => DFSchemaRef::new(DFSchema::empty()), - }; + // get schema representing all available input fields. This is used for data type + // resolution only, so order does not matter here + let schema = new_inputs.iter().map(|input| input.schema()).fold( + DFSchema::empty(), + |mut lhs, rhs| { + lhs.merge(rhs); + lhs + }, + ); - let mut expr_rewrite = TypeCoercionRewriter { schema }; + let mut expr_rewrite = TypeCoercionRewriter { + schema: Arc::new(schema), + }; let new_expr = plan .expressions() diff --git a/datafusion/optimizer/tests/integration-test.rs b/datafusion/optimizer/tests/integration-test.rs index 55c38689bdfb6..d433b46961629 100644 --- a/datafusion/optimizer/tests/integration-test.rs +++ b/datafusion/optimizer/tests/integration-test.rs @@ -34,6 +34,7 @@ use datafusion_optimizer::scalar_subquery_to_join::ScalarSubqueryToJoin; use datafusion_optimizer::simplify_expressions::SimplifyExpressions; use datafusion_optimizer::single_distinct_to_groupby::SingleDistinctToGroupBy; use datafusion_optimizer::subquery_filter_to_join::SubqueryFilterToJoin; +use datafusion_optimizer::type_coercion::TypeCoercion; use datafusion_optimizer::{OptimizerConfig, OptimizerRule}; use datafusion_sql::planner::{ContextProvider, SqlToRel}; use datafusion_sql::sqlparser::ast::Statement; @@ -56,6 +57,24 @@ fn distribute_by() -> Result<()> { Ok(()) } +#[test] +fn intersect() -> Result<()> { + let sql = "SELECT col_int32, col_utf8 FROM test \ + INTERSECT SELECT col_int32, col_utf8 FROM test \ + INTERSECT SELECT col_int32, col_utf8 FROM test"; + let plan = test_sql(sql)?; + let expected = + "Semi Join: #test.col_int32 = #test.col_int32, #test.col_utf8 = #test.col_utf8\ + \n Distinct:\ + \n Semi Join: #test.col_int32 = #test.col_int32, #test.col_utf8 = #test.col_utf8\ + \n Distinct:\ + \n TableScan: test projection=[col_int32, col_utf8]\ + \n TableScan: test projection=[col_int32, col_utf8]\ + \n TableScan: test projection=[col_int32, col_utf8]"; + assert_eq!(expected, format!("{:?}", plan)); + Ok(()) +} + fn test_sql(sql: &str) -> Result { let rules: Vec> = vec![ // Simplify expressions first to maximize the chance @@ -73,6 +92,7 @@ fn test_sql(sql: &str) -> Result { Arc::new(FilterNullJoinKeys::default()), Arc::new(ReduceOuterJoin::new()), Arc::new(FilterPushDown::new()), + Arc::new(TypeCoercion::new()), Arc::new(LimitPushDown::new()), Arc::new(SingleDistinctToGroupBy::new()), ]; From d45034680977d9e3923ce77f1d9a690c3f47bdeb Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 8 Sep 2022 17:11:19 -0600 Subject: [PATCH 2/7] Add type coercion for between --- datafusion/optimizer/src/type_coercion.rs | 15 +++++++++ .../optimizer/tests/integration-test.rs | 32 +++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/datafusion/optimizer/src/type_coercion.rs b/datafusion/optimizer/src/type_coercion.rs index 522eebc93c6d7..fb736ab5346e3 100644 --- a/datafusion/optimizer/src/type_coercion.rs +++ b/datafusion/optimizer/src/type_coercion.rs @@ -98,6 +98,21 @@ impl ExprRewriter for TypeCoercionRewriter { right: Box::new(right.cast_to(&coerced_type, &self.schema)?), }) } + Expr::Between { + expr, + negated, + low, + high, + } => { + // cast low and high values to same type as the expression + let coerced_type = expr.get_type(&self.schema)?; + Ok(Expr::Between { + expr, + negated, + low: Box::new(low.cast_to(&coerced_type, &self.schema)?), + high: Box::new(high.cast_to(&coerced_type, &self.schema)?), + }) + } Expr::ScalarUDF { fun, args } => { let new_expr = coerce_arguments_for_signature( args.as_slice(), diff --git a/datafusion/optimizer/tests/integration-test.rs b/datafusion/optimizer/tests/integration-test.rs index d433b46961629..064afbff21238 100644 --- a/datafusion/optimizer/tests/integration-test.rs +++ b/datafusion/optimizer/tests/integration-test.rs @@ -27,6 +27,7 @@ use datafusion_optimizer::filter_null_join_keys::FilterNullJoinKeys; use datafusion_optimizer::filter_push_down::FilterPushDown; use datafusion_optimizer::limit_push_down::LimitPushDown; use datafusion_optimizer::optimizer::Optimizer; +use datafusion_optimizer::pre_cast_lit_in_comparison::PreCastLitInComparisonExpressions; use datafusion_optimizer::projection_push_down::ProjectionPushDown; use datafusion_optimizer::reduce_outer_join::ReduceOuterJoin; use datafusion_optimizer::rewrite_disjunctive_predicate::RewriteDisjunctivePredicate; @@ -75,11 +76,40 @@ fn intersect() -> Result<()> { Ok(()) } +#[test] +fn between_date32_plus_interval() -> Result<()> { + let sql = "SELECT count(1) FROM test \ + WHERE col_date32 between '1998-03-18' AND (cast('1998-03-18' as date) + INTERVAL '90 days')"; + let plan = test_sql(sql)?; + let expected = + "Projection: #COUNT(UInt8(1))\ + \n Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]]\ + \n Filter: #test.col_date32 BETWEEN CAST(Utf8(\"1998-03-18\") AS Date32) AND Date32(\"10393\")\ + \n TableScan: test projection=[col_date32]"; + assert_eq!(expected, format!("{:?}", plan)); + Ok(()) +} + +#[test] +fn between_date64_plus_interval() -> Result<()> { + let sql = "SELECT count(1) FROM test \ + WHERE col_date64 between '1998-03-18' AND (cast('1998-03-18' as date) + INTERVAL '90 days')"; + let plan = test_sql(sql)?; + let expected = + "Projection: #COUNT(UInt8(1))\ + \n Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]]\ + \n Filter: #test.col_date64 BETWEEN CAST(Utf8(\"1998-03-18\") AS Date64) AND CAST(Date32(\"10393\") AS Date64)\ + \n TableScan: test projection=[col_date64]"; + assert_eq!(expected, format!("{:?}", plan)); + Ok(()) +} + fn test_sql(sql: &str) -> Result { let rules: Vec> = vec![ // Simplify expressions first to maximize the chance // of applying other optimizations Arc::new(SimplifyExpressions::new()), + Arc::new(PreCastLitInComparisonExpressions::new()), Arc::new(DecorrelateWhereExists::new()), Arc::new(DecorrelateWhereIn::new()), Arc::new(ScalarSubqueryToJoin::new()), @@ -127,6 +157,8 @@ impl ContextProvider for MySchemaProvider { vec![ Field::new("col_int32", DataType::Int32, true), Field::new("col_utf8", DataType::Utf8, true), + Field::new("col_date32", DataType::Date32, true), + Field::new("col_date64", DataType::Date64, true), ], HashMap::new(), ); From f4283f5a68bfd32c79a7362974ee0dd3cf718e36 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 8 Sep 2022 17:39:49 -0600 Subject: [PATCH 3/7] add workaround for INTERVAL --- datafusion/optimizer/src/type_coercion.rs | 68 +++++++++++++++++++---- 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/datafusion/optimizer/src/type_coercion.rs b/datafusion/optimizer/src/type_coercion.rs index fb736ab5346e3..af1ad4de56bd8 100644 --- a/datafusion/optimizer/src/type_coercion.rs +++ b/datafusion/optimizer/src/type_coercion.rs @@ -18,6 +18,7 @@ //! Optimizer rule for type validation and coercion use crate::{OptimizerConfig, OptimizerRule}; +use arrow::datatypes::DataType; use datafusion_common::{DFSchema, DFSchemaRef, Result}; use datafusion_expr::binary_rule::coerce_types; use datafusion_expr::expr_rewriter::{ExprRewritable, ExprRewriter, RewriteRecursion}; @@ -87,16 +88,35 @@ impl ExprRewriter for TypeCoercionRewriter { } fn mutate(&mut self, expr: Expr) -> Result { + println!("mutate: {}", expr); match expr { - Expr::BinaryExpr { left, op, right } => { + Expr::BinaryExpr { + ref left, + op, + ref right, + } => { let left_type = left.get_type(&self.schema)?; let right_type = right.get_type(&self.schema)?; - let coerced_type = coerce_types(&left_type, &op, &right_type)?; - Ok(Expr::BinaryExpr { - left: Box::new(left.cast_to(&coerced_type, &self.schema)?), - op, - right: Box::new(right.cast_to(&coerced_type, &self.schema)?), - }) + match (&left_type, &right_type) { + (_, &DataType::Interval(_)) => { + // bug + // Arrow `can_cast_types` says we cannot cast an Interval to Date32/Date64 + // which contradicts DataFusion's `coerce_types` + Ok(expr.clone()) + } + _ => { + let coerced_type = coerce_types(&left_type, &op, &right_type)?; + Ok(Expr::BinaryExpr { + left: Box::new( + left.clone().cast_to(&coerced_type, &self.schema)?, + ), + op, + right: Box::new( + right.clone().cast_to(&coerced_type, &self.schema)?, + ), + }) + } + } } Expr::Between { expr, @@ -161,12 +181,12 @@ mod test { use crate::type_coercion::TypeCoercion; use crate::{OptimizerConfig, OptimizerRule}; use arrow::datatypes::DataType; - use datafusion_common::{DFSchema, Result}; + use datafusion_common::{DFSchema, Result, ScalarValue}; use datafusion_expr::{ lit, logical_plan::{EmptyRelation, Projection}, - Expr, LogicalPlan, ReturnTypeFunction, ScalarFunctionImplementation, ScalarUDF, - Signature, Volatility, + Expr, LogicalPlan, Operator, ReturnTypeFunction, ScalarFunctionImplementation, + ScalarUDF, Signature, Volatility, }; use std::sync::Arc; @@ -260,6 +280,34 @@ mod test { Ok(()) } + #[test] + fn binary_op_date32_add_interval() -> Result<()> { + //CAST(Utf8("1998-03-18") AS Date32) + IntervalDayTime("386547056640") + let expr = Expr::BinaryExpr { + left: Box::new(Expr::Cast { + expr: Box::new(lit("1998-03-18")), + data_type: DataType::Date32, + }), + op: Operator::Plus, + right: Box::new(Expr::Literal(ScalarValue::IntervalDayTime(Some( + 386547056640, + )))), + }; + let empty = Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: Arc::new(DFSchema::empty()), + })); + let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty, None)?); + let rule = TypeCoercion::new(); + let mut config = OptimizerConfig::default(); + let plan = rule.optimize(&plan, &mut config)?; + assert_eq!( + "Projection: CAST(Utf8(\"1998-03-18\") AS Date32) + IntervalDayTime(\"386547056640\")\n EmptyRelation", + &format!("{:?}", plan) + ); + Ok(()) + } + fn empty() -> Arc { Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, From 4e120379a16aa2d40d905a01db23d0beeb233c29 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 8 Sep 2022 18:12:31 -0600 Subject: [PATCH 4/7] fix regression --- datafusion/core/tests/sql/predicates.rs | 2 +- datafusion/optimizer/src/type_coercion.rs | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/datafusion/core/tests/sql/predicates.rs b/datafusion/core/tests/sql/predicates.rs index 3c11b690d292a..350d521b518e3 100644 --- a/datafusion/core/tests/sql/predicates.rs +++ b/datafusion/core/tests/sql/predicates.rs @@ -428,7 +428,7 @@ async fn multiple_or_predicates() -> Result<()> { "Explain [plan_type:Utf8, plan:Utf8]", " Projection: #lineitem.l_partkey [l_partkey:Int64]", " Projection: #part.p_partkey = #lineitem.l_partkey AS #part.p_partkey = #lineitem.l_partkey#lineitem.l_partkey#part.p_partkey, #lineitem.l_partkey, #lineitem.l_quantity, #part.p_brand, #part.p_size [#part.p_partkey = #lineitem.l_partkey#lineitem.l_partkey#part.p_partkey:Boolean;N, l_partkey:Int64, l_quantity:Float64, p_brand:Utf8, p_size:Int32]", - " Filter: #part.p_partkey = #lineitem.l_partkey AND #part.p_brand = Utf8(\"Brand#12\") AND #lineitem.l_quantity >= CAST(Int64(1) AS Float64) AND #lineitem.l_quantity <= CAST(Int64(11) AS Float64) AND #part.p_size BETWEEN Int64(1) AND Int64(5) OR #part.p_brand = Utf8(\"Brand#23\") AND #lineitem.l_quantity >= CAST(Int64(10) AS Float64) AND #lineitem.l_quantity <= CAST(Int64(20) AS Float64) AND #part.p_size BETWEEN Int64(1) AND Int64(10) OR #part.p_brand = Utf8(\"Brand#34\") AND #lineitem.l_quantity >= CAST(Int64(20) AS Float64) AND #lineitem.l_quantity <= CAST(Int64(30) AS Float64) AND #part.p_size BETWEEN Int64(1) AND Int64(15) [l_partkey:Int64, l_quantity:Float64, p_partkey:Int64, p_brand:Utf8, p_size:Int32]", + " Filter: #part.p_partkey = #lineitem.l_partkey AND #part.p_brand = Utf8(\"Brand#12\") AND #lineitem.l_quantity >= CAST(Int64(1) AS Float64) AND #lineitem.l_quantity <= CAST(Int64(11) AS Float64) AND CAST(#part.p_size AS Int64) BETWEEN Int64(1) AND Int64(5) OR #part.p_brand = Utf8(\"Brand#23\") AND #lineitem.l_quantity >= CAST(Int64(10) AS Float64) AND #lineitem.l_quantity <= CAST(Int64(20) AS Float64) AND CAST(#part.p_size AS Int64) BETWEEN Int64(1) AND Int64(10) OR #part.p_brand = Utf8(\"Brand#34\") AND #lineitem.l_quantity >= CAST(Int64(20) AS Float64) AND #lineitem.l_quantity <= CAST(Int64(30) AS Float64) AND CAST(#part.p_size AS Int64) BETWEEN Int64(1) AND Int64(15) [l_partkey:Int64, l_quantity:Float64, p_partkey:Int64, p_brand:Utf8, p_size:Int32]", " CrossJoin: [l_partkey:Int64, l_quantity:Float64, p_partkey:Int64, p_brand:Utf8, p_size:Int32]", " TableScan: lineitem projection=[l_partkey, l_quantity] [l_partkey:Int64, l_quantity:Float64]", " TableScan: part projection=[p_partkey, p_brand, p_size] [p_partkey:Int64, p_brand:Utf8, p_size:Int32]", diff --git a/datafusion/optimizer/src/type_coercion.rs b/datafusion/optimizer/src/type_coercion.rs index af1ad4de56bd8..d65b85a7c225f 100644 --- a/datafusion/optimizer/src/type_coercion.rs +++ b/datafusion/optimizer/src/type_coercion.rs @@ -19,8 +19,8 @@ use crate::{OptimizerConfig, OptimizerRule}; use arrow::datatypes::DataType; -use datafusion_common::{DFSchema, DFSchemaRef, Result}; -use datafusion_expr::binary_rule::coerce_types; +use datafusion_common::{DFSchema, DFSchemaRef, DataFusionError, Result}; +use datafusion_expr::binary_rule::{coerce_types, comparison_coercion}; use datafusion_expr::expr_rewriter::{ExprRewritable, ExprRewriter, RewriteRecursion}; use datafusion_expr::type_coercion::data_types; use datafusion_expr::utils::from_plan; @@ -88,7 +88,6 @@ impl ExprRewriter for TypeCoercionRewriter { } fn mutate(&mut self, expr: Expr) -> Result { - println!("mutate: {}", expr); match expr { Expr::BinaryExpr { ref left, @@ -98,10 +97,12 @@ impl ExprRewriter for TypeCoercionRewriter { let left_type = left.get_type(&self.schema)?; let right_type = right.get_type(&self.schema)?; match (&left_type, &right_type) { - (_, &DataType::Interval(_)) => { - // bug - // Arrow `can_cast_types` says we cannot cast an Interval to Date32/Date64 - // which contradicts DataFusion's `coerce_types` + ( + DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _), + &DataType::Interval(_), + ) => { + // Arrow `can_cast_types` says we cannot cast an Interval to + // Date32/Date64/Timestamp, which contradicts DataFusion's `coerce_types` Ok(expr.clone()) } _ => { @@ -124,10 +125,12 @@ impl ExprRewriter for TypeCoercionRewriter { low, high, } => { - // cast low and high values to same type as the expression - let coerced_type = expr.get_type(&self.schema)?; + let expr_type = expr.get_type(&self.schema)?; + let low_type = low.get_type(&self.schema)?; + let coerced_type = comparison_coercion(&expr_type, &low_type) + .ok_or_else(|| DataFusionError::Internal("".to_string()))?; Ok(Expr::Between { - expr, + expr: Box::new(expr.cast_to(&coerced_type, &self.schema)?), negated, low: Box::new(low.cast_to(&coerced_type, &self.schema)?), high: Box::new(high.cast_to(&coerced_type, &self.schema)?), From 73ebe71b0e9c8ce5fd89360d06e763b909ce0725 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 9 Sep 2022 08:24:12 -0600 Subject: [PATCH 5/7] add support for coercion between Date32/Date64 and fix regressions caused by recent merges to master --- datafusion/expr/src/binary_rule.rs | 2 ++ datafusion/optimizer/tests/integration-test.rs | 14 ++++++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/datafusion/expr/src/binary_rule.rs b/datafusion/expr/src/binary_rule.rs index 5eeb1867a0170..986decee35274 100644 --- a/datafusion/expr/src/binary_rule.rs +++ b/datafusion/expr/src/binary_rule.rs @@ -516,6 +516,8 @@ fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Some(Date64), + (Date32, Date64) => Some(Date64), (Utf8, Date32) => Some(Date32), (Date32, Utf8) => Some(Date32), (Utf8, Date64) => Some(Date64), diff --git a/datafusion/optimizer/tests/integration-test.rs b/datafusion/optimizer/tests/integration-test.rs index 064afbff21238..87a0bab68a40a 100644 --- a/datafusion/optimizer/tests/integration-test.rs +++ b/datafusion/optimizer/tests/integration-test.rs @@ -79,12 +79,11 @@ fn intersect() -> Result<()> { #[test] fn between_date32_plus_interval() -> Result<()> { let sql = "SELECT count(1) FROM test \ - WHERE col_date32 between '1998-03-18' AND (cast('1998-03-18' as date) + INTERVAL '90 days')"; + WHERE col_date32 between '1998-03-18' AND cast('1998-03-18' as date) + INTERVAL '90 days'"; let plan = test_sql(sql)?; let expected = - "Projection: #COUNT(UInt8(1))\ - \n Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]]\ - \n Filter: #test.col_date32 BETWEEN CAST(Utf8(\"1998-03-18\") AS Date32) AND Date32(\"10393\")\ + "Projection: #COUNT(UInt8(1))\n Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]]\ + \n Filter: #test.col_date32 >= CAST(Utf8(\"1998-03-18\") AS Date32) AND #test.col_date32 <= Date32(\"10393\")\ \n TableScan: test projection=[col_date32]"; assert_eq!(expected, format!("{:?}", plan)); Ok(()) @@ -93,12 +92,11 @@ fn between_date32_plus_interval() -> Result<()> { #[test] fn between_date64_plus_interval() -> Result<()> { let sql = "SELECT count(1) FROM test \ - WHERE col_date64 between '1998-03-18' AND (cast('1998-03-18' as date) + INTERVAL '90 days')"; + WHERE col_date64 between '1998-03-18' AND cast('1998-03-18' as date) + INTERVAL '90 days'"; let plan = test_sql(sql)?; let expected = - "Projection: #COUNT(UInt8(1))\ - \n Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]]\ - \n Filter: #test.col_date64 BETWEEN CAST(Utf8(\"1998-03-18\") AS Date64) AND CAST(Date32(\"10393\") AS Date64)\ + "Projection: #COUNT(UInt8(1))\n Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]]\ + \n Filter: #test.col_date64 >= CAST(Utf8(\"1998-03-18\") AS Date64) AND #test.col_date64 <= CAST(Date32(\"10393\") AS Date64)\ \n TableScan: test projection=[col_date64]"; assert_eq!(expected, format!("{:?}", plan)); Ok(()) From e1a0ab6eaa262f8edc854df4e2b61c80fdce4641 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 9 Sep 2022 09:12:28 -0600 Subject: [PATCH 6/7] fix error message --- datafusion/optimizer/src/type_coercion.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/type_coercion.rs b/datafusion/optimizer/src/type_coercion.rs index d65b85a7c225f..f946c5c8ad7b6 100644 --- a/datafusion/optimizer/src/type_coercion.rs +++ b/datafusion/optimizer/src/type_coercion.rs @@ -128,7 +128,12 @@ impl ExprRewriter for TypeCoercionRewriter { let expr_type = expr.get_type(&self.schema)?; let low_type = low.get_type(&self.schema)?; let coerced_type = comparison_coercion(&expr_type, &low_type) - .ok_or_else(|| DataFusionError::Internal("".to_string()))?; + .ok_or_else(|| { + DataFusionError::Internal(format!( + "Failed to coerce types {} and {} in BETWEEN expression", + expr_type, low_type + )) + })?; Ok(Expr::Between { expr: Box::new(expr.cast_to(&coerced_type, &self.schema)?), negated, From f81fb7c3ed83af1f6dae592aa13ccfcbe1f56ca5 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 9 Sep 2022 10:47:54 -0600 Subject: [PATCH 7/7] update comments and link to follow-on issue --- datafusion/expr/src/binary_rule.rs | 6 ++++++ datafusion/optimizer/src/type_coercion.rs | 3 +-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/binary_rule.rs b/datafusion/expr/src/binary_rule.rs index 986decee35274..29c1cf0d4174b 100644 --- a/datafusion/expr/src/binary_rule.rs +++ b/datafusion/expr/src/binary_rule.rs @@ -72,6 +72,12 @@ pub fn binary_operator_data_type( /// Coercion rules for all binary operators. Returns the output type /// of applying `op` to an argument of `lhs_type` and `rhs_type`. +/// +/// TODO this function is trying to serve two purposes at once; it determines the result type +/// of the binary operation and also determines how the inputs can be coerced but this +/// results in inconsistencies in some cases (particular around date + interval) +/// +/// Tracking issue is https://github.com/apache/arrow-datafusion/issues/3419 pub fn coerce_types( lhs_type: &DataType, op: &Operator, diff --git a/datafusion/optimizer/src/type_coercion.rs b/datafusion/optimizer/src/type_coercion.rs index f946c5c8ad7b6..df0d3681177a1 100644 --- a/datafusion/optimizer/src/type_coercion.rs +++ b/datafusion/optimizer/src/type_coercion.rs @@ -101,8 +101,7 @@ impl ExprRewriter for TypeCoercionRewriter { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _), &DataType::Interval(_), ) => { - // Arrow `can_cast_types` says we cannot cast an Interval to - // Date32/Date64/Timestamp, which contradicts DataFusion's `coerce_types` + // this is a workaround for https://github.com/apache/arrow-datafusion/issues/3419 Ok(expr.clone()) } _ => {