From 53bbe407419254e69fdd07e15d00e15b45c40727 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 23 May 2020 07:24:18 -0600 Subject: [PATCH 1/4] re-implement get_supertype --- rust/datafusion/src/logicalplan.rs | 12 +- .../datafusion/src/optimizer/type_coercion.rs | 2 +- rust/datafusion/src/optimizer/utils.rs | 184 ++++++++---------- 3 files changed, 88 insertions(+), 110 deletions(-) diff --git a/rust/datafusion/src/logicalplan.rs b/rust/datafusion/src/logicalplan.rs index 704c5482543..eca9fcdd9b7 100644 --- a/rust/datafusion/src/logicalplan.rs +++ b/rust/datafusion/src/logicalplan.rs @@ -731,15 +731,18 @@ pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool { _ => false, }, Int16 => match type_from { - Int8 | Int16 | UInt8 => true, + Int8 | Int16 => true, + UInt8 => true, _ => false, }, Int32 => match type_from { - Int8 | Int16 | Int32 | UInt8 | UInt16 => true, + Int8 | Int16 | Int32 => true, + UInt8 | UInt16 => true, _ => false, }, Int64 => match type_from { - Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 => true, + Int8 | Int16 | Int32 | Int64 => true, + UInt8 | UInt16 | UInt32 => true, _ => false, }, UInt8 => match type_from { @@ -747,14 +750,17 @@ pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool { _ => false, }, UInt16 => match type_from { + Int8 => true, UInt8 | UInt16 => true, _ => false, }, UInt32 => match type_from { + Int8 | Int16 => true, UInt8 | UInt16 | UInt32 => true, _ => false, }, UInt64 => match type_from { + Int8 | Int16 | Int32 => true, UInt8 | UInt16 | UInt32 | UInt64 => true, _ => false, }, diff --git a/rust/datafusion/src/optimizer/type_coercion.rs b/rust/datafusion/src/optimizer/type_coercion.rs index 03a85ba4c85..50340dc2f0b 100644 --- a/rust/datafusion/src/optimizer/type_coercion.rs +++ b/rust/datafusion/src/optimizer/type_coercion.rs @@ -285,6 +285,6 @@ mod tests { let expr2 = rule.rewrite_expr(&expr, &schema).unwrap(); - assert_eq!(expected, format!("{:?}", expr2)); + assert_eq!(format!("{:?}", expr2), expected); } } diff --git a/rust/datafusion/src/optimizer/utils.rs b/rust/datafusion/src/optimizer/utils.rs index f9467a12091..ac706ab5165 100644 --- a/rust/datafusion/src/optimizer/utils.rs +++ b/rust/datafusion/src/optimizer/utils.rs @@ -131,116 +131,88 @@ pub fn exprlist_to_fields(expr: &[Expr], input_schema: &Schema) -> Result Result { - match _get_supertype(l, r) { - Some(dt) => Ok(dt), - None => _get_supertype(r, l).ok_or_else(|| { - ExecutionError::InternalError(format!( - "Failed to determine supertype of {:?} and {:?}", - l, r - )) - }), - } -} - -/// Given two datatypes, determine the supertype that both types can safely be cast to -fn _get_supertype(l: &DataType, r: &DataType) -> Option { use arrow::datatypes::DataType::*; - match (l, r) { - (UInt8, Int8) => Some(Int8), - (UInt8, Int16) => Some(Int16), - (UInt8, Int32) => Some(Int32), - (UInt8, Int64) => Some(Int64), - - (UInt16, Int16) => Some(Int16), - (UInt16, Int32) => Some(Int32), - (UInt16, Int64) => Some(Int64), - - (UInt32, Int32) => Some(Int32), - (UInt32, Int64) => Some(Int64), - - (UInt64, Int64) => Some(Int64), - - (Int8, UInt8) => Some(Int8), - - (Int16, UInt8) => Some(Int16), - (Int16, UInt16) => Some(Int16), - - (Int32, UInt8) => Some(Int32), - (Int32, UInt16) => Some(Int32), - (Int32, UInt32) => Some(Int32), - - (Int64, UInt8) => Some(Int64), - (Int64, UInt16) => Some(Int64), - (Int64, UInt32) => Some(Int64), - (Int64, UInt64) => Some(Int64), - - (UInt8, UInt8) => Some(UInt8), - (UInt8, UInt16) => Some(UInt16), - (UInt8, UInt32) => Some(UInt32), - (UInt8, UInt64) => Some(UInt64), - (UInt8, Float32) => Some(Float32), - (UInt8, Float64) => Some(Float64), - (UInt16, UInt8) => Some(UInt16), - (UInt16, UInt16) => Some(UInt16), - (UInt16, UInt32) => Some(UInt32), - (UInt16, UInt64) => Some(UInt64), - (UInt16, Float32) => Some(Float32), - (UInt16, Float64) => Some(Float64), - - (UInt32, UInt8) => Some(UInt32), - (UInt32, UInt16) => Some(UInt32), - (UInt32, UInt32) => Some(UInt32), - (UInt32, UInt64) => Some(UInt64), - (UInt32, Float32) => Some(Float32), - (UInt32, Float64) => Some(Float64), - - (UInt64, UInt8) => Some(UInt64), - (UInt64, UInt16) => Some(UInt64), - (UInt64, UInt32) => Some(UInt64), - (UInt64, UInt64) => Some(UInt64), - (UInt64, Float32) => Some(Float32), - (UInt64, Float64) => Some(Float64), - - (Int8, Int8) => Some(Int8), - (Int8, Int16) => Some(Int16), - (Int8, Int32) => Some(Int32), - (Int8, Int64) => Some(Int64), - (Int8, Float32) => Some(Float32), - (Int8, Float64) => Some(Float64), - - (Int16, Int8) => Some(Int16), - (Int16, Int16) => Some(Int16), - (Int16, Int32) => Some(Int32), - (Int16, Int64) => Some(Int64), - (Int16, Float32) => Some(Float32), - (Int16, Float64) => Some(Float64), - - (Int32, Int8) => Some(Int32), - (Int32, Int16) => Some(Int32), - (Int32, Int32) => Some(Int32), - (Int32, Int64) => Some(Int64), - (Int32, Float32) => Some(Float32), - (Int32, Float64) => Some(Float64), - - (Int64, Int8) => Some(Int64), - (Int64, Int16) => Some(Int64), - (Int64, Int32) => Some(Int64), - (Int64, Int64) => Some(Int64), - (Int64, Float32) => Some(Float32), - (Int64, Float64) => Some(Float64), - - (Float32, Float32) => Some(Float32), - (Float32, Float64) => Some(Float64), - (Float64, Float32) => Some(Float64), - (Float64, Float64) => Some(Float64), - - (Utf8, _) => Some(Utf8), - (_, Utf8) => Some(Utf8), - - (Boolean, Boolean) => Some(Boolean), + if l == r { + return Ok(l.clone()); + } + let super_type = match l { + UInt8 => match r { + UInt16 | UInt32 | UInt64 => Some(r.clone()), + Int16 | Int32 | Int64 => Some(r.clone()), + Float32 | Float64 => Some(r.clone()), + _ => None, + }, + UInt16 => match r { + UInt8 => Some(l.clone()), + UInt32 | UInt64 => Some(r.clone()), + Int8 => Some(l.clone()), + Int32 | Int64 => Some(r.clone()), + Float32 | Float64 => Some(r.clone()), + _ => None, + }, + UInt32 => match r { + UInt8 | UInt16 => Some(l.clone()), + UInt64 => Some(r.clone()), + Int8 | Int16 => Some(l.clone()), + Int64 => Some(r.clone()), + Float32 | Float64 => Some(r.clone()), + _ => None, + }, + UInt64 => match r { + UInt8 | UInt16 | UInt32 => Some(l.clone()), + Int8 | Int16 | Int32 => Some(l.clone()), + Float32 | Float64 => Some(r.clone()), + _ => None, + }, + Int8 => match r { + Int16 | Int32 | Int64 => Some(r.clone()), + UInt16 | UInt32 | UInt64 => Some(r.clone()), + Float32 | Float64 => Some(r.clone()), + _ => None, + }, + Int16 => match r { + Int8 => Some(l.clone()), + Int32 | Int64 => Some(r.clone()), + UInt32 | UInt64 => Some(r.clone()), + Float32 | Float64 => Some(r.clone()), + _ => None, + }, + Int32 => match r { + Int8 | Int16 => Some(l.clone()), + Int64 => Some(r.clone()), + UInt64 => Some(r.clone()), + Float32 | Float64 => Some(r.clone()), + _ => None, + }, + Int64 => match r { + Int8 | Int16 | Int32 => Some(l.clone()), + UInt8 | UInt16 | UInt32 => Some(l.clone()), + Float32 | Float64 => Some(r.clone()), + _ => None, + }, + Float32 => match r { + Int8 | Int16 | Int32 | Int64 => Some(Float32), + UInt8 | UInt16 | UInt32 | UInt64 => Some(Float32), + Float64 => Some(Float64), + _ => None, + }, + Float64 => match r { + Int8 | Int16 | Int32 | Int64 => Some(Float64), + UInt8 | UInt16 | UInt32 | UInt64 => Some(Float64), + Float32 | Float64 => Some(Float64), + _ => None, + }, _ => None, + }; + + match super_type { + Some(dt) => Ok(dt), + None => Err(ExecutionError::InternalError(format!( + "Failed to determine supertype of {:?} and {:?}", + l, r + ))), } } From cde5a0c3670e28e4955ef5133b6f504ccd7797d0 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 23 May 2020 10:09:37 -0600 Subject: [PATCH 2/4] tests --- rust/datafusion/src/logicalplan.rs | 5 ++++ .../datafusion/src/optimizer/type_coercion.rs | 25 ++++++++++++++++++- rust/datafusion/src/optimizer/utils.rs | 2 ++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/rust/datafusion/src/logicalplan.rs b/rust/datafusion/src/logicalplan.rs index eca9fcdd9b7..faef11fc7e3 100644 --- a/rust/datafusion/src/logicalplan.rs +++ b/rust/datafusion/src/logicalplan.rs @@ -725,6 +725,11 @@ impl fmt::Debug for LogicalPlan { /// Verify a given type cast can be performed pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool { use self::DataType::*; + + if type_from == type_into { + return true; + } + match type_into { Int8 => match type_from { Int8 => true, diff --git a/rust/datafusion/src/optimizer/type_coercion.rs b/rust/datafusion/src/optimizer/type_coercion.rs index 50340dc2f0b..443df15ed41 100644 --- a/rust/datafusion/src/optimizer/type_coercion.rs +++ b/rust/datafusion/src/optimizer/type_coercion.rs @@ -187,8 +187,10 @@ mod tests { use crate::execution::context::ExecutionContext; use crate::execution::physical_plan::csv::CsvReadOptions; use crate::logicalplan::Expr::*; - use crate::logicalplan::{col, Operator}; + use crate::logicalplan::{col, can_coerce_from, Operator}; + use crate::optimizer::utils::get_supertype; use crate::test::arrow_testdata_path; + use arrow::datatypes::DataType::*; use arrow::datatypes::{DataType, Field, Schema}; #[test] @@ -209,6 +211,27 @@ mod tests { format!("{:?}", plan).starts_with("Selection: CAST(#c7 AS Float64) Lt #c12") ); + #[test] + fn test_type_matrix() -> Result<()> { + let types = vec![ + Boolean, Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float32, + Float64, Utf8, + ]; + + for from_type in &types { + for to_type in &types { + match get_supertype(from_type, to_type) { + Ok(t) => { + // swapping from and to should result in same supertype + assert_eq!(t, get_supertype(to_type, from_type)?); + // both from and to types should be coercable to the supertype + assert!(can_coerce_from(&t, &from_type)); + assert!(can_coerce_from(&t, &to_type)); + } + Err(_) => assert!(get_supertype(to_type, from_type).is_err()), + } + } + } Ok(()) } diff --git a/rust/datafusion/src/optimizer/utils.rs b/rust/datafusion/src/optimizer/utils.rs index ac706ab5165..d329167596c 100644 --- a/rust/datafusion/src/optimizer/utils.rs +++ b/rust/datafusion/src/optimizer/utils.rs @@ -175,6 +175,7 @@ pub fn get_supertype(l: &DataType, r: &DataType) -> Result { Int16 => match r { Int8 => Some(l.clone()), Int32 | Int64 => Some(r.clone()), + UInt8 => Some(l.clone()), UInt32 | UInt64 => Some(r.clone()), Float32 | Float64 => Some(r.clone()), _ => None, @@ -182,6 +183,7 @@ pub fn get_supertype(l: &DataType, r: &DataType) -> Result { Int32 => match r { Int8 | Int16 => Some(l.clone()), Int64 => Some(r.clone()), + UInt8 | UInt16 => Some(l.clone()), UInt64 => Some(r.clone()), Float32 | Float64 => Some(r.clone()), _ => None, From 992a21a79cc90f69d2beab77e532f92575b92822 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 23 May 2020 15:30:52 -0600 Subject: [PATCH 3/4] remove support for casts and implicit coercions that could lose data --- rust/datafusion/src/execution/context.rs | 2 +- rust/datafusion/src/logicalplan.rs | 12 +++-------- .../datafusion/src/optimizer/type_coercion.rs | 21 ++++--------------- rust/datafusion/src/optimizer/utils.rs | 12 ----------- 4 files changed, 8 insertions(+), 39 deletions(-) diff --git a/rust/datafusion/src/execution/context.rs b/rust/datafusion/src/execution/context.rs index 9952f8cae2f..6f953bad139 100644 --- a/rust/datafusion/src/execution/context.rs +++ b/rust/datafusion/src/execution/context.rs @@ -658,7 +658,7 @@ mod tests { let mut ctx = create_ctx(&tmp_dir, partition_count)?; let logical_plan = - ctx.create_logical_plan("SELECT c1, c2 FROM test WHERE c1 > 0 AND c1 < 3")?; + ctx.create_logical_plan("SELECT c1, c2 FROM test WHERE CAST(c1 AS double) > 0 AND CAST(c1 AS double) < 3")?; let logical_plan = ctx.optimize(&logical_plan)?; let physical_plan = ctx.create_physical_plan(&logical_plan, 1024)?; diff --git a/rust/datafusion/src/logicalplan.rs b/rust/datafusion/src/logicalplan.rs index faef11fc7e3..f92ea8eea77 100644 --- a/rust/datafusion/src/logicalplan.rs +++ b/rust/datafusion/src/logicalplan.rs @@ -288,14 +288,14 @@ impl Expr { let this_type = self.get_type(schema)?; if this_type == *cast_to_type { Ok(self.clone()) - } else if can_coerce_from(cast_to_type, &this_type) { + } else if cast_supported(cast_to_type, &this_type) { Ok(Expr::Cast { expr: Box::new(self.clone()), data_type: cast_to_type.clone(), }) } else { Err(ExecutionError::General(format!( - "Cannot automatically convert {:?} to {:?}", + "Cannot cast from {:?} to {:?}", this_type, cast_to_type ))) } @@ -723,7 +723,7 @@ impl fmt::Debug for LogicalPlan { } /// Verify a given type cast can be performed -pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool { +pub fn cast_supported(type_into: &DataType, type_from: &DataType) -> bool { use self::DataType::*; if type_from == type_into { @@ -737,17 +737,14 @@ pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool { }, Int16 => match type_from { Int8 | Int16 => true, - UInt8 => true, _ => false, }, Int32 => match type_from { Int8 | Int16 | Int32 => true, - UInt8 | UInt16 => true, _ => false, }, Int64 => match type_from { Int8 | Int16 | Int32 | Int64 => true, - UInt8 | UInt16 | UInt32 => true, _ => false, }, UInt8 => match type_from { @@ -755,17 +752,14 @@ pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool { _ => false, }, UInt16 => match type_from { - Int8 => true, UInt8 | UInt16 => true, _ => false, }, UInt32 => match type_from { - Int8 | Int16 => true, UInt8 | UInt16 | UInt32 => true, _ => false, }, UInt64 => match type_from { - Int8 | Int16 | Int32 => true, UInt8 | UInt16 | UInt32 | UInt64 => true, _ => false, }, diff --git a/rust/datafusion/src/optimizer/type_coercion.rs b/rust/datafusion/src/optimizer/type_coercion.rs index 443df15ed41..1cbd416691a 100644 --- a/rust/datafusion/src/optimizer/type_coercion.rs +++ b/rust/datafusion/src/optimizer/type_coercion.rs @@ -187,7 +187,8 @@ mod tests { use crate::execution::context::ExecutionContext; use crate::execution::physical_plan::csv::CsvReadOptions; use crate::logicalplan::Expr::*; - use crate::logicalplan::{col, can_coerce_from, Operator}; + use crate::logicalplan::{col, can_coerce_from, cast_supported, Operator}; + use crate::logicalplan::{, Operator}; use crate::optimizer::utils::get_supertype; use crate::test::arrow_testdata_path; use arrow::datatypes::DataType::*; @@ -225,8 +226,8 @@ mod tests { // swapping from and to should result in same supertype assert_eq!(t, get_supertype(to_type, from_type)?); // both from and to types should be coercable to the supertype - assert!(can_coerce_from(&t, &from_type)); - assert!(can_coerce_from(&t, &to_type)); + assert!(cast_supported(&t, &from_type)); + assert!(cast_supported(&t, &to_type)); } Err(_) => assert!(get_supertype(to_type, from_type).is_err()), } @@ -277,20 +278,6 @@ mod tests { ); } - #[test] - fn test_add_u32_i64() { - binary_cast_test( - DataType::UInt32, - DataType::Int64, - "CAST(#0 AS Int64) Plus #1", - ); - binary_cast_test( - DataType::Int64, - DataType::UInt32, - "#0 Plus CAST(#1 AS Int64)", - ); - } - fn binary_cast_test(left_type: DataType, right_type: DataType, expected: &str) { let schema = Schema::new(vec![ Field::new("c0", left_type, true), diff --git a/rust/datafusion/src/optimizer/utils.rs b/rust/datafusion/src/optimizer/utils.rs index d329167596c..56046360ecd 100644 --- a/rust/datafusion/src/optimizer/utils.rs +++ b/rust/datafusion/src/optimizer/utils.rs @@ -140,57 +140,45 @@ pub fn get_supertype(l: &DataType, r: &DataType) -> Result { let super_type = match l { UInt8 => match r { UInt16 | UInt32 | UInt64 => Some(r.clone()), - Int16 | Int32 | Int64 => Some(r.clone()), Float32 | Float64 => Some(r.clone()), _ => None, }, UInt16 => match r { UInt8 => Some(l.clone()), UInt32 | UInt64 => Some(r.clone()), - Int8 => Some(l.clone()), - Int32 | Int64 => Some(r.clone()), Float32 | Float64 => Some(r.clone()), _ => None, }, UInt32 => match r { UInt8 | UInt16 => Some(l.clone()), UInt64 => Some(r.clone()), - Int8 | Int16 => Some(l.clone()), - Int64 => Some(r.clone()), Float32 | Float64 => Some(r.clone()), _ => None, }, UInt64 => match r { UInt8 | UInt16 | UInt32 => Some(l.clone()), - Int8 | Int16 | Int32 => Some(l.clone()), Float32 | Float64 => Some(r.clone()), _ => None, }, Int8 => match r { Int16 | Int32 | Int64 => Some(r.clone()), - UInt16 | UInt32 | UInt64 => Some(r.clone()), Float32 | Float64 => Some(r.clone()), _ => None, }, Int16 => match r { Int8 => Some(l.clone()), Int32 | Int64 => Some(r.clone()), - UInt8 => Some(l.clone()), - UInt32 | UInt64 => Some(r.clone()), Float32 | Float64 => Some(r.clone()), _ => None, }, Int32 => match r { Int8 | Int16 => Some(l.clone()), Int64 => Some(r.clone()), - UInt8 | UInt16 => Some(l.clone()), - UInt64 => Some(r.clone()), Float32 | Float64 => Some(r.clone()), _ => None, }, Int64 => match r { Int8 | Int16 | Int32 => Some(l.clone()), - UInt8 | UInt16 | UInt32 => Some(l.clone()), Float32 | Float64 => Some(r.clone()), _ => None, }, From facdfb17a0c0893f8baaf78cf98ef8627cc16ccb Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 23 May 2020 15:41:34 -0600 Subject: [PATCH 4/4] fix rebase error --- rust/datafusion/src/optimizer/type_coercion.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rust/datafusion/src/optimizer/type_coercion.rs b/rust/datafusion/src/optimizer/type_coercion.rs index 1cbd416691a..349627e33f9 100644 --- a/rust/datafusion/src/optimizer/type_coercion.rs +++ b/rust/datafusion/src/optimizer/type_coercion.rs @@ -187,8 +187,7 @@ mod tests { use crate::execution::context::ExecutionContext; use crate::execution::physical_plan::csv::CsvReadOptions; use crate::logicalplan::Expr::*; - use crate::logicalplan::{col, can_coerce_from, cast_supported, Operator}; - use crate::logicalplan::{, Operator}; + use crate::logicalplan::{cast_supported, col, Operator}; use crate::optimizer::utils::get_supertype; use crate::test::arrow_testdata_path; use arrow::datatypes::DataType::*; @@ -212,6 +211,9 @@ mod tests { format!("{:?}", plan).starts_with("Selection: CAST(#c7 AS Float64) Lt #c12") ); + Ok(()) + } + #[test] fn test_type_matrix() -> Result<()> { let types = vec![