From 55ec30ed96e2046072c22ad6d157cdf5027d6b87 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 11 May 2022 06:29:29 -0400 Subject: [PATCH] fix `NULL column` evaluation, tests for same --- datafusion-cli/Cargo.lock | 21 +-- .../src/physical_plan/file_format/parquet.rs | 7 +- datafusion/core/tests/sql/expr.rs | 155 +++++------------- .../physical-expr/src/expressions/binary.rs | 63 +++---- 4 files changed, 81 insertions(+), 165 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 9a84e35c2309d..ad6fb557fe76d 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -205,7 +205,7 @@ dependencies = [ "futures", "log", "parking_lot", - "sqlparser 0.16.0", + "sqlparser", "tempfile", "tokio", ] @@ -232,7 +232,7 @@ dependencies = [ "prost-types", "rustc_version", "serde", - "sqlparser 0.16.0", + "sqlparser", "tokio", "tonic", "tonic-build", @@ -507,7 +507,7 @@ dependencies = [ "pin-project-lite", "rand", "smallvec", - "sqlparser 0.17.0", + "sqlparser", "tempfile", "tokio", "tokio-stream", @@ -536,12 +536,12 @@ dependencies = [ "arrow", "ordered-float 3.0.0", "parquet", - "sqlparser 0.17.0", + "sqlparser", ] [[package]] name = "datafusion-data-access" -version = "1.0.0" +version = "7.0.0" dependencies = [ "async-trait", "chrono", @@ -559,7 +559,7 @@ dependencies = [ "ahash", "arrow", "datafusion-common", - "sqlparser 0.17.0", + "sqlparser", ] [[package]] @@ -1936,15 +1936,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "sqlparser" -version = "0.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e9a527b68048eb95495a1508f6c8395c8defcff5ecdbe8ad4106d08a2ef2a3c" -dependencies = [ - "log", -] - [[package]] name = "sqlparser" version = "0.17.0" diff --git a/datafusion/core/src/physical_plan/file_format/parquet.rs b/datafusion/core/src/physical_plan/file_format/parquet.rs index d2e156f32ed95..2cee69a64fe9f 100644 --- a/datafusion/core/src/physical_plan/file_format/parquet.rs +++ b/datafusion/core/src/physical_plan/file_format/parquet.rs @@ -1452,9 +1452,10 @@ mod tests { .enumerate() .map(|(i, g)| row_group_predicate(g, i)) .collect::>(); - // no row group is filtered out because the predicate expression can't be evaluated - // when a null array is generated for a statistics column, - assert_eq!(row_group_filter, vec![true, true]); + + // bool = NULL always evaluates to NULL (and thus will not + // pass predicates. Ideally these should both be false + assert_eq!(row_group_filter, vec![false, true]); Ok(()) } diff --git a/datafusion/core/tests/sql/expr.rs b/datafusion/core/tests/sql/expr.rs index b8a00265766b0..ca1f824ac36d0 100644 --- a/datafusion/core/tests/sql/expr.rs +++ b/datafusion/core/tests/sql/expr.rs @@ -1203,121 +1203,54 @@ async fn nested_subquery() -> Result<()> { } #[tokio::test] -async fn comparisons_with_null() -> Result<()> { +async fn comparisons_with_null_lt() { let ctx = SessionContext::new(); - // 1. Numeric comparison with NULL - let sql = "select column1 < NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------------------+", - "| t.column1 Lt NULL |", - "+-------------------+", - "| |", - "| |", - "+-------------------+", - ]; - assert_batches_eq!(expected, &actual); - let sql = - "select column1 <= NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+---------------------+", - "| t.column1 LtEq NULL |", - "+---------------------+", - "| |", - "| |", - "+---------------------+", + // we expect all the following queries to yield a two null values + let cases = vec![ + // 1. Numeric comparison with NULL + "select column1 < NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t", + "select column1 <= NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t", + "select column1 > NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t", + "select column1 >= NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t", + "select column1 = NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t", + "select column1 != NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t", + // 1.1 Float value comparison with NULL + "select column3 < NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t", + // String comparison with NULL + "select column2 < NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t", + // Boolean comparison with NULL + "select column1 < NULL from (VALUES (true), (false)) as t", + // ---- + // ---- same queries, reversed argument order (as they go through + // ---- a different evaluation path) + // ---- + + // 1. Numeric comparison with NULL + "select NULL < column1 from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t", + "select NULL <= column1 from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t", + "select NULL > column1 from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t", + "select NULL >= column1 from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t", + "select NULL = column1 from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t", + "select NULL != column1 from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t", + // 1.1 Float value comparison with NULL + "select NULL < column3 from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t", + // String comparison with NULL + "select NULL < column2 from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t", + // Boolean comparison with NULL + "select NULL < column1 from (VALUES (true), (false)) as t", ]; - assert_batches_eq!(expected, &actual); - let sql = "select column1 > NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------------------+", - "| t.column1 Gt NULL |", - "+-------------------+", - "| |", - "| |", - "+-------------------+", - ]; - assert_batches_eq!(expected, &actual); + for sql in cases { + println!("Computing: {}", sql); - let sql = - "select column1 >= NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+---------------------+", - "| t.column1 GtEq NULL |", - "+---------------------+", - "| |", - "| |", - "+---------------------+", - ]; - assert_batches_eq!(expected, &actual); + let mut actual = execute_to_batches(&ctx, sql).await; + assert_eq!(actual.len(), 1); - let sql = "select column1 = NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------------------+", - "| t.column1 Eq NULL |", - "+-------------------+", - "| |", - "| |", - "+-------------------+", - ]; - assert_batches_eq!(expected, &actual); - - let sql = - "select column1 != NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+----------------------+", - "| t.column1 NotEq NULL |", - "+----------------------+", - "| |", - "| |", - "+----------------------+", - ]; - assert_batches_eq!(expected, &actual); - - // 1.1 Float value comparison with NULL - let sql = "select column3 < NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------------------+", - "| t.column3 Lt NULL |", - "+-------------------+", - "| |", - "| |", - "+-------------------+", - ]; - assert_batches_eq!(expected, &actual); - - // String comparison with NULL - let sql = "select column2 < NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------------------+", - "| t.column2 Lt NULL |", - "+-------------------+", - "| |", - "| |", - "+-------------------+", - ]; - assert_batches_eq!(expected, &actual); - - // Boolean comparison with NULL - let sql = "select column1 < NULL from (VALUES (true), (false)) as t"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = vec![ - "+-------------------+", - "| t.column1 Lt NULL |", - "+-------------------+", - "| |", - "| |", - "+-------------------+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) + let batch = actual.pop().unwrap(); + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 1); + assert!(batch.columns()[0].is_null(0)); + assert!(batch.columns()[0].is_null(1)); + } } diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 25beea5b22b1e..57a162697e0cd 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -25,10 +25,6 @@ use arrow::compute::kernels::arithmetic::{ multiply_scalar, subtract, subtract_scalar, }; use arrow::compute::kernels::boolean::{and_kleene, not, or_kleene}; -use arrow::compute::kernels::comparison::{ - eq_bool_scalar, gt_bool_scalar, gt_eq_bool_scalar, lt_bool_scalar, lt_eq_bool_scalar, - neq_bool_scalar, -}; use arrow::compute::kernels::comparison::{ eq_dyn_bool_scalar, gt_dyn_bool_scalar, gt_eq_dyn_bool_scalar, lt_dyn_bool_scalar, lt_eq_dyn_bool_scalar, neq_dyn_bool_scalar, @@ -44,12 +40,10 @@ use arrow::compute::kernels::comparison::{ use arrow::compute::kernels::comparison::{ eq_scalar, gt_eq_scalar, gt_scalar, lt_eq_scalar, lt_scalar, neq_scalar, }; +use arrow::compute::kernels::comparison::{like_utf8, nlike_utf8, regexp_is_match_utf8}; use arrow::compute::kernels::comparison::{ - eq_utf8_scalar, gt_eq_utf8_scalar, gt_utf8_scalar, like_utf8_scalar, - lt_eq_utf8_scalar, lt_utf8_scalar, neq_utf8_scalar, nlike_utf8_scalar, - regexp_is_match_utf8_scalar, + like_utf8_scalar, nlike_utf8_scalar, regexp_is_match_utf8_scalar, }; -use arrow::compute::kernels::comparison::{like_utf8, nlike_utf8, regexp_is_match_utf8}; use arrow::datatypes::{ArrowNumericType, DataType, Schema, TimeUnit}; use arrow::error::ArrowError::DivideByZero; use arrow::record_batch::RecordBatch; @@ -746,7 +740,7 @@ macro_rules! compute_op_scalar { } /// Invoke a dyn compute kernel on a data array and a scalar value -/// LEFT is Primitive or Dictionart array of numeric values, RIGHT is scalar value +/// LEFT is Primitive or Dictionary array of numeric values, RIGHT is scalar value /// OP_TYPE is the return type of scalar function macro_rules! compute_op_dyn_scalar { ($LEFT:expr, $RIGHT:expr, $OP:ident, $OP_TYPE:expr) => {{ @@ -1156,39 +1150,25 @@ impl BinaryExpr { array: &dyn Array, scalar: &ScalarValue, ) -> Result>> { + let bool_type = &DataType::Boolean; let scalar_result = match &self.op { Operator::Lt => { - binary_array_op_dyn_scalar!(array, scalar.clone(), lt, &DataType::Boolean) + binary_array_op_dyn_scalar!(array, scalar.clone(), lt, bool_type) } Operator::LtEq => { - binary_array_op_dyn_scalar!( - array, - scalar.clone(), - lt_eq, - &DataType::Boolean - ) + binary_array_op_dyn_scalar!(array, scalar.clone(), lt_eq, bool_type) } Operator::Gt => { - binary_array_op_dyn_scalar!(array, scalar.clone(), gt, &DataType::Boolean) + binary_array_op_dyn_scalar!(array, scalar.clone(), gt, bool_type) } Operator::GtEq => { - binary_array_op_dyn_scalar!( - array, - scalar.clone(), - gt_eq, - &DataType::Boolean - ) + binary_array_op_dyn_scalar!(array, scalar.clone(), gt_eq, bool_type) } Operator::Eq => { - binary_array_op_dyn_scalar!(array, scalar.clone(), eq, &DataType::Boolean) + binary_array_op_dyn_scalar!(array, scalar.clone(), eq, bool_type) } Operator::NotEq => { - binary_array_op_dyn_scalar!( - array, - scalar.clone(), - neq, - &DataType::Boolean - ) + binary_array_op_dyn_scalar!(array, scalar.clone(), neq, bool_type) } Operator::Like => { binary_string_array_op_scalar!(array, scalar.clone(), like) @@ -1255,14 +1235,25 @@ impl BinaryExpr { scalar: &ScalarValue, array: &ArrayRef, ) -> Result>> { + let bool_type = &DataType::Boolean; let scalar_result = match &self.op { - Operator::Lt => binary_array_op_scalar!(array, scalar.clone(), gt), - Operator::LtEq => binary_array_op_scalar!(array, scalar.clone(), gt_eq), - Operator::Gt => binary_array_op_scalar!(array, scalar.clone(), lt), - Operator::GtEq => binary_array_op_scalar!(array, scalar.clone(), lt_eq), - Operator::Eq => binary_array_op_scalar!(array, scalar.clone(), eq), + Operator::Lt => { + binary_array_op_dyn_scalar!(array, scalar.clone(), gt, bool_type) + } + Operator::LtEq => { + binary_array_op_dyn_scalar!(array, scalar.clone(), gt_eq, bool_type) + } + Operator::Gt => { + binary_array_op_dyn_scalar!(array, scalar.clone(), lt, bool_type) + } + Operator::GtEq => { + binary_array_op_dyn_scalar!(array, scalar.clone(), lt_eq, bool_type) + } + Operator::Eq => { + binary_array_op_dyn_scalar!(array, scalar.clone(), eq, bool_type) + } Operator::NotEq => { - binary_array_op_scalar!(array, scalar.clone(), neq) + binary_array_op_dyn_scalar!(array, scalar.clone(), neq, bool_type) } // if scalar operation is not supported - fallback to array implementation _ => None,