diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 9003467703df2..591f479e641a0 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -760,6 +760,25 @@ impl TreeNodeRewriter for Simplifier<'_, S> { None => lit_bool_null(), }) } + // According to SQL's null semantics, NULL = NULL evaluates to NULL + // Both sides are the same expression (A = A) and A is a non-volatile expression + // A = A --> A IS NOT NULL OR NULL + // A = A --> true (if A is not nullable) + Expr::BinaryExpr(BinaryExpr { + left, + op: Eq, + right, + }) if (left == right) & !left.is_volatile() => { + Transformed::yes(match !info.nullable(&left)? { + true => lit(true), + false => Expr::BinaryExpr(BinaryExpr { + left: Box::new(Expr::IsNotNull(left)), + op: Or, + right: Box::new(lit_bool_null()), + }), + }) + } + // Rules for NotEq // @@ -2152,6 +2171,21 @@ mod tests { } } + #[test] + fn test_simplify_eq_not_self() { + // `expr_a`: column `c2` is nullable, so `c2 = c2` simplifies to `c2 IS NOT NULL OR NULL` + // This ensures the expression is only true when `c2` is not NULL, accounting for SQL's NULL semantics. 
+ let expr_a = col("c2").eq(col("c2")); + let expected_a = col("c2").is_not_null().or(lit_bool_null()); + + // `expr_b`: column `c2_non_null` is explicitly non-nullable, so `c2_non_null = c2_non_null` is always true + let expr_b = col("c2_non_null").eq(col("c2_non_null")); + let expected_b = lit(true); + + assert_eq!(simplify(expr_a), expected_a); + assert_eq!(simplify(expr_b), expected_b); + } + #[test] fn test_simplify_or_true() { let expr_a = col("c2").or(lit(true)); diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index f9bbcedff5eef..f165d3bf66ba0 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -6130,7 +6130,8 @@ select count(*) from test WHERE array_has([needle], needle); ---- 100000 -# TODO: this should probably be possible to completely remove the filter as always true? +# The optimizer does not currently eliminate the filter; +# Instead, it's rewritten as `IS NOT NULL OR NULL` due to SQL null semantics query TT explain with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) select count(*) from test WHERE array_has([needle], needle); @@ -6140,10 +6141,9 @@ logical_plan 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] 03)----SubqueryAlias: test 04)------SubqueryAlias: t -05)--------Projection: -06)----------Filter: __common_expr_3 = __common_expr_3 -07)------------Projection: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) AS __common_expr_3 -08)--------------TableScan: tmp_table projection=[value] +05)--------Projection: +06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IS NOT NULL OR Boolean(NULL) +07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] @@ -6151,10 +6151,9 @@ 
physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: __common_expr_3@0 = __common_expr_3@0 -08)--------------ProjectionExec: expr=[substr(md5(CAST(value@0 AS Utf8)), 1, 32) as __common_expr_3] -09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -10)------------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IS NOT NULL OR NULL +08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] # any operator query ? diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt b/datafusion/sqllogictest/test_files/simplify_expr.slt index 43193fb41cfad..90785fd939962 100644 --- a/datafusion/sqllogictest/test_files/simplify_expr.slt +++ b/datafusion/sqllogictest/test_files/simplify_expr.slt @@ -63,5 +63,16 @@ query T select b from t where b !~ '.*' ---- +query TT +explain select * from t where a = a; +---- +logical_plan +01)Filter: t.a IS NOT NULL OR Boolean(NULL) +02)--TableScan: t projection=[a, b] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--FilterExec: a@0 IS NOT NULL OR NULL +03)----DataSourceExec: partitions=1, partition_sizes=[1] + statement ok drop table t;