Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,25 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
None => lit_bool_null(),
})
}
// According to SQL's null semantics, NULL = NULL evaluates to NULL, so a
// self-comparison cannot be folded to `true` unconditionally.
// When both sides are the same non-volatile expression A:
// A = A --> A IS NOT NULL OR NULL
// A = A --> true (if A not nullable)
Expr::BinaryExpr(BinaryExpr {
    left,
    op: Eq,
    right,
}) if left == right && !left.is_volatile() => {
    // `&&` short-circuits: the volatility check only runs once both
    // operands are known to be equal.
    let simplified = if info.nullable(&left)? {
        // Evaluates to NULL when A is NULL and to true otherwise.
        Expr::BinaryExpr(BinaryExpr {
            left: Box::new(Expr::IsNotNull(left)),
            op: Or,
            right: Box::new(lit_bool_null()),
        })
    } else {
        lit(true)
    };
    Transformed::yes(simplified)
}

// Rules for NotEq
//

Expand Down Expand Up @@ -2152,6 +2171,21 @@ mod tests {
}
}

#[test]
fn test_simplify_eq_not_self() {
    // Nullable column: `c2 = c2` must not fold to `true`, because NULL = NULL is NULL.
    // It simplifies to `c2 IS NOT NULL OR NULL`, which is only true when `c2` is not NULL.
    let nullable_self_eq = col("c2").eq(col("c2"));
    let nullable_expected = col("c2").is_not_null().or(lit_bool_null());

    // Non-nullable column: `c2_non_null = c2_non_null` can never see a NULL,
    // so the comparison is always true.
    let non_null_self_eq = col("c2_non_null").eq(col("c2_non_null"));
    let non_null_expected = lit(true);

    assert_eq!(simplify(nullable_self_eq), nullable_expected);
    assert_eq!(simplify(non_null_self_eq), non_null_expected);
}

#[test]
fn test_simplify_or_true() {
let expr_a = col("c2").or(lit(true));
Expand Down
17 changes: 8 additions & 9 deletions datafusion/sqllogictest/test_files/array.slt
Original file line number Diff line number Diff line change
Expand Up @@ -6130,7 +6130,8 @@ select count(*) from test WHERE array_has([needle], needle);
----
100000

# TODO: this should probably be possible to completely remove the filter as always true?
# The optimizer does not currently eliminate the filter.
# Instead, the self-comparison is rewritten to `<expr> IS NOT NULL OR NULL` due to SQL null semantics.
Comment on lines +6133 to +6134
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I updated the comment in slt as well :)

query TT
explain with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
select count(*) from test WHERE array_has([needle], needle);
Expand All @@ -6140,21 +6141,19 @@ logical_plan
02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
03)----SubqueryAlias: test
04)------SubqueryAlias: t
05)--------Projection:
06)----------Filter: __common_expr_3 = __common_expr_3
07)------------Projection: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) AS __common_expr_3
08)--------------TableScan: tmp_table projection=[value]
05)--------Projection:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a comment a few lines above that says:

TODO: this should probably be possible to completely remove the filter as always true?

We can probably update that too -- but we could do it in a follow on PR as well

06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IS NOT NULL OR Boolean(NULL)
07)------------TableScan: tmp_table projection=[value]
physical_plan
01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
03)----CoalescePartitionsExec
04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
05)--------ProjectionExec: expr=[]
06)----------CoalesceBatchesExec: target_batch_size=8192
07)------------FilterExec: __common_expr_3@0 = __common_expr_3@0
08)--------------ProjectionExec: expr=[substr(md5(CAST(value@0 AS Utf8)), 1, 32) as __common_expr_3]
09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
10)------------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IS NOT NULL OR NULL
08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]

# any operator
query ?
Expand Down
11 changes: 11 additions & 0 deletions datafusion/sqllogictest/test_files/simplify_expr.slt
Original file line number Diff line number Diff line change
Expand Up @@ -63,5 +63,16 @@ query T
select b from t where b !~ '.*'
----

query TT
explain select * from t where a = a;
----
logical_plan
01)Filter: t.a IS NOT NULL OR Boolean(NULL)
02)--TableScan: t projection=[a, b]
physical_plan
01)CoalesceBatchesExec: target_batch_size=8192
02)--FilterExec: a@0 IS NOT NULL OR NULL
03)----DataSourceExec: partitions=1, partition_sizes=[1]

statement ok
drop table t;