From 7088371330591b5278ae4e6083cfcbedc5f0fda6 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 12 Sep 2018 21:11:22 +0800 Subject: [PATCH 1/2] [SPARK-25402][SQL] Null handling in BooleanSimplification This PR is to fix the null handling in BooleanSimplification. In the rule BooleanSimplification, there are two cases that do not properly handle null values. The optimization is not right if either side is null. This PR is to fix them. Added test cases Closes #22390 from gatorsmile/fixBooleanSimplification. Authored-by: gatorsmile Signed-off-by: Wenchen Fan (cherry picked from commit 79cc59718fdf7785bdc37a26bb8df4c6151114a6) Signed-off-by: Wenchen Fan --- .../sql/catalyst/optimizer/expressions.scala | 13 +++-- .../BooleanSimplificationSuite.scala | 47 +++++++++++++++++-- .../org/apache/spark/sql/DataFrameSuite.scala | 10 ++++ 3 files changed, 61 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index f2334830f8d88..cf26ab0bc69ea 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -154,10 +154,15 @@ object BooleanSimplification extends Rule[LogicalPlan] with PredicateHelper { case TrueLiteral Or _ => TrueLiteral case _ Or TrueLiteral => TrueLiteral - case a And b if Not(a).semanticEquals(b) => FalseLiteral - case a Or b if Not(a).semanticEquals(b) => TrueLiteral - case a And b if a.semanticEquals(Not(b)) => FalseLiteral - case a Or b if a.semanticEquals(Not(b)) => TrueLiteral + case a And b if Not(a).semanticEquals(b) => + If(IsNull(a), Literal.create(null, a.dataType), FalseLiteral) + case a And b if a.semanticEquals(Not(b)) => + If(IsNull(b), Literal.create(null, b.dataType), FalseLiteral) + + case a Or b if Not(a).semanticEquals(b) => + If(IsNull(a), Literal.create(null, a.dataType), TrueLiteral) + case a Or b if a.semanticEquals(Not(b)) => + If(IsNull(b), Literal.create(null, b.dataType), TrueLiteral) case a And b if a.semanticEquals(b) => a case a Or b if a.semanticEquals(b) => a diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala index c275f997ba6e9..71825b988fc41 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.optimizer +import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.dsl.expressions._ @@ -26,7 +27,7 @@ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.Row +import org.apache.spark.sql.types.BooleanType class BooleanSimplificationSuite extends PlanTest with PredicateHelper { @@ -37,6 +38,7 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper { Batch("Constant Folding", FixedPoint(50), NullPropagation(conf), ConstantFolding, + SimplifyConditionals, BooleanSimplification, PruneFilters(conf)) :: Nil } @@ -47,6 +49,14 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper { testRelation.output, Seq(Row(1, 2, 3, "abc")) ) + val testNotNullableRelation = LocalRelation('a.int.notNull, 'b.int.notNull, 'c.int.notNull, + 'd.string.notNull, 'e.boolean.notNull, 'f.boolean.notNull, 'g.boolean.notNull, + 'h.boolean.notNull) + + val testNotNullableRelationWithData = LocalRelation.fromExternalRows( + testNotNullableRelation.output, Seq(Row(1, 2, 3, "abc")) + ) + private def checkCondition(input: Expression, expected: LogicalPlan): Unit = { val plan = testRelationWithData.where(input).analyze val actual = Optimize.execute(plan) @@ -60,6 +70,13 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper { comparePlans(actual, correctAnswer) } + private def checkConditionInNotNullableRelation( + input: Expression, expected: LogicalPlan): Unit = { + val plan = testNotNullableRelationWithData.where(input).analyze + val actual = Optimize.execute(plan) + comparePlans(actual, expected) + } + test("a && a => a") { checkCondition(Literal(1) < 'a && Literal(1) < 'a, Literal(1) < 'a) checkCondition(Literal(1) < 'a && Literal(1) < 'a && Literal(1) < 'a, Literal(1) < 'a) @@ -173,10 +190,30 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper { } test("Complementation Laws") { - checkCondition('a && !'a, testRelation) - checkCondition(!'a && 'a, testRelation) + checkConditionInNotNullableRelation('e && !'e, testNotNullableRelation) + checkConditionInNotNullableRelation(!'e && 'e, testNotNullableRelation) + + checkConditionInNotNullableRelation('e || !'e, testNotNullableRelationWithData) + checkConditionInNotNullableRelation(!'e || 'e, testNotNullableRelationWithData) + } + + test("Complementation Laws - null handling") { + checkCondition('e && !'e, + testRelationWithData.where(If('e.isNull, Literal.create(null, BooleanType), false)).analyze) + checkCondition(!'e && 'e, + testRelationWithData.where(If('e.isNull, Literal.create(null, BooleanType), false)).analyze) + + checkCondition('e || !'e, + testRelationWithData.where(If('e.isNull, Literal.create(null, BooleanType), true)).analyze) + checkCondition(!'e || 'e, + testRelationWithData.where(If('e.isNull, Literal.create(null, BooleanType), true)).analyze) + } + + test("Complementation Laws - negative case") { + checkCondition('e && !'f, testRelationWithData.where('e && !'f).analyze) + checkCondition(!'f && 'e, testRelationWithData.where(!'f && 'e).analyze) - checkCondition('a || !'a, testRelationWithData) - checkCondition(!'a || 'a, testRelationWithData) + checkCondition('e || !'f, testRelationWithData.where('e || !'f).analyze) + checkCondition(!'f || 'e, testRelationWithData.where(!'f || 'e).analyze) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 7450a1a35b8f6..60bfb21ce7238 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1789,4 +1789,14 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { test("SPARK-22469: compare string with decimal") { checkAnswer(Seq("1.5").toDF("s").filter("s > 0.5"), Row("1.5")) } + + test("SPARK-25402 Null handling in BooleanSimplification") { + val schema = StructType.fromDDL("a boolean, b int") + val rows = Seq(Row(null, 1)) + + val rdd = sparkContext.parallelize(rows) + val df = spark.createDataFrame(rdd, schema) + + checkAnswer(df.where("(NOT a) OR a"), Seq.empty) + } } From ec437276d4be8bb905a2fd2f119f4ea372d03d1c Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Thu, 13 Sep 2018 00:42:41 -0700 Subject: [PATCH 2/2] fix. --- .../catalyst/optimizer/BooleanSimplificationSuite.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala index 71825b988fc41..df2624eaba181 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala @@ -43,10 +43,11 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper { PruneFilters(conf)) :: Nil } - val testRelation = LocalRelation('a.int, 'b.int, 'c.int, 'd.string) + val testRelation = LocalRelation('a.int, 'b.int, 'c.int, 'd.string, + 'e.boolean, 'f.boolean, 'g.boolean, 'h.boolean) val testRelationWithData = LocalRelation.fromExternalRows( - testRelation.output, Seq(Row(1, 2, 3, "abc")) + testRelation.output, Seq(Row(1, 2, 3, "abc", true, null, true, null)) ) val testNotNullableRelation = LocalRelation('a.int.notNull, 'b.int.notNull, 'c.int.notNull, @@ -54,7 +55,7 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper { 'h.boolean.notNull) val testNotNullableRelationWithData = LocalRelation.fromExternalRows( - testNotNullableRelation.output, Seq(Row(1, 2, 3, "abc")) + testNotNullableRelation.output, Seq(Row(1, 2, 3, "abc", true, false, true, false)) ) private def checkCondition(input: Expression, expected: LogicalPlan): Unit = {