From 8f5dfc939f71ff1399d705d20741bb0a12209fe0 Mon Sep 17 00:00:00 2001 From: Nong Li Date: Thu, 17 Mar 2016 12:55:02 -0700 Subject: [PATCH 1/2] [SPARK-13981][SQL] Defer evaluating variables within Filter operators and NULL improvements. This improves the Filter codegen to optimize IsNotNull filters which are common. This patch defers loading attributes as late as possible within the filter operator. This takes advantage of short-circuiting. Instead of generating code like: boolean isNull = ... int value = ... boolean isNull2 = ... int value2 = ... if (isNull) continue; we will generate: boolean isNull = ... int value = ... if (isNull) continue; boolean isNull2 = ... int value2 = ... if (isNull2) continue; On tpcds q55, this fixes the regression from introducing the IsNotNull predicates. TPCDS Snappy: Best/Avg Time(ms) Rate(M/s) Per Row(ns) -------------------------------------------------------------------------------- q55 4564 / 5036 25.2 39.6 q55 4064 / 4340 28.3 35.3 --- .../spark/sql/execution/basicOperators.scala | 80 +++++++++++++++---- 1 file changed, 63 insertions(+), 17 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala index 6e2a5aa4f97c7..d61e49b0e684f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode, ExpressionCanonicalizer} import org.apache.spark.sql.catalyst.plans.physical._ -import org.apache.spark.sql.execution.metric.{LongSQLMetricValue, SQLMetrics} +import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.types.LongType import org.apache.spark.util.random.PoissonSampler @@ -79,16 +79,20 @@ case 
class Filter(condition: Expression, child: SparkPlan) // Split out all the IsNotNulls from condition. private val (notNullPreds, otherPreds) = splitConjunctivePredicates(condition).partition { - case IsNotNull(a) if child.output.contains(a) => true + case IsNotNull(a) if child.output.exists(_.semanticEquals(a)) => true case _ => false } // The columns that will filtered out by `IsNotNull` could be considered as not nullable. private val notNullAttributes = notNullPreds.flatMap(_.references) + // Mark this as empty. We'll evaluate the input during doConsume(). We don't want to evaluate + // all the variables at the beginning to take advantage of short circuiting. + override def usedInputs: AttributeSet = AttributeSet.empty + override def output: Seq[Attribute] = { child.output.map { a => - if (a.nullable && notNullAttributes.contains(a)) { + if (a.nullable && notNullAttributes.exists(_.semanticEquals(a))) { a.withNullability(false) } else { a @@ -110,39 +114,81 @@ case class Filter(condition: Expression, child: SparkPlan) override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: String): String = { val numOutput = metricTerm(ctx, "numOutputRows") - // filter out the nulls - val filterOutNull = notNullAttributes.map { a => - val idx = child.output.indexOf(a) - s"if (${input(idx).isNull}) continue;" - }.mkString("\n") + /** + * Generates code for `c`, using `in` for input attributes and `attrs` for nullability. + */ + def genPredicate(c: Expression, in: Seq[ExprCode], attrs: Seq[Attribute]): String = { + val bound = BindReferences.bindReference(c, attrs) + val evaluated = evaluateRequiredVariables(child.output, in, c.references) - ctx.currentVars = input - val predicates = otherPreds.map { e => - val bound = ExpressionCanonicalizer.execute( - BindReferences.bindReference(e, output)) - val ev = bound.gen(ctx) + // Generate the code for the predicate. 
+ val ev = ExpressionCanonicalizer.execute(bound).gen(ctx) val nullCheck = if (bound.nullable) { s"${ev.isNull} || " } else { s"" } + s""" + |$evaluated |${ev.code} |if (${nullCheck}!${ev.value}) continue; """.stripMargin + } + + ctx.currentVars = input + + // To generate the predicates we will follow this algorithm. + // For each predicate that is not IsNotNull, we will generate them one by one loading attributes + // as necessary. For each of these attributes, if there is an IsNotNull predicate we will generate + // that check *before* the predicate. After all of these predicates, we will generate the + // remaining IsNotNull checks that were not part of other predicates. + // This has the property of not doing redundant IsNotNull checks and taking better advantage of + // short-circuiting, not loading attributes until they are needed. + // This is very perf sensitive. + // TODO: revisit this. We can consider reordering predicates as well. + val generatedIsNotNullChecks = new Array[Boolean](notNullPreds.length) + val generated = otherPreds.map { c => + val nullChecks = c.references.map { r => + val idx = notNullPreds.indexWhere { n => n.asInstanceOf[IsNotNull].child.semanticEquals(r)} + if (idx != -1 && !generatedIsNotNullChecks(idx)) { + // Use the child's output. The nullability is what the child produced. + val code = genPredicate(notNullPreds(idx), input, child.output) + generatedIsNotNullChecks(idx) = true + code + } else { + "" + } + }.mkString("\n").trim + + // Here we use *this* operator's output with this output's nullability since we already + // enforced them with the IsNotNull checks above. 
+ s""" + |$nullChecks + |${genPredicate(c, input, output)} + """.stripMargin.trim + }.mkString("\n") + + val nullChecks = notNullPreds.zipWithIndex.map { case (c, idx) => + if (!generatedIsNotNullChecks(idx)) { + genPredicate(c, input, child.output) + } else { + "" + } }.mkString("\n") // Reset the isNull to false for the not-null columns, then the followed operators could - // generate better code (remove dead branches). + // generate better code (remove dead branches). O val resultVars = input.zipWithIndex.map { case (ev, i) => - if (notNullAttributes.contains(child.output(i))) { + if (notNullAttributes.exists(_.semanticEquals(child.output(i)))) { ev.isNull = "false" } ev } + s""" - |$filterOutNull - |$predicates + |$generated + |$nullChecks |$numOutput.add(1); |${consume(ctx, resultVars)} """.stripMargin From 1a770129f7f19bbace8548f069fb5bd1ca2a322c Mon Sep 17 00:00:00 2001 From: Nong Li Date: Mon, 28 Mar 2016 12:01:10 -0700 Subject: [PATCH 2/2] CR --- .../org/apache/spark/sql/execution/basicOperators.scala | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala index d61e49b0e684f..22c52dacdffde 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala @@ -152,10 +152,9 @@ case class Filter(condition: Expression, child: SparkPlan) val nullChecks = c.references.map { r => val idx = notNullPreds.indexWhere { n => n.asInstanceOf[IsNotNull].child.semanticEquals(r)} if (idx != -1 && !generatedIsNotNullChecks(idx)) { - // Use the child's output. The nullability is what the child produced. - val code = genPredicate(notNullPreds(idx), input, child.output) generatedIsNotNullChecks(idx) = true - code + // Use the child's output. The nullability is what the child produced. 
+ genPredicate(notNullPreds(idx), input, child.output) } else { "" } @@ -178,7 +177,7 @@ case class Filter(condition: Expression, child: SparkPlan) }.mkString("\n") // Reset the isNull to false for the not-null columns, then the followed operators could - // generate better code (remove dead branches). O + // generate better code (remove dead branches). val resultVars = input.zipWithIndex.map { case (ev, i) => if (notNullAttributes.exists(_.semanticEquals(child.output(i)))) { ev.isNull = "false"