Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2182,6 +2182,15 @@ object SQLConf {
.booleanConf
.createWithDefault(true)

val ENABLE_BUILD_SIDE_OUTER_SHUFFLED_HASH_JOIN_CODEGEN =
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't entirely sure if this is the right approach, but the existing flag was specific to FULL OUTER JOIN, and in discussions such as #41398 (comment), "build-side outer join" is taken to mean an outer join where only one side is OUTER and that side is the build side.

buildConf("spark.sql.codegen.join.buildSideOuterShuffledHashJoin.enabled")
.internal()
.doc("When true, enable code-gen for an OUTER shuffled hash join where outer side" +
" is the build side.")
.version("3.5.0")
.booleanConf
.createWithDefault(true)

val ENABLE_FULL_OUTER_SORT_MERGE_JOIN_CODEGEN =
buildConf("spark.sql.codegen.join.fullOuterSortMergeJoin.enabled")
.internal()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -340,8 +340,10 @@ case class ShuffledHashJoinExec(

override def supportCodegen: Boolean = joinType match {
case FullOuter => conf.getConf(SQLConf.ENABLE_FULL_OUTER_SHUFFLED_HASH_JOIN_CODEGEN)
case LeftOuter if buildSide == BuildLeft => false
case RightOuter if buildSide == BuildRight => false
case LeftOuter if buildSide == BuildLeft =>
conf.getConf(SQLConf.ENABLE_BUILD_SIDE_OUTER_SHUFFLED_HASH_JOIN_CODEGEN)
case RightOuter if buildSide == BuildRight =>
conf.getConf(SQLConf.ENABLE_BUILD_SIDE_OUTER_SHUFFLED_HASH_JOIN_CODEGEN)
case _ => true
}

Expand All @@ -362,9 +364,15 @@ case class ShuffledHashJoinExec(
}

override def doProduce(ctx: CodegenContext): String = {
// Specialize `doProduce` code for full outer join, because full outer join needs to
// iterate streamed and build side separately.
if (joinType != FullOuter) {
// Specialize `doProduce` code for full outer join and build-side outer join,
// because we need to iterate streamed and build side separately.
val specializedProduce = joinType match {
case FullOuter => true
case LeftOuter if buildSide == BuildLeft => true
case RightOuter if buildSide == BuildRight => true
case _ => false
}
if (!specializedProduce) {
return super.doProduce(ctx)
}

Expand Down Expand Up @@ -407,21 +415,24 @@ case class ShuffledHashJoinExec(
case BuildLeft => buildResultVars ++ streamedResultVars
case BuildRight => streamedResultVars ++ buildResultVars
}
val consumeFullOuterJoinRow = ctx.freshName("consumeFullOuterJoinRow")
ctx.addNewFunction(consumeFullOuterJoinRow,
val consumeOuterJoinRow = ctx.freshName("consumeOuterJoinRow")
ctx.addNewFunction(consumeOuterJoinRow,
s"""
|private void $consumeFullOuterJoinRow() throws java.io.IOException {
|private void $consumeOuterJoinRow() throws java.io.IOException {
| ${metricTerm(ctx, "numOutputRows")}.add(1);
| ${consume(ctx, resultVars)}
|}
""".stripMargin)

val joinWithUniqueKey = codegenFullOuterJoinWithUniqueKey(
val isFullOuterJoin = joinType == FullOuter
val joinWithUniqueKey = codegenBuildSideOrFullOuterJoinWithUniqueKey(
ctx, (streamedRow, buildRow), (streamedInput, buildInput), streamedKeyEv, streamedKeyAnyNull,
streamedKeyExprCode.value, relationTerm, conditionCheck, consumeFullOuterJoinRow)
val joinWithNonUniqueKey = codegenFullOuterJoinWithNonUniqueKey(
streamedKeyExprCode.value, relationTerm, conditionCheck, consumeOuterJoinRow,
isFullOuterJoin)
val joinWithNonUniqueKey = codegenBuildSideOrFullOuterJoinNonUniqueKey(
ctx, (streamedRow, buildRow), (streamedInput, buildInput), streamedKeyEv, streamedKeyAnyNull,
streamedKeyExprCode.value, relationTerm, conditionCheck, consumeFullOuterJoinRow)
streamedKeyExprCode.value, relationTerm, conditionCheck, consumeOuterJoinRow,
isFullOuterJoin)

s"""
|if ($keyIsUnique) {
Expand All @@ -433,10 +444,10 @@ case class ShuffledHashJoinExec(
}

/**
* Generates the code for full outer join with unique join keys.
* This is code-gen version of `fullOuterJoinWithUniqueKey()`.
* Generates the code for build-side or full outer join with unique join keys.
* This is code-gen version of `buildSideOrFullOuterJoinUniqueKey()`.
*/
private def codegenFullOuterJoinWithUniqueKey(
private def codegenBuildSideOrFullOuterJoinWithUniqueKey(
ctx: CodegenContext,
rows: (String, String),
inputs: (String, String),
Expand All @@ -445,7 +456,8 @@ case class ShuffledHashJoinExec(
streamedKeyValue: ExprValue,
relationTerm: String,
conditionCheck: String,
consumeFullOuterJoinRow: String): String = {
consumeOuterJoinRow: String,
isFullOuterJoin: Boolean): String = {
// Inline mutable state since not many join operations in a task
val matchedKeySetClsName = classOf[BitSet].getName
val matchedKeySet = ctx.addMutableState(matchedKeySetClsName, "matchedKeySet",
Expand Down Expand Up @@ -484,7 +496,10 @@ case class ShuffledHashJoinExec(
| }
| }
|
| $consumeFullOuterJoinRow();
| if ($foundMatch || $isFullOuterJoin) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: we can statically remove or simplify the condition:

val consumeCode = if (isFullOuterJoin) {
  s"$consumeOuterJoinRow();"
} else {
  s"""
    |if ($foundMatch) {
    |  $consumeOuterJoinRow();
    |}
  """
}

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, let me make some follow-ups a bit later.

| $consumeOuterJoinRow();
| }
|
| if (shouldStop()) return;
|}
""".stripMargin
Expand All @@ -500,7 +515,7 @@ case class ShuffledHashJoinExec(
| // check if key index is not in matched keys set
| if (!$matchedKeySet.get($rowWithIndex.getKeyIndex())) {
| $buildRow = $rowWithIndex.getValue();
| $consumeFullOuterJoinRow();
| $consumeOuterJoinRow();
| }
|
| if (shouldStop()) return;
Expand All @@ -514,10 +529,10 @@ case class ShuffledHashJoinExec(
}

/**
* Generates the code for full outer join with non-unique join keys.
* This is code-gen version of `fullOuterJoinWithNonUniqueKey()`.
* Generates the code for build-side or full outer join with non-unique join keys.
* This is code-gen version of `buildSideOrFullOuterJoinNonUniqueKey()`.
*/
private def codegenFullOuterJoinWithNonUniqueKey(
private def codegenBuildSideOrFullOuterJoinNonUniqueKey(
ctx: CodegenContext,
rows: (String, String),
inputs: (String, String),
Expand All @@ -526,7 +541,8 @@ case class ShuffledHashJoinExec(
streamedKeyValue: ExprValue,
relationTerm: String,
conditionCheck: String,
consumeFullOuterJoinRow: String): String = {
consumeOuterJoinRow: String,
isFullOuterJoin: Boolean): String = {
// Inline mutable state since not many join operations in a task
val matchedRowSetClsName = classOf[OpenHashSet[_]].getName
val matchedRowSet = ctx.addMutableState(matchedRowSetClsName, "matchedRowSet",
Expand Down Expand Up @@ -572,13 +588,15 @@ case class ShuffledHashJoinExec(
| // set row index in matched row set
| $matchedRowSet.add($rowIndex);
| $foundMatch = true;
| $consumeFullOuterJoinRow();
| $consumeOuterJoinRow();
| }
| }
|
| if (!$foundMatch) {
| $buildRow = null;
| $consumeFullOuterJoinRow();
| if ($isFullOuterJoin) {
| $consumeOuterJoinRow();
| }
| }
|
| if (shouldStop()) return;
Expand All @@ -603,7 +621,7 @@ case class ShuffledHashJoinExec(
| // check if row index is not in matched row set
| if (!$matchedRowSet.contains($rowIndex)) {
| $buildRow = $rowWithIndex.getValue();
| $consumeFullOuterJoinRow();
| $consumeOuterJoinRow();
| }
|
| if (shouldStop()) return;
Expand Down
146 changes: 76 additions & 70 deletions sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1315,78 +1315,84 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan

test("SPARK-36612: Support left outer join build left or right outer join build right in " +
"shuffled hash join") {
val inputDFs = Seq(
// Test unique join key
(spark.range(10).selectExpr("id as k1"),
spark.range(30).selectExpr("id as k2"),
$"k1" === $"k2"),
// Test non-unique join key
(spark.range(10).selectExpr("id % 5 as k1"),
spark.range(30).selectExpr("id % 5 as k2"),
$"k1" === $"k2"),
// Test empty build side
(spark.range(10).selectExpr("id as k1").filter("k1 < -1"),
spark.range(30).selectExpr("id as k2"),
$"k1" === $"k2"),
// Test empty stream side
(spark.range(10).selectExpr("id as k1"),
spark.range(30).selectExpr("id as k2").filter("k2 < -1"),
$"k1" === $"k2"),
// Test empty build and stream side
(spark.range(10).selectExpr("id as k1").filter("k1 < -1"),
spark.range(30).selectExpr("id as k2").filter("k2 < -1"),
$"k1" === $"k2"),
// Test string join key
(spark.range(10).selectExpr("cast(id * 3 as string) as k1"),
spark.range(30).selectExpr("cast(id as string) as k2"),
$"k1" === $"k2"),
// Test build side at right
(spark.range(30).selectExpr("cast(id / 3 as string) as k1"),
spark.range(10).selectExpr("cast(id as string) as k2"),
$"k1" === $"k2"),
// Test NULL join key
(spark.range(10).map(i => if (i % 2 == 0) i else null).selectExpr("value as k1"),
spark.range(30).map(i => if (i % 4 == 0) i else null).selectExpr("value as k2"),
$"k1" === $"k2"),
(spark.range(10).map(i => if (i % 3 == 0) i else null).selectExpr("value as k1"),
spark.range(30).map(i => if (i % 5 == 0) i else null).selectExpr("value as k2"),
$"k1" === $"k2"),
// Test multiple join keys
(spark.range(10).map(i => if (i % 2 == 0) i else null).selectExpr(
"value as k1", "cast(value % 5 as short) as k2", "cast(value * 3 as long) as k3"),
spark.range(30).map(i => if (i % 3 == 0) i else null).selectExpr(
"value as k4", "cast(value % 5 as short) as k5", "cast(value * 3 as long) as k6"),
$"k1" === $"k4" && $"k2" === $"k5" && $"k3" === $"k6")
)

// test left outer with left side build
inputDFs.foreach { case (df1, df2, joinExprs) =>
val smjDF = df1.hint("SHUFFLE_MERGE").join(df2, joinExprs, "leftouter")
assert(collect(smjDF.queryExecution.executedPlan) {
case _: SortMergeJoinExec => true }.size === 1)
val smjResult = smjDF.collect()

val shjDF = df1.hint("SHUFFLE_HASH").join(df2, joinExprs, "leftouter")
assert(collect(shjDF.queryExecution.executedPlan) {
case _: ShuffledHashJoinExec => true
}.size === 1)
// Same result between shuffled hash join and sort merge join
checkAnswer(shjDF, smjResult)
}
Seq("true", "false").foreach{ codegen =>
withSQLConf(SQLConf.ENABLE_BUILD_SIDE_OUTER_SHUFFLED_HASH_JOIN_CODEGEN.key -> codegen) {
val inputDFs = Seq(
// Test unique join key
(spark.range(10).selectExpr("id as k1"),
spark.range(30).selectExpr("id as k2"),
$"k1" === $"k2"),
// Test non-unique join key
(spark.range(10).selectExpr("id % 5 as k1"),
spark.range(30).selectExpr("id % 5 as k2"),
$"k1" === $"k2"),
// Test empty build side
(spark.range(10).selectExpr("id as k1").filter("k1 < -1"),
spark.range(30).selectExpr("id as k2"),
$"k1" === $"k2"),
// Test empty stream side
(spark.range(10).selectExpr("id as k1"),
spark.range(30).selectExpr("id as k2").filter("k2 < -1"),
$"k1" === $"k2"),
// Test empty build and stream side
(spark.range(10).selectExpr("id as k1").filter("k1 < -1"),
spark.range(30).selectExpr("id as k2").filter("k2 < -1"),
$"k1" === $"k2"),
// Test string join key
(spark.range(10).selectExpr("cast(id * 3 as string) as k1"),
spark.range(30).selectExpr("cast(id as string) as k2"),
$"k1" === $"k2"),
// Test build side at right
(spark.range(30).selectExpr("cast(id / 3 as string) as k1"),
spark.range(10).selectExpr("cast(id as string) as k2"),
$"k1" === $"k2"),
// Test NULL join key
(spark.range(10).map(i => if (i % 2 == 0) i else null).selectExpr("value as k1"),
spark.range(30).map(i => if (i % 4 == 0) i else null).selectExpr("value as k2"),
$"k1" === $"k2"),
(spark.range(10).map(i => if (i % 3 == 0) i else null).selectExpr("value as k1"),
spark.range(30).map(i => if (i % 5 == 0) i else null).selectExpr("value as k2"),
$"k1" === $"k2"),
// Test multiple join keys
(spark.range(10).map(i => if (i % 2 == 0) i else null).selectExpr(
"value as k1", "cast(value % 5 as short) as k2", "cast(value * 3 as long) as k3"),
spark.range(30).map(i => if (i % 3 == 0) i else null).selectExpr(
"value as k4", "cast(value % 5 as short) as k5", "cast(value * 3 as long) as k6"),
$"k1" === $"k4" && $"k2" === $"k5" && $"k3" === $"k6")
)

// test right outer with right side build
inputDFs.foreach { case (df2, df1, joinExprs) =>
val smjDF = df2.join(df1.hint("SHUFFLE_MERGE"), joinExprs, "rightouter")
assert(collect(smjDF.queryExecution.executedPlan) {
case _: SortMergeJoinExec => true }.size === 1)
val smjResult = smjDF.collect()
// test left outer with left side build
inputDFs.foreach { case (df1, df2, joinExprs) =>
val smjDF = df1.hint("SHUFFLE_MERGE").join(df2, joinExprs, "leftouter")
assert(collect(smjDF.queryExecution.executedPlan) {
case _: SortMergeJoinExec => true
}.size === 1)
val smjResult = smjDF.collect()

val shjDF = df1.hint("SHUFFLE_HASH").join(df2, joinExprs, "leftouter")
assert(collect(shjDF.queryExecution.executedPlan) {
case _: ShuffledHashJoinExec => true
}.size === 1)
// Same result between shuffled hash join and sort merge join
checkAnswer(shjDF, smjResult)
}

val shjDF = df2.join(df1.hint("SHUFFLE_HASH"), joinExprs, "rightouter")
assert(collect(shjDF.queryExecution.executedPlan) {
case _: ShuffledHashJoinExec => true
}.size === 1)
// Same result between shuffled hash join and sort merge join
checkAnswer(shjDF, smjResult)
// test right outer with right side build
inputDFs.foreach { case (df2, df1, joinExprs) =>
val smjDF = df2.join(df1.hint("SHUFFLE_MERGE"), joinExprs, "rightouter")
assert(collect(smjDF.queryExecution.executedPlan) {
case _: SortMergeJoinExec => true
}.size === 1)
val smjResult = smjDF.collect()

val shjDF = df2.join(df1.hint("SHUFFLE_HASH"), joinExprs, "rightouter")
assert(collect(shjDF.queryExecution.executedPlan) {
case _: ShuffledHashJoinExec => true
}.size === 1)
// Same result between shuffled hash join and sort merge join
checkAnswer(shjDF, smjResult)
}
}
}
}

Expand Down
Loading