From 03c7aac25282f2d75ed9bd4b82f45babc308f2d7 Mon Sep 17 00:00:00 2001
From: Takuya UESHIN
Date: Tue, 22 Dec 2020 19:42:06 -0800
Subject: [PATCH 1/3] Use ContextAwareIterator to stop consuming after the
 task ends.

---
 .../sql/execution/python/EvalPythonExec.scala  | 18 +++++++++++++++++-
 .../sql/execution/python/MapInPandasExec.scala |  7 ++++---
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
index 7c476ab03c002..0fe6985fdf904 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
@@ -89,6 +89,7 @@ trait EvalPythonExec extends UnaryExecNode {
 
     inputRDD.mapPartitions { iter =>
       val context = TaskContext.get()
+      val contextAwareIterator = new ContextAwareIterator(iter, context)
 
       // The queue used to buffer input rows so we can drain it to
       // combine input with output from Python.
@@ -120,7 +121,7 @@
       }.toSeq)
 
       // Add rows to queue to join later with the result.
-      val projectedRowIter = iter.map { inputRow =>
+      val projectedRowIter = contextAwareIterator.map { inputRow =>
         queue.add(inputRow.asInstanceOf[UnsafeRow])
         projection(inputRow)
       }
@@ -137,3 +138,18 @@ trait EvalPythonExec extends UnaryExecNode {
     }
   }
 }
+
+/**
+ * A TaskContext aware iterator.
+ *
+ * As the Python evaluation consumes the parent iterator in a separate thread,
+ * it could consume more data from the parent even after the task ends and the parent is closed.
+ * Thus, we should use ContextAwareIterator to stop consuming after the task ends.
+ */
+class ContextAwareIterator[IN](iter: Iterator[IN], context: TaskContext) extends Iterator[IN] {
+
+  override def hasNext: Boolean =
+    !context.isCompleted() && !context.isInterrupted() && iter.hasNext
+
+  override def next(): IN = iter.next()
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala
index 2bb808119c0ae..7fc18f885a2d3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala
@@ -61,16 +61,17 @@ case class MapInPandasExec(
     val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf)
     val outputTypes = child.schema
 
+    val context = TaskContext.get()
+    val contextAwareIterator = new ContextAwareIterator(inputIter, context)
+
     // Here we wrap it via another row so that Python sides understand it
     // as a DataFrame.
-    val wrappedIter = inputIter.map(InternalRow(_))
+    val wrappedIter = contextAwareIterator.map(InternalRow(_))
 
     // DO NOT use iter.grouped(). See BatchIterator.
     val batchIter =
       if (batchSize > 0) new BatchIterator(wrappedIter, batchSize) else Iterator(wrappedIter)
 
-    val context = TaskContext.get()
-
     val columnarBatchIter = new ArrowPythonRunner(
       chainedFunc,
       PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,

From 3606a4c0bb2a9a2755ec2898cea64345b0263c01 Mon Sep 17 00:00:00 2001
From: Takuya UESHIN
Date: Wed, 23 Dec 2020 10:53:09 -0800
Subject: [PATCH 2/3] Move ContextAwareIterator to org.apache.spark.util.

---
 .../spark/util/ContextAwareIterator.scala      | 37 +++++++++++++++++++
 .../sql/execution/python/EvalPythonExec.scala  | 17 +--------
 .../execution/python/MapInPandasExec.scala     |  1 +
 3 files changed, 39 insertions(+), 16 deletions(-)
 create mode 100644 core/src/main/scala/org/apache/spark/util/ContextAwareIterator.scala

diff --git a/core/src/main/scala/org/apache/spark/util/ContextAwareIterator.scala b/core/src/main/scala/org/apache/spark/util/ContextAwareIterator.scala
new file mode 100644
index 0000000000000..e6c16084351e8
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/util/ContextAwareIterator.scala
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util
+
+import org.apache.spark.TaskContext
+
+/**
+ * A TaskContext aware iterator.
+ *
+ * As the Python evaluation consumes the parent iterator in a separate thread,
+ * it could consume more data from the parent even after the task ends and the parent is closed.
+ * If an off-heap access exists in the parent iterator, it could cause segmentation fault
+ * which crashes the executor.
+ * Thus, we should use [[ContextAwareIterator]] to stop consuming after the task ends.
+ */
+class ContextAwareIterator[IN](iter: Iterator[IN], context: TaskContext) extends Iterator[IN] {
+
+  override def hasNext: Boolean =
+    !context.isCompleted() && !context.isInterrupted() && iter.hasNext
+
+  override def next(): IN = iter.next()
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
index 0fe6985fdf904..e91a0f8865c4c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.UnaryExecNode
 import org.apache.spark.sql.types.{DataType, StructField, StructType}
-import org.apache.spark.util.Utils
+import org.apache.spark.util.{ContextAwareIterator, Utils}
 
 
 /**
@@ -138,18 +138,3 @@ trait EvalPythonExec extends UnaryExecNode {
     }
   }
 }
-
-/**
- * A TaskContext aware iterator.
- *
- * As the Python evaluation consumes the parent iterator in a separate thread,
- * it could consume more data from the parent even after the task ends and the parent is closed.
- * Thus, we should use ContextAwareIterator to stop consuming after the task ends.
- */ -class ContextAwareIterator[IN](iter: Iterator[IN], context: TaskContext) extends Iterator[IN] { - - override def hasNext: Boolean = - !context.isCompleted() && !context.isInterrupted() && iter.hasNext - - override def next(): IN = iter.next() -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala index 7fc18f885a2d3..ce83c6ebb44b9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.util.ArrowUtils import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch} +import org.apache.spark.util.ContextAwareIterator /** * A relation produced by applying a function that takes an iterator of pandas DataFrames From 36176f7db62a5c0c3c1d9c80b079f57276dab106 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Wed, 23 Dec 2020 12:13:48 -0800 Subject: [PATCH 3/3] Move to org.apache.spark and make it follow the way of InterruptibleIterator. --- .../spark/{util => }/ContextAwareIterator.scala | 13 ++++++++----- .../spark/sql/execution/python/EvalPythonExec.scala | 6 +++--- .../sql/execution/python/MapInPandasExec.scala | 5 ++--- 3 files changed, 13 insertions(+), 11 deletions(-) rename core/src/main/scala/org/apache/spark/{util => }/ContextAwareIterator.scala (79%) diff --git a/core/src/main/scala/org/apache/spark/util/ContextAwareIterator.scala b/core/src/main/scala/org/apache/spark/ContextAwareIterator.scala similarity index 79% rename from core/src/main/scala/org/apache/spark/util/ContextAwareIterator.scala rename to core/src/main/scala/org/apache/spark/ContextAwareIterator.scala index e6c16084351e8..c4d0dd8aceab0 100644 --- a/core/src/main/scala/org/apache/spark/util/ContextAwareIterator.scala +++ b/core/src/main/scala/org/apache/spark/ContextAwareIterator.scala @@ -15,11 +15,12 @@ * limitations under the License. */ -package org.apache.spark.util +package org.apache.spark -import org.apache.spark.TaskContext +import org.apache.spark.annotation.DeveloperApi /** + * :: DeveloperApi :: * A TaskContext aware iterator. * * As the Python evaluation consumes the parent iterator in a separate thread, @@ -28,10 +29,12 @@ import org.apache.spark.TaskContext * which crashes the executor. * Thus, we should use [[ContextAwareIterator]] to stop consuming after the task ends. 
  */
-class ContextAwareIterator[IN](iter: Iterator[IN], context: TaskContext) extends Iterator[IN] {
+@DeveloperApi
+class ContextAwareIterator[+T](val context: TaskContext, val delegate: Iterator[T])
+  extends Iterator[T] {
 
   override def hasNext: Boolean =
-    !context.isCompleted() && !context.isInterrupted() && iter.hasNext
+    !context.isCompleted() && !context.isInterrupted() && delegate.hasNext
 
-  override def next(): IN = iter.next()
+  override def next(): T = delegate.next()
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
index e91a0f8865c4c..fca43e454bff5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
@@ -21,14 +21,14 @@ import java.io.File
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.apache.spark.{SparkEnv, TaskContext}
+import org.apache.spark.{ContextAwareIterator, SparkEnv, TaskContext}
 import org.apache.spark.api.python.ChainedPythonFunctions
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.UnaryExecNode
 import org.apache.spark.sql.types.{DataType, StructField, StructType}
-import org.apache.spark.util.{ContextAwareIterator, Utils}
+import org.apache.spark.util.Utils
 
 
 /**
@@ -89,7 +89,7 @@ trait EvalPythonExec extends UnaryExecNode {
 
     inputRDD.mapPartitions { iter =>
       val context = TaskContext.get()
-      val contextAwareIterator = new ContextAwareIterator(iter, context)
+      val contextAwareIterator = new ContextAwareIterator(context, iter)
 
       // The queue used to buffer input rows so we can drain it to
       // combine input with output from Python.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala
index ce83c6ebb44b9..71f51f1abc6f5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.python
 
 import scala.collection.JavaConverters._
 
-import org.apache.spark.TaskContext
+import org.apache.spark.{ContextAwareIterator, TaskContext}
 import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
@@ -29,7 +29,6 @@ import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
 import org.apache.spark.sql.types.{StructField, StructType}
 import org.apache.spark.sql.util.ArrowUtils
 import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
-import org.apache.spark.util.ContextAwareIterator
 
 /**
  * A relation produced by applying a function that takes an iterator of pandas DataFrames
@@ -63,7 +62,7 @@ case class MapInPandasExec(
     val outputTypes = child.schema
 
     val context = TaskContext.get()
-    val contextAwareIterator = new ContextAwareIterator(inputIter, context)
+    val contextAwareIterator = new ContextAwareIterator(context, inputIter)
 
     // Here we wrap it via another row so that Python sides understand it
     // as a DataFrame.
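
Reviewer note (not part of the patches above): a minimal sketch of how the final
PATCH 3/3 API is meant to be used from operator code. The helper name
`wrapForTask` is hypothetical and only for illustration; `TaskContext.get()` and
the `ContextAwareIterator(context, delegate)` constructor are the APIs the
patches themselves define and use.

    import org.apache.spark.{ContextAwareIterator, TaskContext}

    // Wrap a partition iterator so that a consumer running on another thread
    // (e.g. the Python writer thread) stops pulling rows once the task has
    // completed or been interrupted, instead of touching a closed parent.
    def wrapForTask[T](iter: Iterator[T]): Iterator[T] = {
      // TaskContext.get() is non-null only on an executor thread running a task.
      val context = TaskContext.get()
      new ContextAwareIterator(context, iter)
    }

Like InterruptibleIterator, which the final patch mirrors, the wrapper re-checks
the task state inside hasNext on every pull, so the consuming thread sees
hasNext == false as soon as the task is marked completed or interrupted and
never delegates to the already-closed parent iterator.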