From 03c7aac25282f2d75ed9bd4b82f45babc308f2d7 Mon Sep 17 00:00:00 2001
From: Takuya UESHIN
Date: Tue, 22 Dec 2020 19:42:06 -0800
Subject: [PATCH 1/3] Use ContextAwareIterator to stop consuming after the
 task ends.

---
 .../sql/execution/python/EvalPythonExec.scala  | 18 +++++++++++++++++-
 .../sql/execution/python/MapInPandasExec.scala |  7 ++++---
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
index 7c476ab03c002..0fe6985fdf904 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
@@ -89,6 +89,7 @@ trait EvalPythonExec extends UnaryExecNode {
 
     inputRDD.mapPartitions { iter =>
       val context = TaskContext.get()
+      val contextAwareIterator = new ContextAwareIterator(iter, context)
 
       // The queue used to buffer input rows so we can drain it to
       // combine input with output from Python.
@@ -120,7 +121,7 @@
       }.toSeq)
 
       // Add rows to queue to join later with the result.
-      val projectedRowIter = iter.map { inputRow =>
+      val projectedRowIter = contextAwareIterator.map { inputRow =>
         queue.add(inputRow.asInstanceOf[UnsafeRow])
         projection(inputRow)
       }
@@ -137,3 +138,18 @@ trait EvalPythonExec extends UnaryExecNode {
     }
   }
 }
+
+/**
+ * A TaskContext aware iterator.
+ *
+ * As the Python evaluation consumes the parent iterator in a separate thread,
+ * it could consume more data from the parent even after the task ends and the parent is closed.
+ * Thus, we should use ContextAwareIterator to stop consuming after the task ends.
+ */
+class ContextAwareIterator[IN](iter: Iterator[IN], context: TaskContext) extends Iterator[IN] {
+
+  override def hasNext: Boolean =
+    !context.isCompleted() && !context.isInterrupted() && iter.hasNext
+
+  override def next(): IN = iter.next()
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala
index 2bb808119c0ae..7fc18f885a2d3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala
@@ -61,16 +61,17 @@ case class MapInPandasExec(
     val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf)
     val outputTypes = child.schema
 
+    val context = TaskContext.get()
+    val contextAwareIterator = new ContextAwareIterator(inputIter, context)
+
     // Here we wrap it via another row so that Python sides understand it
     // as a DataFrame.
-    val wrappedIter = inputIter.map(InternalRow(_))
+    val wrappedIter = contextAwareIterator.map(InternalRow(_))
 
     // DO NOT use iter.grouped(). See BatchIterator.
     val batchIter =
       if (batchSize > 0) new BatchIterator(wrappedIter, batchSize) else Iterator(wrappedIter)
 
-    val context = TaskContext.get()
-
     val columnarBatchIter = new ArrowPythonRunner(
       chainedFunc,
       PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,

From 3606a4c0bb2a9a2755ec2898cea64345b0263c01 Mon Sep 17 00:00:00 2001
From: Takuya UESHIN
Date: Wed, 23 Dec 2020 10:53:09 -0800
Subject: [PATCH 2/3] Move ContextAwareIterator to org.apache.spark.util.

---
 .../spark/util/ContextAwareIterator.scala      | 37 +++++++++++++++++++
 .../sql/execution/python/EvalPythonExec.scala  | 17 +--------
 .../execution/python/MapInPandasExec.scala     |  1 +
 3 files changed, 39 insertions(+), 16 deletions(-)
 create mode 100644 core/src/main/scala/org/apache/spark/util/ContextAwareIterator.scala

diff --git a/core/src/main/scala/org/apache/spark/util/ContextAwareIterator.scala b/core/src/main/scala/org/apache/spark/util/ContextAwareIterator.scala
new file mode 100644
index 0000000000000..e6c16084351e8
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/util/ContextAwareIterator.scala
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util
+
+import org.apache.spark.TaskContext
+
+/**
+ * A TaskContext aware iterator.
+ *
+ * As the Python evaluation consumes the parent iterator in a separate thread,
+ * it could consume more data from the parent even after the task ends and the parent is closed.
+ * If an off-heap access exists in the parent iterator, it could cause segmentation fault
+ * which crashes the executor.
+ * Thus, we should use [[ContextAwareIterator]] to stop consuming after the task ends.
+ */
+class ContextAwareIterator[IN](iter: Iterator[IN], context: TaskContext) extends Iterator[IN] {
+
+  override def hasNext: Boolean =
+    !context.isCompleted() && !context.isInterrupted() && iter.hasNext
+
+  override def next(): IN = iter.next()
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
index 0fe6985fdf904..e91a0f8865c4c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.UnaryExecNode
 import org.apache.spark.sql.types.{DataType, StructField, StructType}
-import org.apache.spark.util.Utils
+import org.apache.spark.util.{ContextAwareIterator, Utils}
 
 
 /**
@@ -138,18 +138,3 @@ trait EvalPythonExec extends UnaryExecNode {
     }
   }
 }
-
-/**
- * A TaskContext aware iterator.
- *
- * As the Python evaluation consumes the parent iterator in a separate thread,
- * it could consume more data from the parent even after the task ends and the parent is closed.
- * Thus, we should use ContextAwareIterator to stop consuming after the task ends.
- */ -class ContextAwareIterator[IN](iter: Iterator[IN], context: TaskContext) extends Iterator[IN] { - - override def hasNext: Boolean = - !context.isCompleted() && !context.isInterrupted() && iter.hasNext - - override def next(): IN = iter.next() -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala index 7fc18f885a2d3..ce83c6ebb44b9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.util.ArrowUtils import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch} +import org.apache.spark.util.ContextAwareIterator /** * A relation produced by applying a function that takes an iterator of pandas DataFrames From 36176f7db62a5c0c3c1d9c80b079f57276dab106 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Wed, 23 Dec 2020 12:13:48 -0800 Subject: [PATCH 3/3] Move to org.apache.spark and make it follow the way of InterruptibleIterator. --- .../spark/{util => }/ContextAwareIterator.scala | 13 ++++++++----- .../spark/sql/execution/python/EvalPythonExec.scala | 6 +++--- .../sql/execution/python/MapInPandasExec.scala | 5 ++--- 3 files changed, 13 insertions(+), 11 deletions(-) rename core/src/main/scala/org/apache/spark/{util => }/ContextAwareIterator.scala (79%) diff --git a/core/src/main/scala/org/apache/spark/util/ContextAwareIterator.scala b/core/src/main/scala/org/apache/spark/ContextAwareIterator.scala similarity index 79% rename from core/src/main/scala/org/apache/spark/util/ContextAwareIterator.scala rename to core/src/main/scala/org/apache/spark/ContextAwareIterator.scala index e6c16084351e8..c4d0dd8aceab0 100644 --- a/core/src/main/scala/org/apache/spark/util/ContextAwareIterator.scala +++ b/core/src/main/scala/org/apache/spark/ContextAwareIterator.scala @@ -15,11 +15,12 @@ * limitations under the License. */ -package org.apache.spark.util +package org.apache.spark -import org.apache.spark.TaskContext +import org.apache.spark.annotation.DeveloperApi /** + * :: DeveloperApi :: * A TaskContext aware iterator. * * As the Python evaluation consumes the parent iterator in a separate thread, @@ -28,10 +29,12 @@ import org.apache.spark.TaskContext * which crashes the executor. * Thus, we should use [[ContextAwareIterator]] to stop consuming after the task ends. 
  */
-class ContextAwareIterator[IN](iter: Iterator[IN], context: TaskContext) extends Iterator[IN] {
+@DeveloperApi
+class ContextAwareIterator[+T](val context: TaskContext, val delegate: Iterator[T])
+  extends Iterator[T] {
 
   override def hasNext: Boolean =
-    !context.isCompleted() && !context.isInterrupted() && iter.hasNext
+    !context.isCompleted() && !context.isInterrupted() && delegate.hasNext
 
-  override def next(): IN = iter.next()
+  override def next(): T = delegate.next()
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
index e91a0f8865c4c..fca43e454bff5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
@@ -21,14 +21,14 @@ import java.io.File
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.apache.spark.{SparkEnv, TaskContext}
+import org.apache.spark.{ContextAwareIterator, SparkEnv, TaskContext}
 import org.apache.spark.api.python.ChainedPythonFunctions
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.UnaryExecNode
 import org.apache.spark.sql.types.{DataType, StructField, StructType}
-import org.apache.spark.util.{ContextAwareIterator, Utils}
+import org.apache.spark.util.Utils
 
 
 /**
@@ -89,7 +89,7 @@ trait EvalPythonExec extends UnaryExecNode {
 
     inputRDD.mapPartitions { iter =>
       val context = TaskContext.get()
-      val contextAwareIterator = new ContextAwareIterator(iter, context)
+      val contextAwareIterator = new ContextAwareIterator(context, iter)
 
       // The queue used to buffer input rows so we can drain it to
       // combine input with output from Python.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala
index ce83c6ebb44b9..71f51f1abc6f5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.python
 
 import scala.collection.JavaConverters._
 
-import org.apache.spark.TaskContext
+import org.apache.spark.{ContextAwareIterator, TaskContext}
 import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
@@ -29,7 +29,6 @@ import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
 import org.apache.spark.sql.types.{StructField, StructType}
 import org.apache.spark.sql.util.ArrowUtils
 import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
-import org.apache.spark.util.ContextAwareIterator
 
 /**
  * A relation produced by applying a function that takes an iterator of pandas DataFrames
@@ -63,7 +62,7 @@ case class MapInPandasExec(
     val outputTypes = child.schema
 
     val context = TaskContext.get()
-    val contextAwareIterator = new ContextAwareIterator(inputIter, context)
+    val contextAwareIterator = new ContextAwareIterator(context, inputIter)
 
     // Here we wrap it via another row so that Python sides understand it
     // as a DataFrame.
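
Reviewer note (not part of the patches above): a minimal sketch of how the final
PATCH 3/3 API is meant to be used from operator code. The helper name
`wrapForTask` is hypothetical and only for illustration; `TaskContext.get()` and
the `ContextAwareIterator(context, delegate)` constructor are the APIs the
patches themselves define and use.

    import org.apache.spark.{ContextAwareIterator, TaskContext}

    // Wrap a partition iterator so that a consumer running on another thread
    // (e.g. the Python writer thread) stops pulling rows once the task has
    // completed or been interrupted, instead of touching a closed parent.
    def wrapForTask[T](iter: Iterator[T]): Iterator[T] = {
      // TaskContext.get() is non-null only on an executor thread running a task.
      val context = TaskContext.get()
      new ContextAwareIterator(context, iter)
    }

Like InterruptibleIterator, which the final patch mirrors, the wrapper re-checks
the task state inside hasNext on every pull, so the consuming thread sees
hasNext == false as soon as the task is marked completed or interrupted and
never delegates to the already-closed parent iterator.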