-
Notifications
You must be signed in to change notification settings - Fork 307
fix: Native window operator should be CometUnaryExec #774
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,20 +19,15 @@ | |
|
|
||
| package org.apache.spark.sql.comet | ||
|
|
||
| import scala.collection.JavaConverters.asJavaIterableConverter | ||
|
|
||
| import org.apache.spark.rdd.RDD | ||
| import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Expression, NamedExpression, SortOrder, WindowExpression} | ||
| import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, NamedExpression, SortOrder} | ||
| import org.apache.spark.sql.catalyst.plans.physical.Partitioning | ||
| import org.apache.spark.sql.comet.CometWindowExec.getNativePlan | ||
| import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec.{METRIC_NATIVE_TIME_DESCRIPTION, METRIC_NATIVE_TIME_NAME} | ||
| import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} | ||
| import org.apache.spark.sql.execution.SparkPlan | ||
| import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics, SQLShuffleReadMetricsReporter, SQLShuffleWriteMetricsReporter} | ||
| import org.apache.spark.sql.vectorized.ColumnarBatch | ||
|
|
||
| import org.apache.comet.serde.OperatorOuterClass | ||
| import com.google.common.base.Objects | ||
|
|
||
| import org.apache.comet.serde.OperatorOuterClass.Operator | ||
| import org.apache.comet.serde.QueryPlanSerde.{exprToProto, serializeDataType, windowExprToProto} | ||
|
|
||
| /** | ||
| * Comet physical plan node for Spark `WindowsExec`. | ||
|
|
@@ -42,14 +37,15 @@ import org.apache.comet.serde.QueryPlanSerde.{exprToProto, serializeDataType, wi | |
| * executions separated by a Comet shuffle exchange. | ||
| */ | ||
| case class CometWindowExec( | ||
| override val nativeOp: Operator, | ||
| override val originalPlan: SparkPlan, | ||
| override val output: Seq[Attribute], | ||
| windowExpression: Seq[NamedExpression], | ||
| partitionSpec: Seq[Expression], | ||
| orderSpec: Seq[SortOrder], | ||
| child: SparkPlan) | ||
| extends CometExec | ||
| with UnaryExecNode { | ||
| child: SparkPlan, | ||
| override val serializedPlanOpt: SerializedPlan) | ||
| extends CometUnaryExec { | ||
|
|
||
| override def nodeName: String = "CometWindowExec" | ||
|
|
||
|
|
@@ -65,71 +61,27 @@ case class CometWindowExec( | |
| sparkContext, | ||
| "number of partitions")) ++ readMetrics ++ writeMetrics | ||
|
|
||
| override def supportsColumnar: Boolean = true | ||
|
|
||
| protected override def doExecuteColumnar(): RDD[ColumnarBatch] = { | ||
| val childRDD = child.executeColumnar() | ||
|
|
||
| childRDD.mapPartitionsInternal { iter => | ||
| CometExec.getCometIterator( | ||
| Seq(iter), | ||
| getNativePlan(output, windowExpression, partitionSpec, orderSpec, child).get) | ||
| } | ||
| } | ||
|
|
||
| override def outputOrdering: Seq[SortOrder] = child.outputOrdering | ||
|
|
||
| override def outputPartitioning: Partitioning = child.outputPartitioning | ||
|
|
||
| protected def withNewChildInternal(newChild: SparkPlan): SparkPlan = | ||
| this.copy(child = newChild) | ||
|
|
||
| } | ||
|
|
||
| object CometWindowExec { | ||
| def getNativePlan( | ||
| outputAttributes: Seq[Attribute], | ||
| windowExpression: Seq[NamedExpression], | ||
| partitionSpec: Seq[Expression], | ||
| orderSpec: Seq[SortOrder], | ||
| child: SparkPlan): Option[Operator] = { | ||
|
|
||
| val orderSpecs = orderSpec.map(exprToProto(_, child.output)) | ||
| val partitionSpecs = partitionSpec.map(exprToProto(_, child.output)) | ||
| val scanBuilder = OperatorOuterClass.Scan.newBuilder() | ||
| val scanOpBuilder = OperatorOuterClass.Operator.newBuilder() | ||
|
|
||
| val scanTypes = outputAttributes.flatten { attr => | ||
| serializeDataType(attr.dataType) | ||
| override def stringArgs: Iterator[Any] = | ||
| Iterator(output, windowExpression, partitionSpec, orderSpec, child) | ||
|
|
||
| override def equals(obj: Any): Boolean = { | ||
| obj match { | ||
| case other: CometWindowExec => | ||
| this.windowExpression == other.windowExpression && this.child == other.child && | ||
| this.partitionSpec == other.partitionSpec && this.orderSpec == other.orderSpec && | ||
| this.serializedPlanOpt == other.serializedPlanOpt | ||
| case _ => | ||
| false | ||
| } | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This native plan is incorrect: it uses the window operator's output as the pseudo scan's output, but it should be the child plan's output. The incorrect output causes errors in the DataFusion Window operator.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Besides, the native window operator requires correct partitioning and ordering information from its child operator. The current native scan operator doesn't have this information, which also prevents the DataFusion Window operator from producing correct ordering and partition-by info. |
||
|
|
||
| val windowExprs = windowExpression.map(w => | ||
| windowExprToProto( | ||
| w.asInstanceOf[Alias].child.asInstanceOf[WindowExpression], | ||
| outputAttributes)) | ||
|
|
||
| val windowBuilder = OperatorOuterClass.Window | ||
| .newBuilder() | ||
|
|
||
| if (windowExprs.forall(_.isDefined)) { | ||
| windowBuilder | ||
| .addAllWindowExpr(windowExprs.map(_.get).asJava) | ||
|
|
||
| if (orderSpecs.forall(_.isDefined)) { | ||
| windowBuilder.addAllOrderByList(orderSpecs.map(_.get).asJava) | ||
| } | ||
|
|
||
| if (partitionSpecs.forall(_.isDefined)) { | ||
| windowBuilder.addAllPartitionByList(partitionSpecs.map(_.get).asJava) | ||
| } | ||
|
|
||
| scanBuilder.addAllFields(scanTypes.asJava) | ||
|
|
||
| val opBuilder = OperatorOuterClass.Operator | ||
| .newBuilder() | ||
| .addChildren(scanOpBuilder.setScan(scanBuilder)) | ||
|
|
||
| Some(opBuilder.setWindow(windowBuilder).build()) | ||
| } else None | ||
| } | ||
|
|
||
| override def hashCode(): Int = | ||
| Objects.hashCode(windowExpression, partitionSpec, orderSpec, child) | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because
`Window` is a native operator, we should chain it into the native operators instead of creating a pseudo scan for it. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It can also improve performance, as we don't need an additional native/JVM bridge.