-
Notifications
You must be signed in to change notification settings - Fork 29.2k
[SPARK-48949][SQL] SPJ: Runtime partition filtering #47426
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -429,8 +429,19 @@ case class EnsureRequirements( | |
| // expressions | ||
| val partitionExprs = leftSpec.partitioning.expressions | ||
|
|
||
| var mergedPartValues = InternalRowComparableWrapper | ||
| .mergePartitions(leftSpec.partitioning, rightSpec.partitioning, partitionExprs) | ||
| // in case of compatible but not identical partition expressions, we apply 'reduce' | ||
| // transforms to group one side's partitions as well as the common partition values | ||
| val leftReducers = leftSpec.reducers(rightSpec) | ||
| val leftParts = reducePartValues(leftSpec.partitioning.partitionValues, | ||
| partitionExprs, | ||
| leftReducers) | ||
| val rightReducers = rightSpec.reducers(leftSpec) | ||
| val rightParts = reducePartValues(rightSpec.partitioning.partitionValues, | ||
| partitionExprs, | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
For compatible partition expressions, it just requires
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Yes, you may be right — let me double-check this with a test and get back to you. |
||
| rightReducers) | ||
|
|
||
| // merge values on both sides | ||
| var mergedPartValues = mergePartitions(leftParts, rightParts, partitionExprs, joinType) | ||
| .map(v => (v, 1)) | ||
|
|
||
| logInfo(log"After merging, there are " + | ||
|
|
@@ -525,23 +536,6 @@ case class EnsureRequirements( | |
| } | ||
| } | ||
|
|
||
| // in case of compatible but not identical partition expressions, we apply 'reduce' | ||
| // transforms to group one side's partitions as well as the common partition values | ||
| val leftReducers = leftSpec.reducers(rightSpec) | ||
| val rightReducers = rightSpec.reducers(leftSpec) | ||
|
|
||
| if (leftReducers.isDefined || rightReducers.isDefined) { | ||
| mergedPartValues = reduceCommonPartValues(mergedPartValues, | ||
| leftSpec.partitioning.expressions, | ||
| leftReducers) | ||
| mergedPartValues = reduceCommonPartValues(mergedPartValues, | ||
| rightSpec.partitioning.expressions, | ||
| rightReducers) | ||
| val rowOrdering = RowOrdering | ||
| .createNaturalAscendingOrdering(partitionExprs.map(_.dataType)) | ||
| mergedPartValues = mergedPartValues.sorted(rowOrdering.on((t: (InternalRow, _)) => t._1)) | ||
| } | ||
|
|
||
| // Now we need to push-down the common partition information to the scan in each child | ||
| newLeft = populateCommonPartitionInfo(left, mergedPartValues, leftSpec.joinKeyPositions, | ||
| leftReducers, applyPartialClustering, replicateLeftSide) | ||
|
|
@@ -602,15 +596,15 @@ case class EnsureRequirements( | |
| child, joinKeyPositions)) | ||
| } | ||
|
|
||
| private def reduceCommonPartValues( | ||
| commonPartValues: Seq[(InternalRow, Int)], | ||
| private def reducePartValues( | ||
| partValues: Seq[InternalRow], | ||
| expressions: Seq[Expression], | ||
| reducers: Option[Seq[Option[Reducer[_, _]]]]) = { | ||
| reducers match { | ||
| case Some(reducers) => commonPartValues.groupBy { case (row, _) => | ||
| case Some(reducers) => partValues.map { row => | ||
| KeyGroupedShuffleSpec.reducePartitionValue(row, expressions, reducers) | ||
| }.map{ case(wrapper, splits) => (wrapper.row, splits.map(_._2).sum) }.toSeq | ||
| case _ => commonPartValues | ||
| }.distinct.map(_.row) | ||
| case _ => partValues | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -651,6 +645,46 @@ case class EnsureRequirements( | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Merge and sort partitions values for SPJ and optionally enable partition filtering. | ||
| * Both sides must have | ||
| * matching partition expressions. | ||
| * @param leftPartitioning left side partition values | ||
| * @param rightPartitioning right side partition values | ||
| * @param partitionExpression partition expressions | ||
| * @param joinType join type for optional partition filtering | ||
| * @return merged and sorted partition values | ||
| */ | ||
| private def mergePartitions( | ||
| leftPartitioning: Seq[InternalRow], | ||
| rightPartitioning: Seq[InternalRow], | ||
| partitionExpression: Seq[Expression], | ||
| joinType: JoinType): Seq[InternalRow] = { | ||
|
|
||
| val merged = if (SQLConf.get.getConf(SQLConf.V2_BUCKETING_PARTITION_FILTER_ENABLED)) { | ||
| joinType match { | ||
| case Inner => InternalRowComparableWrapper.mergePartitions( | ||
| leftPartitioning, rightPartitioning, partitionExpression, intersect = true) | ||
| case LeftOuter => leftPartitioning.map( | ||
| InternalRowComparableWrapper(_, partitionExpression)) | ||
| case RightOuter => rightPartitioning.map( | ||
| InternalRowComparableWrapper(_, partitionExpression)) | ||
| case _ => InternalRowComparableWrapper.mergePartitions(leftPartitioning, | ||
| rightPartitioning, partitionExpression) | ||
| } | ||
| } else { | ||
| InternalRowComparableWrapper.mergePartitions(leftPartitioning, rightPartitioning, | ||
| partitionExpression) | ||
| } | ||
|
|
||
| // SPARK-41471: We keep the order of partitions to make sure the order of | ||
| // partitions is deterministic in different cases. | ||
| val partitionOrdering: Ordering[InternalRow] = { | ||
| RowOrdering.createNaturalAscendingOrdering(partitionExpression.map(_.dataType)) | ||
| } | ||
| merged.map(_.row).sorted(partitionOrdering) | ||
| } | ||
|
|
||
| def apply(plan: SparkPlan): SparkPlan = { | ||
| val newPlan = plan.transformUp { | ||
| case operator @ ShuffleExchangeExec(upper: HashPartitioning, child, shuffleOrigin, _) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do we still need to sort the result partitions?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ah I see it is sorted later in the other method now