-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Fixes a bug when running queries with a limit clause #16643
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3dd9dca
85c32e8
e0e641c
d220889
3bb783f
809f90a
f092860
defb0c7
5be62a4
fc6d54e
604113c
c7ddafc
3ebae3a
8db7a05
1386ed7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,6 +34,7 @@ | |
| import org.apache.druid.msq.querykit.DataSourcePlan; | ||
| import org.apache.druid.msq.querykit.QueryKit; | ||
| import org.apache.druid.msq.querykit.QueryKitUtils; | ||
| import org.apache.druid.msq.querykit.ShuffleSpecFactories; | ||
| import org.apache.druid.msq.querykit.ShuffleSpecFactory; | ||
| import org.apache.druid.msq.querykit.common.OffsetLimitFrameProcessorFactory; | ||
| import org.apache.druid.msq.util.MultiStageQueryContext; | ||
|
|
@@ -111,69 +112,77 @@ public QueryDefinition makeQueryDefinition( | |
| final ScanQuery queryToRun = originalQuery.withDataSource(dataSourcePlan.getNewDataSource()); | ||
| final int firstStageNumber = Math.max(minStageNumber, queryDefBuilder.getNextStageNumber()); | ||
| final RowSignature scanSignature = getAndValidateSignature(queryToRun, jsonMapper); | ||
| final ShuffleSpec shuffleSpec; | ||
| final RowSignature signatureToUse; | ||
| final boolean hasLimitOrOffset = queryToRun.isLimited() || queryToRun.getScanRowsOffset() > 0; | ||
|
|
||
| final RowSignature.Builder signatureBuilder = RowSignature.builder().addAll(scanSignature); | ||
| final Granularity segmentGranularity = | ||
| QueryKitUtils.getSegmentGranularityFromContext(jsonMapper, queryToRun.getContext()); | ||
| final List<KeyColumn> clusterByColumns = new ArrayList<>(); | ||
|
|
||
| // Add regular orderBys. | ||
| for (final ScanQuery.OrderBy orderBy : queryToRun.getOrderBys()) { | ||
| clusterByColumns.add( | ||
| new KeyColumn( | ||
| orderBy.getColumnName(), | ||
| orderBy.getOrder() == ScanQuery.Order.DESCENDING ? KeyOrder.DESCENDING : KeyOrder.ASCENDING | ||
| ) | ||
| ); | ||
| } | ||
|
|
||
| // We ignore the resultShuffleSpecFactory in case: | ||
| // 1. There is no cluster by | ||
| // 2. There is an offset which means everything gets funneled into a single partition hence we use MaxCountShuffleSpec | ||
| if (queryToRun.getOrderBys().isEmpty() && hasLimitOrOffset) { | ||
| shuffleSpec = MixShuffleSpec.instance(); | ||
| signatureToUse = scanSignature; | ||
| } else { | ||
| final RowSignature.Builder signatureBuilder = RowSignature.builder().addAll(scanSignature); | ||
| final Granularity segmentGranularity = | ||
| QueryKitUtils.getSegmentGranularityFromContext(jsonMapper, queryToRun.getContext()); | ||
| final List<KeyColumn> clusterByColumns = new ArrayList<>(); | ||
|
|
||
| // Add regular orderBys. | ||
| for (final ScanQuery.OrderBy orderBy : queryToRun.getOrderBys()) { | ||
| clusterByColumns.add( | ||
| new KeyColumn( | ||
| orderBy.getColumnName(), | ||
| orderBy.getOrder() == ScanQuery.Order.DESCENDING ? KeyOrder.DESCENDING : KeyOrder.ASCENDING | ||
| ) | ||
| ); | ||
| } | ||
|
|
||
| // Update partition by of next window | ||
| final RowSignature signatureSoFar = signatureBuilder.build(); | ||
| boolean addShuffle = true; | ||
| if (originalQuery.getContext().containsKey(MultiStageQueryContext.NEXT_WINDOW_SHUFFLE_COL)) { | ||
| final ClusterBy windowClusterBy = (ClusterBy) originalQuery.getContext() | ||
| .get(MultiStageQueryContext.NEXT_WINDOW_SHUFFLE_COL); | ||
| for (KeyColumn c : windowClusterBy.getColumns()) { | ||
| if (!signatureSoFar.contains(c.columnName())) { | ||
| addShuffle = false; | ||
| break; | ||
| } | ||
| } | ||
| if (addShuffle) { | ||
| clusterByColumns.addAll(windowClusterBy.getColumns()); | ||
| // Update partition by of next window | ||
| final RowSignature signatureSoFar = signatureBuilder.build(); | ||
| boolean addShuffle = true; | ||
| if (originalQuery.getContext().containsKey(MultiStageQueryContext.NEXT_WINDOW_SHUFFLE_COL)) { | ||
| final ClusterBy windowClusterBy = (ClusterBy) originalQuery.getContext() | ||
| .get(MultiStageQueryContext.NEXT_WINDOW_SHUFFLE_COL); | ||
| for (KeyColumn c : windowClusterBy.getColumns()) { | ||
| if (!signatureSoFar.contains(c.columnName())) { | ||
| addShuffle = false; | ||
| break; | ||
| } | ||
| } else { | ||
| // Add partition boosting column. | ||
| clusterByColumns.add(new KeyColumn(QueryKitUtils.PARTITION_BOOST_COLUMN, KeyOrder.ASCENDING)); | ||
| signatureBuilder.add(QueryKitUtils.PARTITION_BOOST_COLUMN, ColumnType.LONG); | ||
| } | ||
| if (addShuffle) { | ||
| clusterByColumns.addAll(windowClusterBy.getColumns()); | ||
| } | ||
| } else { | ||
| // Add partition boosting column. | ||
| clusterByColumns.add(new KeyColumn(QueryKitUtils.PARTITION_BOOST_COLUMN, KeyOrder.ASCENDING)); | ||
| signatureBuilder.add(QueryKitUtils.PARTITION_BOOST_COLUMN, ColumnType.LONG); | ||
| } | ||
|
|
||
| final ClusterBy clusterBy = | ||
| QueryKitUtils.clusterByWithSegmentGranularity(new ClusterBy(clusterByColumns, 0), segmentGranularity); | ||
| final ShuffleSpec finalShuffleSpec = resultShuffleSpecFactory.build(clusterBy, false); | ||
|
|
||
| final ClusterBy clusterBy = | ||
| QueryKitUtils.clusterByWithSegmentGranularity(new ClusterBy(clusterByColumns, 0), segmentGranularity); | ||
| shuffleSpec = resultShuffleSpecFactory.build(clusterBy, false); | ||
| signatureToUse = QueryKitUtils.sortableSignature( | ||
| QueryKitUtils.signatureWithSegmentGranularity(signatureBuilder.build(), segmentGranularity), | ||
| clusterBy.getColumns() | ||
| ); | ||
| final RowSignature signatureToUse = QueryKitUtils.sortableSignature( | ||
| QueryKitUtils.signatureWithSegmentGranularity(signatureBuilder.build(), segmentGranularity), | ||
| clusterBy.getColumns() | ||
| ); | ||
|
|
||
| ShuffleSpec scanShuffleSpec; | ||
| if (!hasLimitOrOffset) { | ||
| // If there is no limit spec, apply the final shuffling here itself. This will ensure partition sizes etc are respected. | ||
| scanShuffleSpec = finalShuffleSpec; | ||
| } else { | ||
| // If there is a limit spec, check if there are any non-boost columns to sort in. | ||
| boolean requiresSort = clusterByColumns.stream() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we need to see if we are sorting on the non-boost column? Isn't that automatically added by the ScanQueryKit et al.? Maybe we can simplify the condition by checking if the orderBy is non-empty (before adding any boosting).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We have an if-branch for window functions above this bit of code, which adds its own cluster-bys, which are not dependent on the orderBy. |
||
| .anyMatch(keyColumn -> !QueryKitUtils.PARTITION_BOOST_COLUMN.equals(keyColumn.columnName())); | ||
| if (requiresSort) { | ||
| // If yes, do a sort into a single partition. | ||
| scanShuffleSpec = ShuffleSpecFactories.singlePartition().build(clusterBy, false); | ||
| } else { | ||
| // If the only clusterBy column is the boost column, we just use a mix shuffle to avoid unused shuffling. | ||
| // Note that we still need the boost column to be present in the row signature, since the limit stage would | ||
| // need it to be populated to do its own shuffling later. | ||
|
Comment on lines
+175
to
+176
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I was under the impression that the limit factory couldn't partition boost. Am I mistaken?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. On the other hand, if there isn't any partition boosting, then even the original code would have run into a similar problem if there wasn't a cluster key, right — too-large partitions.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The limit factory does not increment the partition boosting at regular intervals. The value would be all 0 if it was not added to the row signature at the scan stage.
The original code would have used a MixShuffleSpec. It would always produce a single partition, so yes, the partitions could have been too large. |
||
| scanShuffleSpec = MixShuffleSpec.instance(); | ||
| } | ||
| } | ||
|
|
||
| queryDefBuilder.add( | ||
| StageDefinition.builder(Math.max(minStageNumber, queryDefBuilder.getNextStageNumber())) | ||
| .inputs(dataSourcePlan.getInputSpecs()) | ||
| .broadcastInputs(dataSourcePlan.getBroadcastInputs()) | ||
| .shuffleSpec(shuffleSpec) | ||
| .shuffleSpec(scanShuffleSpec) | ||
| .signature(signatureToUse) | ||
| .maxWorkerCount(dataSourcePlan.isSingleWorker() ? 1 : maxWorkerCount) | ||
| .processorFactory(new ScanQueryFrameProcessorFactory(queryToRun)) | ||
|
|
@@ -185,7 +194,7 @@ public QueryDefinition makeQueryDefinition( | |
| .inputs(new StageInputSpec(firstStageNumber)) | ||
| .signature(signatureToUse) | ||
| .maxWorkerCount(1) | ||
| .shuffleSpec(null) // no shuffling should be required after a limit processor. | ||
| .shuffleSpec(finalShuffleSpec) // Apply the final shuffling after limit spec. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This seems like a lot of shuffling. Is there any way we can avoid reshuffling the data by the same cluster-by, and just repartition? Perhaps not without any supersorter changes, but I wanted to confirm.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Well, maybe we can preserve the optimisation that we had earlier. Actually, we don't need to partition-boost the intermediate shuffle spec in any case (i.e. the shuffleSpec for the scan stage if there's a limit present) — since it's all going into a single partition anyway, the partitionBoost won't have any use.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The initial ScanFP is the one that increments the boost column; if we do not apply the boost column at that stage, the limit processor's output would have boost columns of 0, which can't be split.
adarshsanjeev marked this conversation as resolved.
|
||
| .processorFactory( | ||
| new OffsetLimitFrameProcessorFactory( | ||
| queryToRun.getScanRowsOffset(), | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.