diff --git a/docs/multi-stage-query/reference.md b/docs/multi-stage-query/reference.md
index 19b1740b9d4d..8c0e0ff15b37 100644
--- a/docs/multi-stage-query/reference.md
+++ b/docs/multi-stage-query/reference.md
@@ -399,6 +399,7 @@ The following table lists the context parameters for the MSQ task engine:
 | `rowsPerPage` | SELECT<br /><br />The number of rows per page to target. The actual number of rows per page may be somewhat higher or lower than this number. In most cases, use the default.<br /><br />This property comes into effect only when `selectDestination` is set to `durableStorage` | 100000 |
 | `skipTypeVerification` | INSERT or REPLACE<br /><br />During query validation, Druid validates that [string arrays](../querying/arrays.md) and [multi-value dimensions](../querying/multi-value-dimensions.md) are not mixed in the same column. If you are intentionally migrating from one to the other, use this context parameter to disable type validation.<br /><br />Provide the column list as comma-separated values or as a JSON array in string form. | empty list |
 | `failOnEmptyInsert` | INSERT or REPLACE<br /><br />When set to false (the default), an INSERT query generating no output rows will be no-op, and a REPLACE query generating no output rows will delete all data that matches the OVERWRITE clause. When set to true, an ingest query generating no output rows will throw an `InsertCannotBeEmpty` fault. | `false` |
+| `storeCompactionState` | REPLACE<br /><br />When set to true, a REPLACE query stores a `lastCompactionState` field in each segment's metadata, capturing the specs used to create the segment. Future compaction jobs skip segments whose `lastCompactionState` matches the desired compaction state. Works the same as the [`storeCompactionState`](../ingestion/tasks.md#context-parameters) task context flag. | `false` |
 
 ## Joins
 
diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java
index c29259e318c6..13afc358c337 100644
--- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java
+++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java
@@ -20,6 +20,7 @@
 package org.apache.druid.msq.exec;
 
 import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.core.type.TypeReference;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableList;
@@ -41,6 +42,7 @@
 import it.unimi.dsi.fastutil.ints.IntSet;
 import org.apache.calcite.sql.type.SqlTypeName;
 import org.apache.druid.client.ImmutableSegmentLoadInfo;
+import org.apache.druid.client.indexing.ClientCompactionTaskTransformSpec;
 import org.apache.druid.common.guava.FutureUtils;
 import org.apache.druid.data.input.StringTuple;
 import org.apache.druid.data.input.impl.DimensionSchema;
@@ -62,6 +64,9 @@
 import org.apache.druid.frame.write.InvalidNullByteException;
 import org.apache.druid.indexer.TaskState;
 import org.apache.druid.indexer.TaskStatus;
+import org.apache.druid.indexer.partitions.DimensionRangePartitionsSpec;
+import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;
+import org.apache.druid.indexer.partitions.PartitionsSpec;
 import org.apache.druid.indexing.common.LockGranularity;
 import org.apache.druid.indexing.common.TaskLock;
 import org.apache.druid.indexing.common.TaskLockType;
@@ -76,6 +81,7 @@
 import org.apache.druid.indexing.common.actions.SegmentTransactionalReplaceAction;
 import org.apache.druid.indexing.common.actions.TaskAction;
 import org.apache.druid.indexing.common.actions.TaskActionClient;
+import org.apache.druid.indexing.common.task.Tasks;
 import org.apache.druid.indexing.common.task.batch.TooManyBucketsException;
 import org.apache.druid.indexing.common.task.batch.parallel.TombstoneHelper;
 import org.apache.druid.indexing.overlord.SegmentPublishResult;
@@ -191,6 +197,7 @@
 import org.apache.druid.query.operator.WindowOperatorQuery;
 import org.apache.druid.query.scan.ScanQuery;
 import org.apache.druid.segment.DimensionHandlerUtils;
+import org.apache.druid.segment.IndexSpec;
 import org.apache.druid.segment.column.ColumnHolder;
 import org.apache.druid.segment.column.ColumnType;
 import org.apache.druid.segment.column.RowSignature;
@@ -198,6 +205,7 @@
 import org.apache.druid.segment.indexing.DataSchema;
 import org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec;
 import org.apache.druid.segment.indexing.granularity.GranularitySpec;
+import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec;
 import org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec;
 import org.apache.druid.segment.transform.TransformSpec;
 import org.apache.druid.server.DruidNode;
@@ -207,6 +215,7 @@
 import org.apache.druid.sql.calcite.rel.DruidQuery;
 import org.apache.druid.sql.http.ResultFormat;
 import org.apache.druid.storage.ExportStorageProvider;
+import org.apache.druid.timeline.CompactionState;
 import org.apache.druid.timeline.DataSegment;
 import org.apache.druid.timeline.SegmentTimeline;
 import org.apache.druid.timeline.partition.DimensionRangeShardSpec;
@@ -230,6 +239,7 @@
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 import java.util.Optional;
 import java.util.Queue;
 import java.util.Set;
@@ -241,6 +251,7 @@
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.Consumer;
+import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 import java.util.stream.StreamSupport;
@@ -1731,12 +1742,114 @@ private void publishSegmentsIfNeeded(
       //noinspection unchecked
       @SuppressWarnings("unchecked")
-      final Set<DataSegment> segments = (Set<DataSegment>) queryKernel.getResultObjectForStage(finalStageId);
+      Set<DataSegment> segments = (Set<DataSegment>) queryKernel.getResultObjectForStage(finalStageId);
+
+      boolean storeCompactionState = QueryContext.of(task.getQuerySpec().getQuery().getContext())
+                                                 .getBoolean(
+                                                     Tasks.STORE_COMPACTION_STATE_KEY,
+                                                     Tasks.DEFAULT_STORE_COMPACTION_STATE
+                                                 );
+
+      if (!segments.isEmpty() && storeCompactionState) {
+        DataSourceMSQDestination destination = (DataSourceMSQDestination) task.getQuerySpec().getDestination();
+        if (!destination.isReplaceTimeChunks()) {
+          // Store compaction state only for REPLACE queries.
+          log.warn(
+              "storeCompactionState flag set for a non-REPLACE query [%s]. Ignoring the flag for now.",
+              queryDef.getQueryId()
+          );
+        } else {
+          DataSchema dataSchema = ((SegmentGeneratorFrameProcessorFactory) queryKernel
+              .getStageDefinition(finalStageId).getProcessorFactory()).getDataSchema();
+
+          ShardSpec shardSpec = segments.stream().findFirst().get().getShardSpec();
+
+          Function<Set<DataSegment>, Set<DataSegment>> compactionStateAnnotateFunction = addCompactionStateToSegments(
+              task,
+              context.jsonMapper(),
+              dataSchema,
+              shardSpec,
+              queryDef.getQueryId()
+          );
+          segments = compactionStateAnnotateFunction.apply(segments);
+        }
+      }
 
       log.info("Query [%s] publishing %d segments.", queryDef.getQueryId(), segments.size());
       publishAllSegments(segments);
     }
   }
 
+  private static Function<Set<DataSegment>, Set<DataSegment>> addCompactionStateToSegments(
+      MSQControllerTask task,
+      ObjectMapper jsonMapper,
+      DataSchema dataSchema,
+      ShardSpec shardSpec,
+      String queryId
+  )
+  {
+    final MSQTuningConfig tuningConfig = task.getQuerySpec().getTuningConfig();
+    PartitionsSpec partitionSpec;
+
+    if (Objects.equals(shardSpec.getType(), ShardSpec.Type.RANGE)) {
+      List<String> partitionDimensions = ((DimensionRangeShardSpec) shardSpec).getDimensions();
+      partitionSpec = new DimensionRangePartitionsSpec(
+          tuningConfig.getRowsPerSegment(),
+          null,
+          partitionDimensions,
+          false
+      );
+    } else if (Objects.equals(shardSpec.getType(), ShardSpec.Type.NUMBERED)) {
+      // MSQ tasks don't use maxTotalRows. Hence, using Long.MAX_VALUE.
+      partitionSpec = new DynamicPartitionsSpec(tuningConfig.getRowsPerSegment(), Long.MAX_VALUE);
+    } else {
+      // SingleDimensionShardSpec and other shard specs are never created in MSQ.
+      throw new MSQException(
+          UnknownFault.forMessage(
+              StringUtils.format(
+                  "Query[%s] cannot store compaction state in segments, as the shard spec is of unsupported type[%s].",
+                  queryId,
+                  shardSpec.getType()
+              )));
+    }
+
+    Granularity segmentGranularity = ((DataSourceMSQDestination) task.getQuerySpec().getDestination())
+        .getSegmentGranularity();
+
+    GranularitySpec granularitySpec = new UniformGranularitySpec(
+        segmentGranularity,
+        dataSchema.getGranularitySpec().getQueryGranularity(),
+        dataSchema.getGranularitySpec().isRollup(),
+        dataSchema.getGranularitySpec().inputIntervals()
+    );
+
+    DimensionsSpec dimensionsSpec = dataSchema.getDimensionsSpec();
+    Map<String, Object> transformSpec = TransformSpec.NONE.equals(dataSchema.getTransformSpec())
+                                        ? null
+                                        : new ClientCompactionTaskTransformSpec(
+                                            dataSchema.getTransformSpec().getFilter()
+                                        ).asMap(jsonMapper);
+    List<Object> metricsSpec = dataSchema.getAggregators() == null
+                               ? null
+                               : jsonMapper.convertValue(
+                                   dataSchema.getAggregators(),
+                                   new TypeReference<List<Object>>() {}
+                               );
+
+    IndexSpec indexSpec = tuningConfig.getIndexSpec();
+
+    log.info("Query[%s] storing compaction state in segments.", queryId);
+
+    return CompactionState.addCompactionStateToSegments(
+        partitionSpec,
+        dimensionsSpec,
+        metricsSpec,
+        transformSpec,
+        indexSpec.asMap(jsonMapper),
+        granularitySpec.asMap(jsonMapper)
+    );
+  }
+
   /**
    * Clean up durable storage, if used for stage output.
    *

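Taken together, the controller changes above do two things: map the shard spec of the final stage's segments onto an equivalent PartitionsSpec, and fold the resulting CompactionState into every published segment. A minimal sketch of the mapping step, using illustrative standalone names (PartitionsSpecSketch and derivePartitionsSpec are not part of this patch):

    import java.util.List;
    import java.util.Objects;
    import org.apache.druid.indexer.partitions.DimensionRangePartitionsSpec;
    import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;
    import org.apache.druid.indexer.partitions.PartitionsSpec;
    import org.apache.druid.timeline.partition.DimensionRangeShardSpec;
    import org.apache.druid.timeline.partition.ShardSpec;

    class PartitionsSpecSketch
    {
      // Mirrors the RANGE/NUMBERED branches in the addCompactionStateToSegments method above.
      static PartitionsSpec derivePartitionsSpec(ShardSpec shardSpec, int rowsPerSegment)
      {
        if (Objects.equals(shardSpec.getType(), ShardSpec.Type.RANGE)) {
          // Range-partitioned segments carry their partition dimensions with them.
          List<String> dims = ((DimensionRangeShardSpec) shardSpec).getDimensions();
          return new DimensionRangePartitionsSpec(rowsPerSegment, null, dims, false);
        } else if (Objects.equals(shardSpec.getType(), ShardSpec.Type.NUMBERED)) {
          // MSQ does not track maxTotalRows, so Long.MAX_VALUE stands in for "unlimited".
          return new DynamicPartitionsSpec(rowsPerSegment, Long.MAX_VALUE);
        } else {
          // Single-dimension and other shard specs are never produced by MSQ.
          throw new IllegalArgumentException("Unsupported shard spec type: " + shardSpec.getType());
        }
      }
    }

Any other shard spec type therefore fails fast, matching the MSQException raised in the patch.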
@@ -1797,7 +1910,9 @@ private static QueryDefinition makeQueryDefinition(
       }
     } else {
       shuffleSpecFactory = querySpec.getDestination()
-                                    .getShuffleSpecFactory(MultiStageQueryContext.getRowsPerPage(querySpec.getQuery().context()));
+                                    .getShuffleSpecFactory(
+                                        MultiStageQueryContext.getRowsPerPage(querySpec.getQuery().context())
+                                    );
       queryToPlan = querySpec.getQuery();
     }
 
@@ -1899,9 +2014,11 @@ private static QueryDefinition makeQueryDefinition(
       if (filesIterator.hasNext()) {
         throw DruidException.forPersona(DruidException.Persona.USER)
                             .ofCategory(DruidException.Category.RUNTIME_FAILURE)
-                            .build("Found files at provided export destination[%s]. Export is only allowed to "
-                                   + "an empty path. Please provide an empty path/subdirectory or move the existing files.",
-                                   exportStorageProvider.getBasePath());
+                            .build(
+                                "Found files at provided export destination[%s]. Export is only allowed to "
+                                + "an empty path. Please provide an empty path/subdirectory or move the existing files.",
+                                exportStorageProvider.getBasePath()
+                            );
       }
     }
     catch (IOException e) {
@@ -1933,7 +2050,6 @@ private static QueryDefinition makeQueryDefinition(
   }
 
-
   private static DataSchema generateDataSchema(
       MSQSpec querySpec,
       RowSignature querySignature,
@@ -2388,7 +2504,9 @@ private static MSQStatusReport makeStatusReport(
       workerStatsMap = taskLauncher.getWorkerStats();
     }
 
-    SegmentLoadStatusFetcher.SegmentLoadWaiterStatus status = segmentLoadWaiter == null ? null : segmentLoadWaiter.status();
+    SegmentLoadStatusFetcher.SegmentLoadWaiterStatus status = segmentLoadWaiter == null
+                                                              ? null
+                                                              : segmentLoadWaiter.status();
 
     return new MSQStatusReport(
         taskState,
diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/util/MultiStageQueryContext.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/util/MultiStageQueryContext.java
index 13f845defd32..60734b5b1dad 100644
--- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/util/MultiStageQueryContext.java
+++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/util/MultiStageQueryContext.java
@@ -131,7 +131,7 @@ public class MultiStageQueryContext
   public static final String DEFAULT_CLUSTER_STATISTICS_MERGE_MODE = ClusterStatisticsMergeMode.SEQUENTIAL.toString();
 
   public static final String CTX_ROWS_PER_SEGMENT = "rowsPerSegment";
-  static final int DEFAULT_ROWS_PER_SEGMENT = 3000000;
+  public static final int DEFAULT_ROWS_PER_SEGMENT = 3000000;
 
   public static final String CTX_ROWS_PER_PAGE = "rowsPerPage";
   static final int DEFAULT_ROWS_PER_PAGE = 100000;
diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQReplaceTest.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQReplaceTest.java
index 9a158f5aec88..9a4fb98666b3 100644
--- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQReplaceTest.java
+++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQReplaceTest.java
@@ -23,18 +23,33 @@
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.ImmutableSet;
 import org.apache.druid.common.config.NullHandling;
+import org.apache.druid.data.input.impl.DimensionSchema;
+import org.apache.druid.data.input.impl.DimensionsSpec;
+import org.apache.druid.data.input.impl.DoubleDimensionSchema;
+import org.apache.druid.data.input.impl.FloatDimensionSchema;
+import org.apache.druid.data.input.impl.LongDimensionSchema;
+import org.apache.druid.data.input.impl.StringDimensionSchema;
+import org.apache.druid.indexer.partitions.DimensionRangePartitionsSpec;
+import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;
+import org.apache.druid.indexer.partitions.PartitionsSpec;
 import org.apache.druid.indexing.common.TaskLockType;
 import org.apache.druid.indexing.common.actions.RetrieveUsedSegmentsAction;
 import org.apache.druid.indexing.common.task.Tasks;
 import org.apache.druid.java.util.common.DateTimes;
 import org.apache.druid.java.util.common.Intervals;
 import org.apache.druid.java.util.common.StringUtils;
+import org.apache.druid.java.util.common.granularity.GranularityType;
 import org.apache.druid.msq.indexing.report.MSQSegmentReport;
 import org.apache.druid.msq.test.CounterSnapshotMatcher;
 import org.apache.druid.msq.test.MSQTestBase;
 import org.apache.druid.msq.test.MSQTestTaskActionClient;
+import org.apache.druid.msq.util.MultiStageQueryContext;
+import org.apache.druid.segment.IndexSpec;
 import org.apache.druid.segment.column.ColumnType;
 import org.apache.druid.segment.column.RowSignature;
+import org.apache.druid.segment.indexing.granularity.GranularitySpec;
+import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec;
+import org.apache.druid.timeline.CompactionState;
 import org.apache.druid.timeline.DataSegment;
 import org.apache.druid.timeline.SegmentId;
 import org.apache.druid.timeline.partition.DimensionRangeShardSpec;
@@ -61,14 +76,12 @@ public class MSQReplaceTest extends MSQTestBase
 {
-  private static final String WITH_REPLACE_LOCK = "WITH_REPLACE_LOCK";
-  private static final Map<String, Object> QUERY_CONTEXT_WITH_REPLACE_LOCK =
+  private static final String WITH_REPLACE_LOCK_AND_COMPACTION_STATE = "with_replace_lock_and_compaction_state";
+  private static final Map<String, Object> QUERY_CONTEXT_WITH_REPLACE_LOCK_AND_COMPACTION_STATE =
       ImmutableMap.<String, Object>builder()
                   .putAll(DEFAULT_MSQ_CONTEXT)
-                  .put(
-                      Tasks.TASK_LOCK_TYPE,
-                      StringUtils.toLowerCase(TaskLockType.REPLACE.name())
-                  )
+                  .put(Tasks.TASK_LOCK_TYPE, StringUtils.toLowerCase(TaskLockType.REPLACE.name()))
+                  .put(Tasks.STORE_COMPACTION_STATE_KEY, true)
                   .build();
 
   public static Collection<Object[]> data()
@@ -78,8 +91,8 @@ public static Collection<Object[]> data()
         {DURABLE_STORAGE, DURABLE_STORAGE_MSQ_CONTEXT},
         {FAULT_TOLERANCE, FAULT_TOLERANCE_MSQ_CONTEXT},
         {PARALLEL_MERGE, PARALLEL_MERGE_MSQ_CONTEXT},
-        {WITH_REPLACE_LOCK, QUERY_CONTEXT_WITH_REPLACE_LOCK}
-    };
+        {WITH_REPLACE_LOCK_AND_COMPACTION_STATE, QUERY_CONTEXT_WITH_REPLACE_LOCK_AND_COMPACTION_STATE},
+    };
     return Arrays.asList(data);
   }
 
   @MethodSource("data")
@@ -161,6 +174,14 @@ public void testReplaceOnFooWithAll(String contextName, Map cont
                          .with().segmentRowsProcessed(6), 1, 0
                      )
+                     .setExpectedLastCompactionState(
+                         expectedCompactionState(
+                             context,
+                             Collections.emptyList(),
+                             Collections.singletonList(new FloatDimensionSchema("m1")),
+                             GranularityType.DAY
+                         )
+                     )
                      .verifyResults();
   }
 
@@ -211,6 +232,14 @@ public void testReplaceOnFooWithWhere(String contextName, Map co
                          .with().segmentRowsProcessed(1), 1, 0
                      )
+                     .setExpectedLastCompactionState(
+                         expectedCompactionState(
+                             context,
+                             Collections.emptyList(),
+                             Collections.singletonList(new FloatDimensionSchema("m1")),
+                             GranularityType.DAY
+                         )
+                     )
                      .verifyResults();
   }
 
@@ -293,6 +322,14 @@ public void testReplaceOnFoo1WithAllExtern(String contextName, Map c
                          .with().segmentRowsProcessed(2), 1, 0
                      )
+                     .setExpectedLastCompactionState(
+                         expectedCompactionState(
+                             context,
+                             Collections.emptyList(),
+                             Collections.singletonList(new FloatDimensionSchema("m1")),
+                             GranularityType.MONTH
+                         )
+                     )
                      .verifyResults();
   }
 
@@ -668,6 +737,14 @@ public void testReplaceWhereClauseLargerThanData(String contextName, Map contex
                      + "WHERE __time >= TIMESTAMP '2000-01-01' AND __time < TIMESTAMP '2000-01-03' "
                      + "PARTITIONED BY MONTH")
                      .setExpectedDataSource("foo")
-                     .setQueryContext(DEFAULT_MSQ_CONTEXT)
                      .setExpectedRowSignature(rowSignature)
                      .setQueryContext(context)
                      .setExpectedDestinationIntervals(Collections.singletonList(Intervals.of("2000-01-01T/2000-03-01T")))
@@ -749,6 +825,13 @@ public void testReplaceTimeChunks(String contextName, Map contex
                          ImmutableList.of(
                              new Object[]{946684800000L, 1.0f},
                              new Object[]{946771200000L, 2.0f}
+                         ))
+                     .setExpectedLastCompactionState(
+                         expectedCompactionState(
+                             context,
+                             Collections.emptyList(),
+                             Collections.singletonList(new FloatDimensionSchema("m1")),
+                             GranularityType.MONTH
                          )
                      )
                      .verifyResults();
@@ -807,6 +890,14 @@ public void testReplaceTimeChunksLargerThanData(String contextName, Map conte
                      .setQueryContext(context)
                      .setExpectedSegment(expectedFooSegments())
                      .setExpectedResultRows(expectedFooRows())
+                     .setExpectedLastCompactionState(
+                         expectedCompactionState(
+                             context,
+                             Collections.singletonList("dim1"),
+                             Arrays.asList(
+                                 new StringDimensionSchema("dim1"),
+                                 new LongDimensionSchema("cnt")
+                             ),
+                             GranularityType.DAY
+                         )
+                     )
                      .verifyResults();
   }
 
@@ -961,6 +1071,72 @@ public void testReplaceSegmentsInsertIntoNewTable(String contextName, Map<String, Object> context)
+  {
+    RowSignature rowSignature = RowSignature.builder()
+                                            .add("__time", ColumnType.LONG)
+                                            .add("m1", ColumnType.FLOAT)
+                                            .add("m2", ColumnType.DOUBLE)
+                                            .build();
+
+    testIngestQuery().setSql(" REPLACE INTO foobar "
+                             + "OVERWRITE ALL "
+                             + "SELECT __time, m1, m2 "
+                             + "FROM foo "
+                             + "PARTITIONED by TIME_FLOOR(__time, 'P3M') ")
+                     .setExpectedDataSource("foobar")
+                     .setExpectedRowSignature(rowSignature)
+                     .setQueryContext(context)
+                     .setExpectedDestinationIntervals(Intervals.ONLY_ETERNITY)
+                     .setExpectedSegment(
+                         ImmutableSet.of(
+                             SegmentId.of(
+                                 "foobar",
+                                 Intervals.of("2000-01-01T00:00:00.000Z/2000-04-01T00:00:00.000Z"),
+                                 "test",
+                                 0
+                             ),
+                             SegmentId.of(
+                                 "foobar",
+                                 Intervals.of("2001-01-01T00:00:00.000Z/2001-04-01T00:00:00.000Z"),
+                                 "test",
+                                 0
+                             )
+                         )
+                     )
+                     .setExpectedResultRows(
+                         ImmutableList.of(
+                             new Object[]{946684800000L, 1.0f, 1.0},
+                             new Object[]{946771200000L, 2.0f, 2.0},
+                             new Object[]{946857600000L, 3.0f, 3.0},
+                             new Object[]{978307200000L, 4.0f, 4.0},
+                             new Object[]{978393600000L, 5.0f, 5.0},
+                             new Object[]{978480000000L, 6.0f, 6.0}
+                         )
+                     )
+                     .setExpectedLastCompactionState(
+                         expectedCompactionState(
+                             context,
+                             Collections.emptyList(),
+                             Arrays.asList(new FloatDimensionSchema("m1"), new DoubleDimensionSchema("m2")),
+                             GranularityType.QUARTER
+                         )
+                     )
                      .verifyResults();
   }
 
@@ -1045,6 +1221,14 @@ public void testReplaceUnnestSegmentEntireTable(String contextName, Map
         expectedFooRows()
     ));
     return expectedRows;
   }
+
+  private CompactionState expectedCompactionState(
+      Map<String, Object> context,
+      List<String> partitionDimensions,
+      List<DimensionSchema> dimensions,
+      GranularityType segmentGranularity
+  )
+  {
+    if (!context.containsKey(Tasks.STORE_COMPACTION_STATE_KEY)
+        || !((Boolean) context.get(Tasks.STORE_COMPACTION_STATE_KEY))) {
+      return null;
+    }
+    PartitionsSpec partitionsSpec;
+    if (partitionDimensions.isEmpty()) {
+      partitionsSpec = new DynamicPartitionsSpec(MultiStageQueryContext.DEFAULT_ROWS_PER_SEGMENT, Long.MAX_VALUE);
+    } else {
+      partitionsSpec = new DimensionRangePartitionsSpec(
+          MultiStageQueryContext.DEFAULT_ROWS_PER_SEGMENT,
+          null,
+          partitionDimensions,
+          false
+      );
+    }
+    DimensionsSpec dimensionsSpec = new DimensionsSpec.Builder()
+                                                      .setDimensions(dimensions)
+                                                      .setDimensionExclusions(Collections.singletonList("__time"))
+                                                      .build();
+
+    IndexSpec indexSpec = new IndexSpec(null, null, null, null, null, null, null);
+    GranularitySpec granularitySpec = new UniformGranularitySpec(
+        segmentGranularity.getDefaultGranularity(),
+        GranularityType.NONE.getDefaultGranularity(),
+        false,
+        Intervals.ONLY_ETERNITY
+    );
+    List<Object> metricsSpec = Collections.emptyList();
+
+    return new CompactionState(
+        partitionsSpec,
+        dimensionsSpec,
+        metricsSpec,
+        null,
+        indexSpec.asMap(objectMapper),
+        granularitySpec.asMap(objectMapper)
+    );
+  }
 }
diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestBase.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestBase.java
index 7da53c4ee99a..57f052e6f369 100644
--- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestBase.java
+++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestBase.java
@@ -187,6 +187,7 @@
 import org.apache.druid.storage.StorageConnectorModule;
 import org.apache.druid.storage.StorageConnectorProvider;
 import org.apache.druid.storage.local.LocalFileStorageConnector;
+import org.apache.druid.timeline.CompactionState;
 import org.apache.druid.timeline.DataSegment;
 import org.apache.druid.timeline.PruneLoadSpec;
 import org.apache.druid.timeline.SegmentId;
@@ -855,6 +856,7 @@ public abstract class MSQTester<Builder extends MSQTester<Builder>>
   protected MSQSpec expectedMSQSpec = null;
   protected MSQTuningConfig expectedTuningConfig = null;
   protected Set<SegmentId> expectedSegments = null;
+  protected CompactionState expectedLastCompactionState = null;
   protected Set<Interval> expectedTombstoneIntervals = null;
   protected List<Object[]> expectedResultRows = null;
   protected Matcher<Throwable> expectedValidationErrorMatcher = null;
@@ -902,6 +904,12 @@ public Builder setExpectedSegment(Set<SegmentId> expectedSegments)
     return asBuilder();
   }
 
+  public Builder setExpectedLastCompactionState(CompactionState expectedLastCompactionState)
+  {
+    this.expectedLastCompactionState = expectedLastCompactionState;
+    return asBuilder();
+  }
+
   public Builder setExpectedTombstoneIntervals(Set<Interval> tombstoneIntervals)
   {
     Preconditions.checkArgument(!tombstoneIntervals.isEmpty(), "Segments cannot be empty");
@@ -1278,6 +1286,12 @@ public void verifyResults()
       // SegmentGeneratorFrameProcessorFactory. We can get the tombstone segment ids published by taking a set
      // difference of all the segments published with the segments that are created by the SegmentGeneratorFrameProcessorFactory
       if (!testTaskActionClient.getPublishedSegments().isEmpty()) {
+        if (expectedLastCompactionState != null) {
+          CompactionState compactionState = testTaskActionClient.getPublishedSegments().stream().findFirst().get()
+                                                                .getLastCompactionState();
+          Assert.assertEquals(expectedLastCompactionState, compactionState);
+        }
         Set<SegmentId> publishedSegmentIds = testTaskActionClient.getPublishedSegments()
                                                                  .stream()
                                                                  .map(DataSegment::getId)
@@ -1495,4 +1509,6 @@ private static List resultSignatureFromRowSignat
     }
     return retVal;
   }
+
 }
+
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/AbstractBatchIndexTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/AbstractBatchIndexTask.java
index 4a093cde2ce8..7c3608cd8d5a 100644
--- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/AbstractBatchIndexTask.java
+++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/AbstractBatchIndexTask.java
@@ -607,7 +607,7 @@ public static boolean isGuaranteedRollup(
     return tuningConfig.isForceGuaranteedRollup();
   }
 
-  public static Function<Set<DataSegment>, Set<DataSegment>> compactionStateAnnotateFunction(
+  public static Function<Set<DataSegment>, Set<DataSegment>> addCompactionStateToSegments(
       boolean storeCompactionState,
       TaskToolbox toolbox,
       IngestionSpec ingestionSpec
@@ -628,7 +628,7 @@
           ? null
           : toolbox.getJsonMapper().convertValue(ingestionSpec.getDataSchema().getAggregators(), new TypeReference<List<Object>>() {});
 
-      final CompactionState compactionState = new CompactionState(
+      return CompactionState.addCompactionStateToSegments(
           tuningConfig.getPartitionsSpec(),
           dimensionsSpec,
           metricsSpec,
@@ -636,10 +636,6 @@ public static Function<Set<DataSegment>, Set<DataSegment>> compactionStateAnnota
           transformSpec,
           tuningConfig.getIndexSpec().asMap(toolbox.getJsonMapper()),
           granularitySpec.asMap(toolbox.getJsonMapper())
       );
-      return segments -> segments
-          .stream()
-          .map(s -> s.withLastCompactionState(compactionState))
-          .collect(Collectors.toSet());
     } else {
       return Function.identity();
     }
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CompactionTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CompactionTask.java
index eabb9e062366..102019b94b09 100644
--- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CompactionTask.java
+++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CompactionTask.java
@@ -1221,10 +1221,7 @@ CompactionTuningConfig computeTuningConfig()
       final DynamicPartitionsSpec dynamicPartitionsSpec = (DynamicPartitionsSpec) partitionsSpec;
       partitionsSpec = new DynamicPartitionsSpec(
           dynamicPartitionsSpec.getMaxRowsPerSegment(),
-          // Setting maxTotalRows to Long.MAX_VALUE to respect the computed maxRowsPerSegment.
-          // If this is set to something too small, compactionTask can generate small segments
-          // which need to be compacted again, which in turn making auto compaction stuck in the same interval.
-          dynamicPartitionsSpec.getMaxTotalRowsOr(Long.MAX_VALUE)
+          dynamicPartitionsSpec.getMaxTotalRowsOr(DynamicPartitionsSpec.DEFAULT_COMPACTION_MAX_TOTAL_ROWS)
       );
     }
     return newTuningConfig.withPartitionsSpec(partitionsSpec);
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/IndexTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/IndexTask.java
index 532529ecfc19..c2160686ab26 100644
--- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/IndexTask.java
+++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/IndexTask.java
@@ -923,7 +923,7 @@ private TaskStatus generateAndPublishSegments(
         Tasks.DEFAULT_STORE_COMPACTION_STATE
     );
     final Function<Set<DataSegment>, Set<DataSegment>> annotateFunction =
-        compactionStateAnnotateFunction(
+        addCompactionStateToSegments(
             storeCompactionState,
             toolbox,
             ingestionSchema
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java
index 27694360ed95..826bfd243bc1 100644
--- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java
+++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java
@@ -1149,7 +1149,7 @@ private void publishSegments(
         Tasks.STORE_COMPACTION_STATE_KEY,
         Tasks.DEFAULT_STORE_COMPACTION_STATE
     );
-    final Function<Set<DataSegment>, Set<DataSegment>> annotateFunction = compactionStateAnnotateFunction(
+    final Function<Set<DataSegment>, Set<DataSegment>> annotateFunction = addCompactionStateToSegments(
         storeCompactionState,
         toolbox,
         ingestionSchema
diff --git a/processing/src/main/java/org/apache/druid/indexer/partitions/DynamicPartitionsSpec.java b/processing/src/main/java/org/apache/druid/indexer/partitions/DynamicPartitionsSpec.java
index 05dec7cb58e7..2c5d294f3c1c 100644
--- a/processing/src/main/java/org/apache/druid/indexer/partitions/DynamicPartitionsSpec.java
+++ b/processing/src/main/java/org/apache/druid/indexer/partitions/DynamicPartitionsSpec.java
@@ -34,6 +34,10 @@ public class DynamicPartitionsSpec implements PartitionsSpec
    * Default maxTotalRows for most task types except compaction task.
    */
   public static final long DEFAULT_MAX_TOTAL_ROWS = 20_000_000;
+  // Using Long.MAX_VALUE as the default maxTotalRows for compaction, to respect the computed maxRowsPerSegment.
+  // If this is set to something too small, a compaction task can generate small segments
+  // which need to be compacted again, which in turn makes auto compaction get stuck in the same interval.
+  public static final long DEFAULT_COMPACTION_MAX_TOTAL_ROWS = Long.MAX_VALUE;
   static final String NAME = "dynamic";
 
   private final int maxRowsPerSegment;
diff --git a/processing/src/main/java/org/apache/druid/timeline/CompactionState.java b/processing/src/main/java/org/apache/druid/timeline/CompactionState.java
index cb9ddf1a93b5..2c6e0d96c397 100644
--- a/processing/src/main/java/org/apache/druid/timeline/CompactionState.java
+++ b/processing/src/main/java/org/apache/druid/timeline/CompactionState.java
@@ -27,6 +27,9 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
 
 /**
  * This class describes what compaction task spec was used to create a given segment.
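The hunk below adds the shared factory itself; the IndexTask, ParallelIndexSupervisorTask, and MSQ controller call sites above all reduce to building the annotation function once and mapping it over the published segments. A hedged usage sketch (CompactionStateUsageSketch and annotateForPublish are illustrative names, not code from this patch):

    import java.util.List;
    import java.util.Map;
    import java.util.Set;
    import java.util.function.Function;
    import org.apache.druid.data.input.impl.DimensionsSpec;
    import org.apache.druid.indexer.partitions.PartitionsSpec;
    import org.apache.druid.timeline.CompactionState;
    import org.apache.druid.timeline.DataSegment;

    class CompactionStateUsageSketch
    {
      // The spec arguments are whatever specs actually produced the segments;
      // transformSpecMap may be null when no transform was applied.
      static Set<DataSegment> annotateForPublish(
          Set<DataSegment> publishedSegments,
          PartitionsSpec partitionsSpec,
          DimensionsSpec dimensionsSpec,
          List<Object> metricsSpec,
          Map<String, Object> transformSpecMap,
          Map<String, Object> indexSpecMap,
          Map<String, Object> granularitySpecMap
      )
      {
        Function<Set<DataSegment>, Set<DataSegment>> annotate =
            CompactionState.addCompactionStateToSegments(
                partitionsSpec,
                dimensionsSpec,
                metricsSpec,
                transformSpecMap,
                indexSpecMap,
                granularitySpecMap
            );
        // Every returned segment carries lastCompactionState, so auto-compaction can
        // skip it when the recorded state already matches the desired state.
        return annotate.apply(publishedSegments);
      }
    }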
@@ -146,4 +149,29 @@ public String toString()
            ", metricsSpec=" + metricsSpec +
            '}';
   }
+
+  public static Function<Set<DataSegment>, Set<DataSegment>> addCompactionStateToSegments(
+      PartitionsSpec partitionsSpec,
+      DimensionsSpec dimensionsSpec,
+      List<Object> metricsSpec,
+      Map<String, Object> transformSpec,
+      Map<String, Object> indexSpec,
+      Map<String, Object> granularitySpec
+  )
+  {
+    CompactionState compactionState = new CompactionState(
+        partitionsSpec,
+        dimensionsSpec,
+        metricsSpec,
+        transformSpec,
+        indexSpec,
+        granularitySpec
+    );
+
+    return segments -> segments
+        .stream()
+        .map(s -> s.withLastCompactionState(compactionState))
+        .collect(Collectors.toSet());
+  }
 }
diff --git a/processing/src/test/java/org/apache/druid/timeline/DataSegmentTest.java b/processing/src/test/java/org/apache/druid/timeline/DataSegmentTest.java
index 97c3d7a2aaac..3f0667b870c9 100644
--- a/processing/src/test/java/org/apache/druid/timeline/DataSegmentTest.java
+++ b/processing/src/test/java/org/apache/druid/timeline/DataSegmentTest.java
@@ -23,6 +23,7 @@
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.RangeSet;
 import org.apache.druid.data.input.impl.DimensionsSpec;
 import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;
@@ -47,6 +48,8 @@
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
 
 /**
  */
@@ -365,6 +368,59 @@ public void testWithLastCompactionState()
     Assert.assertEquals(segment1, segment2.withLastCompactionState(compactionState));
   }
 
+  @Test
+  public void testAnnotateWithLastCompactionState()
+  {
+    DynamicPartitionsSpec dynamicPartitionsSpec = new DynamicPartitionsSpec(null, null);
+    DimensionsSpec dimensionsSpec = new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of(
+        "bar",
+        "foo"
+    )));
+    List<Object> metricsSpec = ImmutableList.of(ImmutableMap.of("type", "count", "name", "count"));
+    Map<String, Object> transformSpec = ImmutableMap.of(
+        "filter",
+        ImmutableMap.of("type", "selector", "dimension", "dim1", "value", "foo")
+    );
+    Map<String, Object> indexSpec = Collections.singletonMap("test", "map");
+    Map<String, Object> granularitySpec = Collections.singletonMap("test2", "map");
+
+    final CompactionState compactionState = new CompactionState(
+        dynamicPartitionsSpec,
+        dimensionsSpec,
+        metricsSpec,
+        transformSpec,
+        indexSpec,
+        granularitySpec
+    );
+
+    final Function<Set<DataSegment>, Set<DataSegment>> addCompactionStateFunction =
+        CompactionState.addCompactionStateToSegments(
+            dynamicPartitionsSpec,
+            dimensionsSpec,
+            metricsSpec,
+            transformSpec,
+            indexSpec,
+            granularitySpec
+        );
+
+    final DataSegment segment1 = DataSegment.builder()
+                                            .dataSource("foo")
+                                            .interval(Intervals.of("2012-01-01/2012-01-02"))
+                                            .version(DateTimes.of("2012-01-01T11:22:33.444Z").toString())
+                                            .shardSpec(getShardSpec(7))
+                                            .size(0)
+                                            .lastCompactionState(compactionState)
+                                            .build();
+
+    final DataSegment segment2 = DataSegment.builder()
+                                            .dataSource("foo")
+                                            .interval(Intervals.of("2012-01-01/2012-01-02"))
+                                            .version(DateTimes.of("2012-01-01T11:22:33.444Z").toString())
+                                            .shardSpec(getShardSpec(7))
+                                            .size(0)
+                                            .build();
+
+    Assert.assertEquals(ImmutableSet.of(segment1), addCompactionStateFunction.apply(ImmutableSet.of(segment2)));
+  }
+
   @Test
   public void testTombstoneType()
   {