From a166b7e18dacc3b6ad40a24fc68065ff66f23cf4 Mon Sep 17 00:00:00 2001 From: Chi Cao Minh Date: Wed, 20 Nov 2019 17:57:18 -0800 Subject: [PATCH 01/17] Parallel indexing single dim partitions Implements single dimension range partitioning for native parallel batch indexing as described in #8769. This initial version requires the druid-datasketches extension to be loaded. The algorithm has 5 phases that are orchestrated by the supervisor in `ParallelIndexSupervisorTask#runRangePartitionMultiPhaseParallel()`. These phases and the main classes involved are described below: 1) In parallel, determine the distribution of dimension values for each input source split. `PartialDimensionDistributionTask` uses `StringSketch` to generate the approximate distribution of dimension values for each input source split. If the rows are ungrouped, `PartialDimensionDistributionTask.UngroupedRowDimensionValueFilter` uses a Bloom filter to skip rows that would be grouped. The final distribution is sent back to the supervisor via `DimensionDistributionReport`. 2) The range partitions are determined. In `ParallelIndexSupervisorTask#determineAllRangePartitions()`, the supervisor uses `StringSketchMerger` to merge the individual `StringSketch`es created in the preceding phase. The merged sketch is then used to create the range partitions. 3) In parallel, generate partial range-partitioned segments. `PartialRangeSegmentGenerateTask` uses the range partitions determined in the preceding phase and `RangePartitionCachingLocalSegmentAllocator` to generate `SingleDimensionShardSpec`s. The partition information is sent back to the supervisor via `GeneratedGenericPartitionsReport`. 4) The partial range segments are grouped. In `ParallelIndexSupervisorTask#groupGenericPartitionLocationsPerPartition()`, the supervisor creates the `PartialGenericSegmentMergeIOConfig`s necessary for the next phase. 5) In parallel, merge partial range-partitioned segments. 
`PartialGenericSegmentMergeTask` uses `GenericPartitionLocation` to retrieve the partial range-partitioned segments generated earlier and then merges and publishes them. --- .../SingleDimensionPartitionsSpec.java | 6 +- docs/ingestion/hadoop.md | 2 +- docs/ingestion/index.md | 2 +- docs/ingestion/native-batch.md | 39 +- indexing-service/pom.xml | 5 + .../task/CachingLocalSegmentAllocator.java | 1 + ...PartitionCachingLocalSegmentAllocator.java | 193 +++++++ .../druid/indexing/common/task/Task.java | 6 + .../parallel/DimensionDistributionReport.java | 68 +++ .../GeneratedGenericPartitionsReport.java | 44 ++ .../parallel/GenericPartitionLocation.java | 62 +++ .../batch/parallel/GenericPartitionStat.java | 91 ++++ .../parallel/ParallelIndexSupervisorTask.java | 190 ++++++- ...onDistributionParallelIndexTaskRunner.java | 124 +++++ .../PartialDimensionDistributionTask.java | 448 +++++++++++++++++ .../PartialGenericSegmentMergeIOConfig.java | 40 ++ ...rtialGenericSegmentMergeIngestionSpec.java | 38 ++ ...icSegmentMergeParallelIndexTaskRunner.java | 111 ++++ .../PartialGenericSegmentMergeTask.java | 116 +++++ ...egmentGenerateParallelIndexTaskRunner.java | 130 +++++ .../PartialRangeSegmentGenerateTask.java | 183 +++++++ .../task/batch/parallel/SubTaskReport.java | 4 +- .../distribution/StringDistribution.java | 66 +++ .../StringDistributionMerger.java | 36 ++ .../parallel/distribution/StringSketch.java | 194 +++++++ .../distribution/StringSketchMerger.java | 52 ++ .../parallel/distribution/TimeDimTuple.java | 86 ++++ .../distribution/TimeDimTupleFactory.java | 47 ++ .../distribution/TimeDimTupleFunnel.java | 38 ++ ...faultIndexTaskInputRowIteratorBuilder.java | 2 + ...itionIndexTaskInputRowIteratorBuilder.java | 102 ++++ .../common/task/IngestionTestBase.java | 5 + ...itionCachingLocalSegmentAllocatorTest.java | 233 +++++++++ .../DimensionDistributionReportTest.java | 55 ++ .../GenericPartitionLocationTest.java | 58 +++ .../parallel/GenericPartitionStatTest.java | 59 +++ 
.../ParallelIndexSupervisorTaskSerdeTest.java | 21 +- .../parallel/ParallelIndexTestingFactory.java | 16 +- .../PartialDimensionDistributionTaskTest.java | 470 +++++++++++++++++ ...artialGenericSegmentMergeIOConfigTest.java | 54 ++ ...lGenericSegmentMergeIngestionSpecTest.java | 68 +++ .../PartialGenericSegmentMergeTaskTest.java | 90 ++++ .../PartialRangeSegmentGenerateTaskTest.java | 151 ++++++ ...rtitionMultiPhaseParallelIndexingTest.java | 472 ++++++++++++++++++ .../distribution/StringSketchMergerTest.java | 80 +++ .../distribution/StringSketchTest.java | 379 ++++++++++++++ .../distribution/TimeDimTupleFactoryTest.java | 66 +++ .../distribution/TimeDimTupleFunnelTest.java | 37 ++ .../distribution/TimeDimTupleTest.java | 71 +++ ...InputRowIteratorBuilderTestingFactory.java | 1 + ...titionTaskInputRowIteratorBuilderTest.java | 74 +++ .../indexer/AbstractITBatchIndexTest.java | 8 +- .../tests/indexer/ITParallelIndexTest.java | 4 +- 53 files changed, 4974 insertions(+), 24 deletions(-) create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/DimensionDistributionReport.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedGenericPartitionsReport.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionLocation.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStat.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionParallelIndexTaskRunner.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java create mode 100644 
indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeIOConfig.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeIngestionSpec.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeParallelIndexTaskRunner.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeTask.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateParallelIndexTaskRunner.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringDistribution.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringDistributionMerger.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketch.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMerger.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTuple.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleFactory.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleFunnel.java create mode 100644 
indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionIndexTaskInputRowIteratorBuilder.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/DimensionDistributionReportTest.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionLocationTest.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStatTest.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeIOConfigTest.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeIngestionSpecTest.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeTaskTest.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTaskTest.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMergerTest.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java create mode 100644 
indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleFactoryTest.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleFunnelTest.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleTest.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionTaskInputRowIteratorBuilderTest.java diff --git a/core/src/main/java/org/apache/druid/indexer/partitions/SingleDimensionPartitionsSpec.java b/core/src/main/java/org/apache/druid/indexer/partitions/SingleDimensionPartitionsSpec.java index d193b30b2cf1..031f160d7655 100644 --- a/core/src/main/java/org/apache/druid/indexer/partitions/SingleDimensionPartitionsSpec.java +++ b/core/src/main/java/org/apache/druid/indexer/partitions/SingleDimensionPartitionsSpec.java @@ -175,7 +175,11 @@ public List getPartitionDimensions() @Override public String getForceGuaranteedRollupIncompatiblityReason() { - return NAME + " partitions unsupported"; + if (getPartitionDimension() == null) { + return PARITION_DIMENSION + " must be specified"; + } + + return FORCE_GUARANTEED_ROLLUP_COMPATIBLE; } @Override diff --git a/docs/ingestion/hadoop.md b/docs/ingestion/hadoop.md index cb86355d4189..81a5ce2e844e 100644 --- a/docs/ingestion/hadoop.md +++ b/docs/ingestion/hadoop.md @@ -366,7 +366,7 @@ The configuration options are: |type|Type of partitionSpec to be used.|"single_dim"| |targetRowsPerSegment|Target number of rows to include in a partition, should be a number that targets segments of 500MB\~1GB.|yes| |targetPartitionSize|Deprecated. Renamed to `targetRowsPerSegment`. Target number of rows to include in a partition, should be a number that targets segments of 500MB\~1GB.|no| -|maxRowsPerSegment|Maximum number of rows to include in a partition. 
Defaults to 50% larger than the `targetPartitionSize`.|no| +|maxRowsPerSegment|Maximum number of rows to include in a partition. Defaults to 50% larger than the `targetRowsPerSegment`.|no| |maxPartitionSize|Deprecated. Use `maxRowsPerSegment` instead. Maximum number of rows to include in a partition. Defaults to 50% larger than the `targetPartitionSize`.|no| |partitionDimension|The dimension to partition on. Leave blank to select a dimension automatically.|no| |assumeGrouped|Assume that input data has already been grouped on time and dimensions. Ingestion will run faster, but may choose sub-optimal partitions if this assumption is violated.|no| diff --git a/docs/ingestion/index.md b/docs/ingestion/index.md index 23b240922e93..54324ab65eea 100644 --- a/docs/ingestion/index.md +++ b/docs/ingestion/index.md @@ -88,7 +88,7 @@ This table compares the three available options: | **Input locations** | Any [firehose](native-batch.md#firehoses). | Any [firehose](native-batch.md#firehoses). | Any Hadoop FileSystem or Druid datasource. | | **File formats** | Text file formats (CSV, TSV, JSON). Support for binary formats is coming in a future release. | Text file formats (CSV, TSV, JSON). Support for binary formats is coming in a future release. | Any Hadoop InputFormat. | | **[Rollup modes](#rollup)** | Perfect if `forceGuaranteedRollup` = true in the [`tuningConfig`](native-batch.md#tuningconfig).| Perfect if `forceGuaranteedRollup` = true in the [`tuningConfig`](native-batch.md#tuningconfig). | Always perfect. | -| **Partitioning options** | Hash-based partitioning is supported when `forceGuaranteedRollup` = true in the [`tuningConfig`](native-batch.md#tuningconfig). | Hash-based partitioning (when `forceGuaranteedRollup` = true). | Hash-based or range-based partitioning via [`partitionsSpec`](hadoop.md#partitionsspec). 
| +| **Partitioning options** | Hash-based partitioning is supported when `forceGuaranteedRollup` = true in the [`tuningConfig`](native-batch.md#tuningconfig). | Hash-based or range-based partitioning (when `forceGuaranteedRollup` = true). | Hash-based or range-based partitioning via [`partitionsSpec`](hadoop.md#partitionsspec). | diff --git a/docs/ingestion/native-batch.md b/docs/ingestion/native-batch.md index 428df3eff2b1..e11986ed7a49 100644 --- a/docs/ingestion/native-batch.md +++ b/docs/ingestion/native-batch.md @@ -54,7 +54,7 @@ each sub task creates segments individually and reports them to the supervisor t If `forceGuaranteedRollup` = true, it's executed in two phases with data shuffle which is similar to [MapReduce](https://en.wikipedia.org/wiki/MapReduce). In the first phase, each sub task partitions input data based on `segmentGranularity` (primary partition key) in `granularitySpec` -and `partitionDimensions` (secondary partition key) in `partitionsSpec`. The partitioned data is served by +and `partitionDimension` or `partitionDimensions` (secondary partition key) in `partitionsSpec`. The partitioned data is served by the [middleManager](../design/middlemanager.md) or the [indexer](../design/indexer.md) where the first phase tasks ran. In the second phase, each sub task fetches partitioned data from MiddleManagers or indexers and merges them to create the final segments. @@ -205,13 +205,13 @@ The tuningConfig is optional and default parameters will be used if no tuningCon |maxRowsInMemory|Used in determining when intermediate persists to disk should occur. Normally user does not need to set this, but depending on the nature of data, if rows are short in terms of bytes, user may not want to store a million rows in memory and this value should be set.|1000000|no| |maxBytesInMemory|Used in determining when intermediate persists to disk should occur. Normally this is computed internally and user does not need to set it. 
This value represents number of bytes to aggregate in heap memory before persisting. This is based on a rough estimate of memory usage and not actual usage. The maximum heap memory usage for indexing is maxBytesInMemory * (2 + maxPendingPersists)|1/6 of max JVM memory|no| |maxTotalRows|Deprecated. Use `partitionsSpec` instead. Total number of rows in segments waiting for being pushed. Used in determining when intermediate pushing should occur.|20000000|no| -|numShards|Deprecated. Use `partitionsSpec` instead. Directly specify the number of shards to create. If this is specified and `intervals` is specified in the `granularitySpec`, the index task can skip the determine intervals/partitions pass through the data. `numShards` cannot be specified if `maxRowsPerSegment` is set.|null|no| +|numShards|Deprecated. Use `partitionsSpec` instead. Directly specify the number of shards to create when using a `hashed` `partitionsSpec`. If this is specified and `intervals` is specified in the `granularitySpec`, the index task can skip the determine intervals/partitions pass through the data. `numShards` cannot be specified if `maxRowsPerSegment` is set.|null|no| |splitHintSpec|Used to give a hint to control the amount of data that each first phase task reads. This hint could be ignored depending on the implementation of firehose. 
See [SplitHintSpec](#splithintspec) for more details.|null|no| -|partitionsSpec|Defines how to partition data in each timeChunk, see [PartitionsSpec](#partitionsspec)|`dynamic` if `forceGuaranteedRollup` = false, `hashed` if `forceGuaranteedRollup` = true|no| +|partitionsSpec|Defines how to partition data in each timeChunk, see [PartitionsSpec](#partitionsspec)|`dynamic` if `forceGuaranteedRollup` = false, `hashed` or `single_dim` if `forceGuaranteedRollup` = true|no| |indexSpec|Defines segment storage format options to be used at indexing time, see [IndexSpec](index.md#indexspec)|null|no| |indexSpecForIntermediatePersists|Defines segment storage format options to be used at indexing time for intermediate persisted temporary segments. this can be used to disable dimension/metric compression on intermediate segments to reduce memory required for final merging. however, disabling compression on intermediate segments might increase page cache use while they are used before getting merged into final segment published, see [IndexSpec](index.md#indexspec) for possible values.|same as indexSpec|no| |maxPendingPersists|Maximum number of persists that can be pending but not started. If this limit would be exceeded by a new intermediate persist, ingestion will block until the currently-running persist finishes. Maximum heap memory usage for indexing scales with maxRowsInMemory * (2 + maxPendingPersists).|0 (meaning one persist can be running concurrently with ingestion, and none can be queued up)|no| -|forceGuaranteedRollup|Forces guaranteeing the [perfect rollup](../ingestion/index.md#rollup). The perfect rollup optimizes the total size of generated segments and querying time while indexing time will be increased. If this is set to true, `numShards` in `tuningConfig` and `intervals` in `granularitySpec` must be set. Note that the result segments would be hash-partitioned. This flag cannot be used with `appendToExisting` of IOConfig. 
For more details, see the below __Segment pushing modes__ section.|false|no| +|forceGuaranteedRollup|Forces guaranteeing the [perfect rollup](../ingestion/index.md#rollup). The perfect rollup optimizes the total size of generated segments and querying time while indexing time will be increased. If this is set to true, `intervals` in `granularitySpec` must be set and `hashed` or `single_dim` must be used for `partitionsSpec`. This flag cannot be used with `appendToExisting` of IOConfig. For more details, see the below __Segment pushing modes__ section.|false|no| +|reportParseExceptions|If true, exceptions encountered during parsing will be thrown and will halt ingestion; if false, unparseable rows and fields will be skipped.|false|no| +|pushTimeout|Milliseconds to wait for pushing segments. It must be >= 0, where 0 means to wait forever.|0|no| +|segmentWriteOutMediumFactory|Segment write-out medium to use when creating segments. See [SegmentWriteOutMediumFactory](#segmentwriteoutmediumfactory).|Not specified, the value from `druid.peon.defaultSegmentWriteOutMediumFactory.type` is used|no| @@ -241,18 +241,37 @@ Currently only one splitHintSpec, i.e., `segments`, is available. ### `partitionsSpec` -PartitionsSpec is to describe the secondary partitioning method. +PartitionsSpec is used to describe the secondary partitioning method. You should use different partitionsSpec depending on the [rollup mode](../ingestion/index.md#rollup) you want. -For perfect rollup, you should use `hashed`. +For perfect rollup, you should use either `hashed` (partitioning based on the hash of dimensions in each row) or +`single_dim` (based on ranges of a single dimension). For best-effort rollup, you should use `dynamic`. + +Hashed partitioning is recommended in most cases, as it will improve indexing performance and create more uniformly +sized data segments relative to single-dimension or dynamic partitioning. 
+ +#### Hash-based partitioning |property|description|default|required?| |--------|-----------|-------|---------| |type|This should always be `hashed`|none|yes| -|targetRowsPerSegment|Target number of rows to include in a partition, should be a number that targets segments of 500MB\~1GB.|5000000 (if `numShards` is not set)|either this or `numShards`| -|numShards|Directly specify the number of shards to create. If this is specified and `intervals` is specified in the `granularitySpec`, the index task can skip the determine intervals/partitions pass through the data. `numShards` cannot be specified if `targetRowsPerSegment` is set.|null|no| -|partitionDimensions|The dimensions to partition on. Leave blank to select all dimensions. Only used with `numShards`, will be ignored when `targetRowsPerSegment` is set.|null|no| +|numShards|Directly specify the number of shards to create. If this is specified and `intervals` is specified in the `granularitySpec`, the index task can skip the determine intervals/partitions pass through the data. `numShards` cannot be specified if `targetRowsPerSegment` is set.|null|yes| +|partitionDimensions|The dimensions to partition on. Leave blank to select all dimensions.|null|no| -For best-effort rollup, you should use `dynamic`. +#### Single-dimension range partitioning + +> Single-dimension range partitioning currently requires the +> [druid-datasketches](../development/extensions-core/datasketches-extension.md) +> extension to be loaded. + +|property|description|default|required?| +|--------|-----------|-------|---------| +|type|This should always be `single_dim`|none|yes| +|targetRowsPerSegment|Target number of rows to include in a partition, should be a number that targets segments of 500MB\~1GB.|none|either this or `maxRowsPerSegment`| +|maxRowsPerSegment|Maximum number of rows to include in a partition. 
Defaults to 50% larger than the `targetRowsPerSegment`.|none|either this or `targetRowsPerSegment`| +|partitionDimension|The dimension to partition on.|none|yes| +|assumeGrouped|Assume that input data has already been grouped on time and dimensions. Ingestion will run faster, but may choose sub-optimal partitions if this assumption is violated.|false|no| + +#### Dynamic partitioning |property|description|default|required?| |--------|-----------|-------|---------| diff --git a/indexing-service/pom.xml b/indexing-service/pom.xml index 2a71a9a72c56..c55e2de46827 100644 --- a/indexing-service/pom.xml +++ b/indexing-service/pom.xml @@ -199,6 +199,11 @@ it.unimi.dsi fastutil + + com.yahoo.datasketches + sketches-core + provided + diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CachingLocalSegmentAllocator.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CachingLocalSegmentAllocator.java index 21157bf13957..fbb9081aafa3 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CachingLocalSegmentAllocator.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CachingLocalSegmentAllocator.java @@ -43,6 +43,7 @@ * Allocates all necessary segments locally at the beginning and reuses them. 
* * @see HashPartitionCachingLocalSegmentAllocator + * @see RangePartitionCachingLocalSegmentAllocator */ class CachingLocalSegmentAllocator implements IndexTaskSegmentAllocator { diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java new file mode 100644 index 000000000000..15c9b56c60d5 --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task; + +import com.google.common.collect.Maps; +import org.apache.druid.data.input.InputRow; +import org.apache.druid.indexing.common.TaskToolbox; +import org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec; +import org.apache.druid.timeline.partition.SingleDimensionShardSpec; +import org.joda.time.Interval; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Allocates all necessary range-partitioned segments locally at the beginning and reuses them. + * + * @see CachingLocalSegmentAllocator + */ +public class RangePartitionCachingLocalSegmentAllocator implements IndexTaskSegmentAllocator +{ + private final String dataSource; + private final String partitionDimension; + private final Map intervalsToPartitions; + private final IndexTaskSegmentAllocator delegate; + + public RangePartitionCachingLocalSegmentAllocator( + TaskToolbox toolbox, + String taskId, + String dataSource, + String partitionDimension, + Map intervalsToPartitions + ) throws IOException + { + this.dataSource = dataSource; + this.partitionDimension = partitionDimension; + this.intervalsToPartitions = intervalsToPartitions; + + this.delegate = new CachingLocalSegmentAllocator( + toolbox, + taskId, + this::getIntervalToSegmentIds + ); + } + + private Map> getIntervalToSegmentIds(Function versionFinder) + { + Map> intervalToSegmentIds = + Maps.newHashMapWithExpectedSize(intervalsToPartitions.size()); + + intervalsToPartitions.forEach( + (interval, partitions) -> + intervalToSegmentIds.put( + interval, + translatePartitions(interval, partitions, versionFinder) + ) + ); + + return intervalToSegmentIds; + } + + private List translatePartitions( + Interval interval, + String[] 
partitions, + Function versionFinder + ) + { + if (partitions.length == 0) { + return Collections.emptyList(); + } + + String[] uniquePartitions = Arrays.stream(partitions).distinct().toArray(String[]::new); + int numUniquePartition = uniquePartitions.length; + + if (numUniquePartition == 1) { + return Collections.singletonList( + createLastSegmentIdWithShardSpec( + interval, + versionFinder.apply(interval), + uniquePartitions[0], + 0 + ) + ); + } + + if (isLastPartitionOnlyMaxValue(partitions)) { + // The last partition only contains the max value. A shard that just contains the max value is likely to be + // small, so combine it with the second to last one. + numUniquePartition -= 1; + } + + List segmentIds = + IntStream.range(0, numUniquePartition - 1) + .mapToObj(i -> createSegmentIdWithShardSpec( + interval, + versionFinder.apply(interval), + uniquePartitions[i], + uniquePartitions[i + 1], + i + )) + .collect(Collectors.toCollection(ArrayList::new)); + segmentIds.add( + createLastSegmentIdWithShardSpec( + interval, + versionFinder.apply(interval), + uniquePartitions[numUniquePartition - 1], + segmentIds.size() + ) + ); + + return segmentIds; + } + + private boolean isLastPartitionOnlyMaxValue(String[] partitions) + { + String lastPartition = partitions[partitions.length - 1]; + String secondToLastPartition = partitions[partitions.length - 2]; + return !lastPartition.equals(secondToLastPartition); + } + + private SegmentIdWithShardSpec createLastSegmentIdWithShardSpec( + Interval interval, + String version, + String partitionStart, + int partitionNum + ) + { + return createSegmentIdWithShardSpec(interval, version, partitionStart, null, partitionNum); + } + + private SegmentIdWithShardSpec createSegmentIdWithShardSpec( + Interval interval, + String version, + String partitionStart, + @Nullable String partitionEnd, + int partitionNum + ) + { + return new SegmentIdWithShardSpec( + dataSource, + interval, + version, + new SingleDimensionShardSpec( + 
partitionDimension, + partitionStart, + partitionEnd, + partitionNum + ) + ); + } + + @Override + public String getSequenceName(Interval interval, InputRow inputRow) + { + return delegate.getSequenceName(interval, inputRow); + } + + @Override + public SegmentIdWithShardSpec allocate( + InputRow row, + String sequenceName, + String previousSegmentId, + boolean skipSegmentLineageCheck + ) throws IOException + { + return delegate.allocate(row, sequenceName, previousSegmentId, skipSegmentLineageCheck); + } +} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/Task.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/Task.java index e2857d1fc553..a5db7586439b 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/Task.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/Task.java @@ -28,8 +28,11 @@ import org.apache.druid.indexing.common.config.TaskConfig; import org.apache.druid.indexing.common.task.batch.parallel.LegacySinglePhaseSubTask; import org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexSupervisorTask; +import org.apache.druid.indexing.common.task.batch.parallel.PartialDimensionDistributionTask; +import org.apache.druid.indexing.common.task.batch.parallel.PartialGenericSegmentMergeTask; import org.apache.druid.indexing.common.task.batch.parallel.PartialHashSegmentGenerateTask; import org.apache.druid.indexing.common.task.batch.parallel.PartialHashSegmentMergeTask; +import org.apache.druid.indexing.common.task.batch.parallel.PartialRangeSegmentGenerateTask; import org.apache.druid.indexing.common.task.batch.parallel.SinglePhaseSubTask; import org.apache.druid.query.Query; import org.apache.druid.query.QueryRunner; @@ -60,6 +63,9 @@ @Type(name = SinglePhaseSubTask.OLD_TYPE_NAME, value = LegacySinglePhaseSubTask.class), // for backward compatibility @Type(name = PartialHashSegmentGenerateTask.TYPE, value = 
PartialHashSegmentGenerateTask.class), @Type(name = PartialHashSegmentMergeTask.TYPE, value = PartialHashSegmentMergeTask.class), + @Type(name = PartialRangeSegmentGenerateTask.TYPE, value = PartialRangeSegmentGenerateTask.class), + @Type(name = PartialDimensionDistributionTask.TYPE, value = PartialDimensionDistributionTask.class), + @Type(name = PartialGenericSegmentMergeTask.TYPE, value = PartialGenericSegmentMergeTask.class), @Type(name = "index_hadoop", value = HadoopIndexTask.class), @Type(name = "index_realtime", value = RealtimeIndexTask.class), @Type(name = "index_realtime_appenderator", value = AppenderatorDriverRealtimeIndexTask.class), diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/DimensionDistributionReport.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/DimensionDistributionReport.java new file mode 100644 index 000000000000..a2e6dd0c476d --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/DimensionDistributionReport.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution; +import org.joda.time.Interval; + +import java.util.Map; + +public class DimensionDistributionReport implements SubTaskReport +{ + static final String TYPE = "dimension_distribution"; + private static final String PROP_DISTRIBUTIONS = "distributions"; + + private final String taskId; + private final Map intervalToDistribution; + + @JsonCreator + public DimensionDistributionReport( + @JsonProperty("taskId") String taskId, + @JsonProperty(PROP_DISTRIBUTIONS) Map intervalToDistribution + ) + { + this.taskId = taskId; + this.intervalToDistribution = intervalToDistribution; + } + + @Override + @JsonProperty + public String getTaskId() + { + return taskId; + } + + @JsonProperty(PROP_DISTRIBUTIONS) + public Map getIntervalToDistribution() + { + return intervalToDistribution; + } + + @Override + public String toString() + { + return "DimensionDistributionReport{" + + "taskId='" + taskId + '\'' + + ", intervalToDistribution=" + intervalToDistribution + + '}'; + } +} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedGenericPartitionsReport.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedGenericPartitionsReport.java new file mode 100644 index 000000000000..0f6570505003 --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedGenericPartitionsReport.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.common.task.batch.parallel; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +import java.util.List; + +/** + * Report containing the {@link GenericPartitionStat}s created by a {@link PartialSegmentGenerateTask}. + * This report is collected by {@link ParallelIndexSupervisorTask} and + * used to generate {@link PartialGenericSegmentMergeIOConfig}. + */ +class GeneratedGenericPartitionsReport extends GeneratedPartitionsReport implements SubTaskReport +{ + public static final String TYPE = "generated_generic_partitions"; + + @JsonCreator + GeneratedGenericPartitionsReport( + @JsonProperty("taskId") String taskId, + @JsonProperty("partitionStats") List partitionStats + ) + { + super(taskId, partitionStats); + } +} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionLocation.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionLocation.java new file mode 100644 index 000000000000..23bb69a3d525 --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionLocation.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.common.task.batch.parallel; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonProperty; +import org.apache.druid.timeline.partition.ShardSpec; +import org.joda.time.Interval; + +/** + * This class represents the intermediary data server where the partition of {@link #interval} and {@link #shardSpec} + * is stored. 
+ */ +public class GenericPartitionLocation extends PartitionLocation +{ + private final ShardSpec shardSpec; + + @JsonCreator + public GenericPartitionLocation( + @JsonProperty("host") String host, + @JsonProperty("port") int port, + @JsonProperty("useHttps") boolean useHttps, + @JsonProperty("subTaskId") String subTaskId, + @JsonProperty("interval") Interval interval, + @JsonProperty("shardSpec") ShardSpec shardSpec + ) + { + super(host, port, useHttps, subTaskId, interval, shardSpec); + this.shardSpec = shardSpec; + } + + @JsonIgnore + @Override + public int getPartitionId() + { + return shardSpec.getPartitionNum(); + } + + @JsonProperty + ShardSpec getShardSpec() + { + return shardSpec; + } +} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStat.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStat.java new file mode 100644 index 000000000000..04a98c284476 --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStat.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import org.apache.druid.timeline.partition.ShardSpec; +import org.joda.time.Interval; + +import javax.annotation.Nullable; +import java.util.Objects; + +/** + * Statistics about a partition created by {@link PartialSegmentGenerateTask}. Each partition is a set of data + * of the same time chunk (primary partition key) and the same {@link ShardSpec} (secondary partition key). This class + * holds the statistics of a single partition created by a task. + */ +public class GenericPartitionStat extends PartitionStat +{ + private static final String PROP_SHARD_SPEC = "shardSpec"; + + // Secondary partition key + private final ShardSpec shardSpec; + + @JsonCreator + public GenericPartitionStat( + @JsonProperty("taskExecutorHost") String taskExecutorHost, + @JsonProperty("taskExecutorPort") int taskExecutorPort, + @JsonProperty("useHttps") boolean useHttps, + @JsonProperty("interval") Interval interval, + @JsonProperty(PROP_SHARD_SPEC) ShardSpec shardSpec, + @JsonProperty("numRows") @Nullable Integer numRows, + @JsonProperty("sizeBytes") @Nullable Long sizeBytes + ) + { + super(taskExecutorHost, taskExecutorPort, useHttps, interval, numRows, sizeBytes); + this.shardSpec = shardSpec; + } + + @Override + public int getPartitionId() + { + return shardSpec.getPartitionNum(); + } + + @JsonProperty(PROP_SHARD_SPEC) + @Override + ShardSpec getSecondaryPartition() + { + return shardSpec; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + if (!super.equals(o)) { + return false; + } + GenericPartitionStat that = (GenericPartitionStat) o; + return Objects.equals(shardSpec, that.shardSpec); + } + + @Override + public int hashCode() + { + return Objects.hash(super.hashCode(), shardSpec); + } +} 
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java index 28bfc7c421b1..d2e94a675e9b 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java @@ -27,6 +27,8 @@ import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.google.common.base.Throwables; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; import org.apache.druid.client.indexing.IndexingServiceClient; import org.apache.druid.data.input.FiniteFirehoseFactory; import org.apache.druid.data.input.InputFormat; @@ -36,6 +38,7 @@ import org.apache.druid.indexer.TaskStatus; import org.apache.druid.indexer.partitions.HashedPartitionsSpec; import org.apache.druid.indexer.partitions.PartitionsSpec; +import org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec; import org.apache.druid.indexing.appenderator.ActionBasedUsedSegmentChecker; import org.apache.druid.indexing.common.Counters; import org.apache.druid.indexing.common.TaskLock; @@ -56,6 +59,10 @@ import org.apache.druid.indexing.common.task.TaskResource; import org.apache.druid.indexing.common.task.Tasks; import org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexTaskRunner.SubTaskSpecStatus; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistributionMerger; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketchMerger; import 
org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.Pair; @@ -75,6 +82,7 @@ import org.apache.druid.server.security.AuthorizerMapper; import org.apache.druid.timeline.DataSegment; import org.apache.druid.timeline.partition.NumberedShardSpec; +import org.apache.druid.utils.CollectionUtils; import org.checkerframework.checker.nullness.qual.MonotonicNonNull; import org.joda.time.DateTime; import org.joda.time.Interval; @@ -93,6 +101,7 @@ import javax.ws.rs.core.Response.Status; import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -298,6 +307,36 @@ PartialHashSegmentGenerateParallelIndexTaskRunner createPartialHashSegmentGenera ); } + @VisibleForTesting + PartialDimensionDistributionParallelIndexTaskRunner createPartialDimensionDistributionRunner(TaskToolbox toolbox) + { + return new PartialDimensionDistributionParallelIndexTaskRunner( + toolbox, + getId(), + getGroupId(), + ingestionSchema, + getContext(), + indexingServiceClient + ); + } + + @VisibleForTesting + PartialRangeSegmentGenerateParallelIndexTaskRunner createPartialRangeSegmentGenerateRunner( + TaskToolbox toolbox, + Map intervalToPartitions + ) + { + return new PartialRangeSegmentGenerateParallelIndexTaskRunner( + toolbox, + getId(), + getGroupId(), + ingestionSchema, + getContext(), + indexingServiceClient, + intervalToPartitions + ); + } + @VisibleForTesting PartialHashSegmentMergeParallelIndexTaskRunner createPartialHashSegmentMergeRunner( TaskToolbox toolbox, @@ -316,6 +355,24 @@ PartialHashSegmentMergeParallelIndexTaskRunner createPartialHashSegmentMergeRunn ); } + @VisibleForTesting + PartialGenericSegmentMergeParallelIndexTaskRunner createPartialGenericSegmentMergeRunner( + TaskToolbox toolbox, + List ioConfigs + ) + { + return new PartialGenericSegmentMergeParallelIndexTaskRunner( + toolbox, + getId(), + 
getGroupId(), + getIngestionSchema().getDataSchema(), + ioConfigs, + getIngestionSchema().getTuningConfig(), + getContext(), + indexingServiceClient + ); + } + @Override public boolean isReady(TaskActionClient taskActionClient) throws Exception { @@ -471,11 +528,9 @@ private TaskStatus runSinglePhaseParallel(TaskToolbox toolbox) throws Exception */ private TaskStatus runMultiPhaseParallel(TaskToolbox toolbox) throws Exception { - if (useHashPartitions()) { - return runHashPartitionMultiPhaseParallel(toolbox); - } else { - throw new UnsupportedOperationException("hash partition required"); - } + return useHashPartitions() + ? runHashPartitionMultiPhaseParallel(toolbox) + : runRangePartitionMultiPhaseParallel(toolbox); } private boolean useHashPartitions() @@ -519,6 +574,101 @@ private TaskStatus runHashPartitionMultiPhaseParallel(TaskToolbox toolbox) throw return TaskStatus.fromCode(getId(), state); } + private TaskStatus runRangePartitionMultiPhaseParallel(TaskToolbox toolbox) throws Exception + { + assertDataSketchesAvailable(); + + ParallelIndexTaskRunner distributionRunner = + createRunner( + toolbox, + this::createPartialDimensionDistributionRunner + ); + + TaskState distributionState = runNextPhase(distributionRunner); + if (distributionState.isFailure()) { + return TaskStatus.failure(getId()); + } + + Map intervalToPartitions = + determineAllRangePartitions(distributionRunner.getReports().values()); + + if (intervalToPartitions.isEmpty()) { + String msg = "No valid rows for single dimension partitioning." 
+ + " All rows may have invalid timestamps or multiple dimension values."; + LOG.warn(msg); + return TaskStatus.success(getId(), msg); + } + + ParallelIndexTaskRunner> indexingRunner = + createRunner(toolbox, tb -> createPartialRangeSegmentGenerateRunner(tb, intervalToPartitions)); + + TaskState indexingState = runNextPhase(indexingRunner); + if (indexingState.isFailure()) { + return TaskStatus.failure(getId()); + } + + // partition (interval, partitionId) -> partition locations + Map, List> partitionToLocations = + groupGenericPartitionLocationsPerPartition(indexingRunner.getReports()); + final List ioConfigs = createGenericMergeIOConfigs( + ingestionSchema.getTuningConfig().getTotalNumMergeTasks(), + partitionToLocations + ); + + ParallelIndexTaskRunner mergeRunner = createRunner( + toolbox, + tb -> createPartialGenericSegmentMergeRunner(tb, ioConfigs) + ); + TaskState mergeState = runNextPhase(mergeRunner); + if (mergeState.isSuccess()) { + publishSegments(toolbox, mergeRunner.getReports()); + } + + return TaskStatus.fromCode(getId(), mergeState); + } + + private static void assertDataSketchesAvailable() + { + try { + //noinspection ResultOfObjectAllocationIgnored + new StringSketch(); + } + catch (Exception e) { + throw new ISE(e, "DataSketches is unavailable.
Try loading the druid-datasketches extension."); + } + } + + private Map determineAllRangePartitions(Collection reports) + { + Multimap intervalToDistributions = ArrayListMultimap.create(); + reports.forEach(report -> { + Map intervalToDistribution = report.getIntervalToDistribution(); + intervalToDistribution.forEach(intervalToDistributions::put); + }); + + return CollectionUtils.mapValues(intervalToDistributions.asMap(), this::determineRangePartition); + } + + private String[] determineRangePartition(Collection distributions) + { + StringDistributionMerger distributionMerger = new StringSketchMerger(); + distributions.forEach(distributionMerger::merge); + StringDistribution mergedDistribution = distributionMerger.getResult(); + + SingleDimensionPartitionsSpec partitionsSpec = + (SingleDimensionPartitionsSpec) ingestionSchema.getTuningConfig().getGivenOrDefaultPartitionsSpec(); + + final String[] partitions; + Integer targetRowsPerSegment = partitionsSpec.getTargetRowsPerSegment(); + if (targetRowsPerSegment == null) { + partitions = mergedDistribution.getEvenPartitionsByMaxSize(partitionsSpec.getMaxRowsPerSegment()); + } else { + partitions = mergedDistribution.getEvenPartitionsByTargetSize(targetRowsPerSegment); + } + + return partitions; + } + private static Map, List> groupHashPartitionLocationsPerPartition( Map subTaskIdToReport ) @@ -537,6 +687,24 @@ private static Map, List> groupHa return groupPartitionLocationsPerPartition(subTaskIdToReport, createPartitionLocationFunction); } + private static Map, List> groupGenericPartitionLocationsPerPartition( + Map> subTaskIdToReport + ) + { + BiFunction createPartitionLocationFunction = + (subtaskId, partitionStat) -> + new GenericPartitionLocation( + partitionStat.getTaskExecutorHost(), + partitionStat.getTaskExecutorPort(), + partitionStat.isUseHttps(), + subtaskId, + partitionStat.getInterval(), + partitionStat.getSecondaryPartition() + ); + + return groupPartitionLocationsPerPartition(subTaskIdToReport, 
createPartitionLocationFunction); + } + private static Map, List> groupPartitionLocationsPerPartition( Map> subTaskIdToReport, @@ -572,6 +740,18 @@ private static List createHashMergeIOConfigs( ); } + private static List createGenericMergeIOConfigs( + int totalNumMergeTasks, + Map, List> partitionToLocations + ) + { + return createMergeIOConfigs( + totalNumMergeTasks, + partitionToLocations, + PartialGenericSegmentMergeIOConfig::new + ); + } + private static List createMergeIOConfigs( int totalNumMergeTasks, Map, List> partitionToLocations, diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionParallelIndexTaskRunner.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionParallelIndexTaskRunner.java new file mode 100644 index 000000000000..239976b77caa --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionParallelIndexTaskRunner.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.druid.client.indexing.IndexingServiceClient; +import org.apache.druid.data.input.InputSplit; +import org.apache.druid.indexing.common.TaskToolbox; +import org.apache.druid.indexing.common.task.IndexTaskClientFactory; + +import java.util.Map; + +/** + * {@link ParallelIndexTaskRunner} for the phase to determine distribution of dimension values in + * multi-phase parallel indexing. + */ +class PartialDimensionDistributionParallelIndexTaskRunner + extends InputSourceSplitParallelIndexTaskRunner +{ + // For tests + private final IndexTaskClientFactory taskClientFactory; + + PartialDimensionDistributionParallelIndexTaskRunner( + TaskToolbox toolbox, + String taskId, + String groupId, + ParallelIndexIngestionSpec ingestionSchema, + Map context, + IndexingServiceClient indexingServiceClient + ) + { + this( + toolbox, + taskId, + groupId, + ingestionSchema, + context, + indexingServiceClient, + null + ); + } + + @VisibleForTesting + PartialDimensionDistributionParallelIndexTaskRunner( + TaskToolbox toolbox, + String taskId, + String groupId, + ParallelIndexIngestionSpec ingestionSchema, + Map context, + IndexingServiceClient indexingServiceClient, + IndexTaskClientFactory taskClientFactory + ) + { + super( + toolbox, + taskId, + groupId, + ingestionSchema, + context, + indexingServiceClient + ); + this.taskClientFactory = taskClientFactory; + } + + @Override + public String getName() + { + return PartialDimensionDistributionTask.TYPE; + } + + @Override + SubTaskSpec createSubTaskSpec( + String id, + String groupId, + String supervisorTaskId, + Map context, + InputSplit split, + ParallelIndexIngestionSpec subTaskIngestionSpec, + IndexingServiceClient indexingServiceClient + ) + { + return new SubTaskSpec( + id, + groupId, + supervisorTaskId, + context, + split + ) + { + @Override + public PartialDimensionDistributionTask 
newSubTask(int numAttempts) + { + return new PartialDimensionDistributionTask( + null, + getGroupId(), + null, + getSupervisorTaskId(), + numAttempts, + subTaskIngestionSpec, + getContext(), + getIndexingServiceClient(), + taskClientFactory + ); + } + }; + } +} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java new file mode 100644 index 000000000000..508b5c8615ed --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java @@ -0,0 +1,448 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel; + +import com.fasterxml.jackson.annotation.JacksonInject; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.common.hash.BloomFilter; +import org.apache.druid.client.indexing.IndexingServiceClient; +import org.apache.druid.data.input.HandlingInputRowIterator; +import org.apache.druid.data.input.InputFormat; +import org.apache.druid.data.input.InputRow; +import org.apache.druid.data.input.InputRowSchema; +import org.apache.druid.data.input.InputSource; +import org.apache.druid.data.input.InputSourceReader; +import org.apache.druid.indexer.TaskStatus; +import org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec; +import org.apache.druid.indexing.common.TaskToolbox; +import org.apache.druid.indexing.common.actions.TaskActionClient; +import org.apache.druid.indexing.common.task.ClientBasedTaskInfoProvider; +import org.apache.druid.indexing.common.task.IndexTaskClientFactory; +import org.apache.druid.indexing.common.task.TaskResource; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.TimeDimTuple; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.TimeDimTupleFactory; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.TimeDimTupleFunnel; +import org.apache.druid.indexing.common.task.batch.parallel.iterator.IndexTaskInputRowIteratorBuilder; +import org.apache.druid.indexing.common.task.batch.parallel.iterator.RangePartitionIndexTaskInputRowIteratorBuilder; +import org.apache.druid.java.util.common.granularity.Granularity; +import 
org.apache.druid.java.util.common.logger.Logger; +import org.apache.druid.java.util.common.parsers.CloseableIterator; +import org.apache.druid.java.util.common.parsers.ParseException; +import org.apache.druid.query.aggregation.AggregatorFactory; +import org.apache.druid.segment.indexing.DataSchema; +import org.apache.druid.segment.indexing.granularity.GranularitySpec; +import org.joda.time.DateTime; +import org.joda.time.Interval; + +import javax.annotation.Nullable; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +/** + * The worker task of {@link PartialDimensionDistributionParallelIndexTaskRunner}. This task + * determines the distribution of dimension values of input data. + */ + +public class PartialDimensionDistributionTask extends PerfectRollupWorkerTask +{ + public static final String TYPE = "partial_dimension_distribution"; + private static final Logger LOG = new Logger(PartialDimensionDistributionTask.class); + + private final int numAttempts; + private final ParallelIndexIngestionSpec ingestionSchema; + private final String supervisorTaskId; + private final IndexingServiceClient indexingServiceClient; + private final IndexTaskClientFactory taskClientFactory; + + // For testing + private final Supplier ungroupedRowDimValueFilterSupplier; + + @JsonCreator + PartialDimensionDistributionTask( + // id shouldn't be null except when this task is created by ParallelIndexSupervisorTask + @JsonProperty("id") @Nullable String id, + @JsonProperty("groupId") final String groupId, + @JsonProperty("resource") final TaskResource taskResource, + @JsonProperty("supervisorTaskId") final String supervisorTaskId, + @JsonProperty("numAttempts") final int numAttempts, // zero-based counting + @JsonProperty("spec") final ParallelIndexIngestionSpec ingestionSchema, + @JsonProperty("context") final Map context, + @JacksonInject IndexingServiceClient 
indexingServiceClient, + @JacksonInject IndexTaskClientFactory taskClientFactory + ) + { + this( + id, + groupId, + taskResource, + supervisorTaskId, + numAttempts, + ingestionSchema, + context, + indexingServiceClient, + taskClientFactory, + () -> new UngroupedRowDimensionValueFilter( + ingestionSchema.getDataSchema().getGranularitySpec().getQueryGranularity() + ) + ); + } + + @VisibleForTesting // Only for testing + PartialDimensionDistributionTask( + @Nullable String id, + final String groupId, + final TaskResource taskResource, + final String supervisorTaskId, + final int numAttempts, + final ParallelIndexIngestionSpec ingestionSchema, + final Map context, + IndexingServiceClient indexingServiceClient, + IndexTaskClientFactory taskClientFactory, + Supplier ungroupedRowDimValueFilterSupplier + ) + { + super( + getOrMakeId(id, TYPE, ingestionSchema.getDataSchema().getDataSource()), + groupId, + taskResource, + ingestionSchema.getDataSchema(), + ingestionSchema.getTuningConfig(), + context + ); + + Preconditions.checkArgument( + ingestionSchema.getTuningConfig().getPartitionsSpec() instanceof SingleDimensionPartitionsSpec, + "%s partitionsSpec required", + SingleDimensionPartitionsSpec.NAME + ); + + this.numAttempts = numAttempts; + this.ingestionSchema = ingestionSchema; + this.supervisorTaskId = supervisorTaskId; + this.indexingServiceClient = indexingServiceClient; + this.taskClientFactory = taskClientFactory; + this.ungroupedRowDimValueFilterSupplier = ungroupedRowDimValueFilterSupplier; + } + + @JsonProperty + private int getNumAttempts() + { + return numAttempts; + } + + @JsonProperty("spec") + private ParallelIndexIngestionSpec getIngestionSchema() + { + return ingestionSchema; + } + + @JsonProperty + private String getSupervisorTaskId() + { + return supervisorTaskId; + } + + @Override + public String getType() + { + return TYPE; + } + + @Override + public boolean isReady(TaskActionClient taskActionClient) throws Exception + { + return tryTimeChunkLock( + 
taskActionClient, + getIngestionSchema().getDataSchema().getGranularitySpec().inputIntervals() + ); + } + + @Override + public TaskStatus runTask(TaskToolbox toolbox) throws Exception + { + DataSchema dataSchema = ingestionSchema.getDataSchema(); + GranularitySpec granularitySpec = dataSchema.getGranularitySpec(); + ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig(); + + SingleDimensionPartitionsSpec partitionsSpec = (SingleDimensionPartitionsSpec) tuningConfig.getPartitionsSpec(); + Preconditions.checkNotNull(partitionsSpec); + String partitionDimension = partitionsSpec.getPartitionDimension(); + Preconditions.checkNotNull(partitionDimension, "partitionDimension required"); + boolean isAssumeGrouped = partitionsSpec.isAssumeGrouped(); + + InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource( + ingestionSchema.getDataSchema().getParser() + ); + List metricsNames = Arrays.stream(dataSchema.getAggregators()) + .map(AggregatorFactory::getName) + .collect(Collectors.toList()); + InputFormat inputFormat = ParallelIndexSupervisorTask.getInputFormat(ingestionSchema); + InputSourceReader inputSourceReader = dataSchema.getTransformSpec().decorate( + inputSource.reader( + new InputRowSchema( + dataSchema.getTimestampSpec(), + dataSchema.getDimensionsSpec(), + metricsNames + ), + inputFormat, + null + ) + ); + + try ( + CloseableIterator inputRowIterator = inputSourceReader.read(); + HandlingInputRowIterator iterator = new RangePartitionIndexTaskInputRowIteratorBuilder(partitionDimension) + .delegate(inputRowIterator) + .granularitySpec(granularitySpec) + .nullRowRunnable(IndexTaskInputRowIteratorBuilder.NOOP_RUNNABLE) + .absentBucketIntervalConsumer(IndexTaskInputRowIteratorBuilder.NOOP_CONSUMER) + .build() + ) { + Map distribution = determineDistribution( + iterator, + granularitySpec, + partitionDimension, + isAssumeGrouped, + tuningConfig.isLogParseExceptions(), + tuningConfig.getMaxParseExceptions() + ); + sendReport(new 
DimensionDistributionReport(getId(), distribution)); + } + + return TaskStatus.success(getId()); + } + + private Map determineDistribution( + HandlingInputRowIterator inputRowIterator, + GranularitySpec granularitySpec, + String partitionDimension, + boolean isAssumeGrouped, + boolean isLogParseExceptions, + long maxParseExceptions + ) + { + Map intervalToDistribution = new HashMap<>(); + DimensionValueFilter dimValueFilter = + isAssumeGrouped + ? new GroupedRowDimensionValueFilter() + : ungroupedRowDimValueFilterSupplier.get(); + + long numParseExceptions = 0; + + while (inputRowIterator.hasNext()) { + try { + InputRow inputRow = inputRowIterator.next(); + if (inputRow == null) { + continue; + } + + DateTime timestamp = inputRow.getTimestamp(); + + //noinspection OptionalGetWithoutIsPresent (InputRowIterator returns rows with present intervals) + Interval interval = granularitySpec.bucketInterval(timestamp).get(); + StringDistribution stringDistribution = + intervalToDistribution.computeIfAbsent(interval, k -> new StringSketch()); + + String dimensionValue = dimValueFilter.accept( + interval, + timestamp, + inputRow.getDimension(partitionDimension).get(0) + ); + + if (dimensionValue != null) { + stringDistribution.put(dimensionValue); + } + } + catch (ParseException e) { + if (isLogParseExceptions) { + LOG.error(e, "Encountered parse exception:"); + } + + numParseExceptions++; + if (numParseExceptions > maxParseExceptions) { + throw new RuntimeException("Max parse exceptions exceeded, terminating task..."); + } + } + } + + // UngroupedDimValueFilter may not accept the min/max dimensionValue. If needed, add the min/max + // values to the distributions so they have an accurate min/max. 
+ dimValueFilter.getIntervalToMinDimensionValue() + .forEach((interval, min) -> intervalToDistribution.get(interval).putIfNewMin(min)); + dimValueFilter.getIntervalToMaxDimensionValue() + .forEach((interval, max) -> intervalToDistribution.get(interval).putIfNewMax(max)); + + return intervalToDistribution; + } + + private void sendReport(DimensionDistributionReport report) + { + final ParallelIndexSupervisorTaskClient taskClient = taskClientFactory.build( + new ClientBasedTaskInfoProvider(indexingServiceClient), + getId(), + 1, // always use a single http thread + ingestionSchema.getTuningConfig().getChatHandlerTimeout(), + ingestionSchema.getTuningConfig().getChatHandlerNumRetries() + ); + taskClient.report(supervisorTaskId, report); + } + + private interface DimensionValueFilter + { + /** + * @return Dimension value if it should be accepted, else null + */ + @Nullable + String accept(Interval interval, DateTime timestamp, String dimesionValue); + + /** + * @return Minimum dimension value for each interval processed so far. + */ + Map getIntervalToMinDimensionValue(); + + /** + * @return Maximum dimension value for each interval processed so far. + */ + Map getIntervalToMaxDimensionValue(); + } + + @VisibleForTesting + static class UngroupedRowDimensionValueFilter implements DimensionValueFilter + { + // A bloom filter is used to approximately group rows by query granularity. These values assume + // time chunks have fewer than BLOOM_FILTER_EXPECTED_INSERTIONS rows. With the below values, the + // Bloom filter will use about 170MB of memory. 
+ // + // For more details on the Bloom filter memory consumption: + // https://github.com/google/guava/issues/2520#issuecomment-231233736 + private static final int BLOOM_FILTER_EXPECTED_INSERTIONS = 100_000_000; + private static final double BLOOM_FILTER_EXPECTED_FALSE_POSITIVE_PROBABILTY = 0.001; + + private final GroupedRowDimensionValueFilter delegate; + private final TimeDimTupleFactory timeDimTupleFactory; + private final BloomFilter timeDimTupleBloomFilter; + + UngroupedRowDimensionValueFilter(Granularity queryGranularity) + { + this(queryGranularity, BLOOM_FILTER_EXPECTED_INSERTIONS, BLOOM_FILTER_EXPECTED_FALSE_POSITIVE_PROBABILTY); + } + + @VisibleForTesting // to allow controlling false positive rate of bloom filter + UngroupedRowDimensionValueFilter( + Granularity queryGranularity, + int bloomFilterExpectedInsertions, + double bloomFilterFalsePositiveProbability + ) + { + delegate = new GroupedRowDimensionValueFilter(); + timeDimTupleFactory = new TimeDimTupleFactory(queryGranularity); + timeDimTupleBloomFilter = BloomFilter.create( + TimeDimTupleFunnel.INSTANCE, + bloomFilterExpectedInsertions, + bloomFilterFalsePositiveProbability + ); + } + + @Nullable + @Override + public String accept(Interval interval, DateTime timestamp, String dimensionValue) + { + delegate.accept(interval, timestamp, dimensionValue); + + TimeDimTuple timeDimTuple = timeDimTupleFactory.createWithBucketedTimestamp(timestamp, dimensionValue); + if (timeDimTupleBloomFilter.mightContain(timeDimTuple)) { + return null; + } else { + timeDimTupleBloomFilter.put(timeDimTuple); + return dimensionValue; + } + } + + @Override + public Map getIntervalToMinDimensionValue() + { + return delegate.getIntervalToMinDimensionValue(); + } + + @Override + public Map getIntervalToMaxDimensionValue() + { + return delegate.getIntervalToMaxDimensionValue(); + } + } + + private static class GroupedRowDimensionValueFilter implements DimensionValueFilter + { + private final Map intervalToMinDimensionValue; 
+ private final Map intervalToMaxDimensionValue; + + GroupedRowDimensionValueFilter() + { + this.intervalToMinDimensionValue = new HashMap<>(); + this.intervalToMaxDimensionValue = new HashMap<>(); + } + + @Override + @Nullable + public String accept(Interval interval, DateTime timestamp, String dimensionValue) + { + updateMinDimensionValue(interval, dimensionValue); + updateMaxDimensionValue(interval, dimensionValue); + return dimensionValue; + } + + private void updateMinDimensionValue(Interval interval, String dimensionValue) + { + String minDimensionValue = intervalToMinDimensionValue.get(interval); + if (minDimensionValue == null || dimensionValue.compareTo(minDimensionValue) < 0) { + intervalToMinDimensionValue.put(interval, dimensionValue); + } + } + + private void updateMaxDimensionValue(Interval interval, String dimensionValue) + { + String maxDimensionValue = intervalToMaxDimensionValue.get(interval); + if (maxDimensionValue == null || dimensionValue.compareTo(maxDimensionValue) > 0) { + intervalToMaxDimensionValue.put(interval, dimensionValue); + } + } + + @Override + public Map getIntervalToMinDimensionValue() + { + return intervalToMinDimensionValue; + } + + @Override + public Map getIntervalToMaxDimensionValue() + { + return intervalToMaxDimensionValue; + } + } +} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeIOConfig.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeIOConfig.java new file mode 100644 index 000000000000..bbec73f9a446 --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeIOConfig.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.indexing.common.task.batch.parallel;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonTypeName;
import org.apache.druid.segment.indexing.IOConfig;

import java.util.List;

/**
 * IO config for {@link PartialGenericSegmentMergeTask}: the list of partial segment locations
 * (produced by the generate phase) that the merge subtask should fetch and merge.
 */
@JsonTypeName(PartialGenericSegmentMergeTask.TYPE)
class PartialGenericSegmentMergeIOConfig extends PartialSegmentMergeIOConfig<GenericPartitionLocation>
    implements IOConfig
{
  @JsonCreator
  PartialGenericSegmentMergeIOConfig(
      @JsonProperty("partitionLocations") List<GenericPartitionLocation> partitionLocations
  )
  {
    super(partitionLocations);
  }
}
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.common.task.batch.parallel; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import org.apache.druid.segment.indexing.DataSchema; + +class PartialGenericSegmentMergeIngestionSpec + extends PartialSegmentMergeIngestionSpec +{ + @JsonCreator + PartialGenericSegmentMergeIngestionSpec( + @JsonProperty("dataSchema") DataSchema dataSchema, + @JsonProperty("ioConfig") PartialGenericSegmentMergeIOConfig ioConfig, + @JsonProperty("tuningConfig") ParallelIndexTuningConfig tuningConfig + ) + { + super(dataSchema, ioConfig, tuningConfig); + } +} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeParallelIndexTaskRunner.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeParallelIndexTaskRunner.java new file mode 100644 index 000000000000..e53b1d22451a --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeParallelIndexTaskRunner.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.indexing.common.task.batch.parallel;

import com.google.common.annotations.VisibleForTesting;
import org.apache.druid.client.indexing.IndexingServiceClient;
import org.apache.druid.data.input.InputSplit;
import org.apache.druid.indexing.common.TaskToolbox;
import org.apache.druid.segment.indexing.DataSchema;

import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * {@link ParallelIndexTaskRunner} for the phase to merge partitioned segments in multi-phase parallel indexing.
 * Spawns one {@link PartialGenericSegmentMergeTask} per {@link PartialGenericSegmentMergeIOConfig}.
 */
class PartialGenericSegmentMergeParallelIndexTaskRunner
    extends ParallelIndexPhaseRunner<PartialGenericSegmentMergeTask, PushedSegmentsReport>
{
  private final DataSchema dataSchema;
  private final List<PartialGenericSegmentMergeIOConfig> mergeIOConfigs;

  PartialGenericSegmentMergeParallelIndexTaskRunner(
      TaskToolbox toolbox,
      String taskId,
      String groupId,
      DataSchema dataSchema,
      List<PartialGenericSegmentMergeIOConfig> mergeIOConfigs,
      ParallelIndexTuningConfig tuningConfig,
      Map<String, Object> context,
      IndexingServiceClient indexingServiceClient
  )
  {
    super(toolbox, taskId, groupId, tuningConfig, context, indexingServiceClient);

    this.dataSchema = dataSchema;
    this.mergeIOConfigs = mergeIOConfigs;
  }

  @Override
  public String getName()
  {
    return PartialGenericSegmentMergeTask.TYPE;
  }

  @Override
  Iterator<SubTaskSpec<PartialGenericSegmentMergeTask>> subTaskSpecIterator()
  {
    return mergeIOConfigs.stream().map(this::newTaskSpec).iterator();
  }

  @Override
  int getTotalNumSubTasks()
  {
    // one merge subtask per IO config (i.e., per group of partitions assigned to a merger)
    return mergeIOConfigs.size();
  }

  @VisibleForTesting
  SubTaskSpec<PartialGenericSegmentMergeTask> newTaskSpec(PartialGenericSegmentMergeIOConfig ioConfig)
  {
    final PartialGenericSegmentMergeIngestionSpec ingestionSpec =
        new PartialGenericSegmentMergeIngestionSpec(
            dataSchema,
            ioConfig,
            getTuningConfig()
        );
    return new SubTaskSpec<PartialGenericSegmentMergeTask>(
        getTaskId() + "_" + getAndIncrementNextSpecId(),
        getGroupId(),
        getTaskId(),
        getContext(),
        new InputSplit<>(ioConfig.getPartitionLocations())
    )
    {
      @Override
      public PartialGenericSegmentMergeTask newSubTask(int numAttempts)
      {
        return new PartialGenericSegmentMergeTask(
            null,      // id: generated by the subtask itself
            getGroupId(),
            null,      // taskResource: use default
            getSupervisorTaskId(),
            numAttempts,
            ingestionSpec,
            getContext(),
            null,      // injected on the worker
            null,      // injected on the worker
            null       // injected on the worker
        );
      }
    };
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.indexing.common.task.batch.parallel;

import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;
import org.apache.druid.client.indexing.IndexingServiceClient;
import org.apache.druid.guice.annotations.EscalatedClient;
import org.apache.druid.indexing.common.TaskToolbox;
import org.apache.druid.indexing.common.task.IndexTaskClientFactory;
import org.apache.druid.indexing.common.task.TaskResource;
import org.apache.druid.java.util.http.client.HttpClient;
import org.apache.druid.timeline.partition.ShardSpec;
import org.joda.time.Interval;

import javax.annotation.Nullable;
import java.util.List;
import java.util.Map;

/**
 * Worker task of {@link PartialGenericSegmentMergeParallelIndexTaskRunner}: fetches the partial
 * range-partitioned segments listed in its IO config, merges them per (interval, partitionId),
 * and publishes the merged segments.
 */
public class PartialGenericSegmentMergeTask extends PartialSegmentMergeTask<ShardSpec, GenericPartitionLocation>
{
  public static final String TYPE = "partial_index_generic_merge";

  private final PartialGenericSegmentMergeIngestionSpec ingestionSchema;

  // Lookup from (interval, partitionId) to the shard spec the merged segment should use,
  // built from the partition locations in the IO config.
  private final Table<Interval, Integer, ShardSpec> intervalAndIntegerToShardSpec;

  @JsonCreator
  public PartialGenericSegmentMergeTask(
      // id shouldn't be null except when this task is created by ParallelIndexSupervisorTask
      @JsonProperty("id") @Nullable String id,
      @JsonProperty("groupId") final String groupId,
      @JsonProperty("resource") final TaskResource taskResource,
      @JsonProperty("supervisorTaskId") final String supervisorTaskId,
      @JsonProperty("numAttempts") final int numAttempts, // zero-based counting
      @JsonProperty("spec") final PartialGenericSegmentMergeIngestionSpec ingestionSchema,
      @JsonProperty("context") final Map<String, Object> context,
      @JacksonInject IndexingServiceClient indexingServiceClient,
      @JacksonInject IndexTaskClientFactory<ParallelIndexSupervisorTaskClient> taskClientFactory,
      @JacksonInject @EscalatedClient HttpClient shuffleClient
  )
  {
    super(
        getOrMakeId(id, TYPE, ingestionSchema.getDataSchema().getDataSource()),
        groupId,
        taskResource,
        supervisorTaskId,
        ingestionSchema.getDataSchema(),
        ingestionSchema.getIOConfig(),
        ingestionSchema.getTuningConfig(),
        numAttempts,
        context,
        indexingServiceClient,
        taskClientFactory,
        shuffleClient
    );

    this.ingestionSchema = ingestionSchema;
    this.intervalAndIntegerToShardSpec = createIntervalAndIntegerToShardSpec(
        ingestionSchema.getIOConfig().getPartitionLocations()
    );
  }

  /**
   * Index each partition location's shard spec by (interval, partitionId) for O(1) lookup in
   * {@link #createShardSpec}.
   */
  private static Table<Interval, Integer, ShardSpec> createIntervalAndIntegerToShardSpec(
      List<GenericPartitionLocation> partitionLocations
  )
  {
    Table<Interval, Integer, ShardSpec> intervalAndIntegerToShardSpec = HashBasedTable.create();

    partitionLocations.forEach(
        p -> intervalAndIntegerToShardSpec.put(p.getInterval(), p.getPartitionId(), p.getShardSpec())
    );

    return intervalAndIntegerToShardSpec;
  }

  @JsonProperty("spec")
  private PartialGenericSegmentMergeIngestionSpec getIngestionSchema()
  {
    return ingestionSchema;
  }

  @Override
  public String getType()
  {
    return TYPE;
  }

  @Override
  ShardSpec createShardSpec(TaskToolbox toolbox, Interval interval, int partitionNum)
  {
    // The shard spec was fixed by the generate phase; just look it up.
    return intervalAndIntegerToShardSpec.get(interval, partitionNum);
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.indexing.common.task.batch.parallel;

import com.google.common.annotations.VisibleForTesting;
import org.apache.druid.client.indexing.IndexingServiceClient;
import org.apache.druid.data.input.InputSplit;
import org.apache.druid.indexing.common.TaskToolbox;
import org.apache.druid.indexing.common.task.IndexTaskClientFactory;
import org.apache.druid.segment.realtime.appenderator.AppenderatorsManager;
import org.joda.time.Interval;

import java.util.Map;

/**
 * {@link ParallelIndexTaskRunner} for the phase to create range partitioned segments in multi-phase parallel indexing.
 *
 * @see PartialHashSegmentGenerateParallelIndexTaskRunner
 */
class PartialRangeSegmentGenerateParallelIndexTaskRunner
    extends InputSourceSplitParallelIndexTaskRunner<PartialRangeSegmentGenerateTask, GeneratedGenericPartitionsReport>
{
  private final IndexTaskClientFactory<ParallelIndexSupervisorTaskClient> taskClientFactory;
  private final AppenderatorsManager appenderatorsManager;

  // Range partition boundaries per interval, computed by the dimension-distribution phase.
  // NOTE(review): element type reconstructed as String[] to match
  // StringDistribution#getEvenPartitionsByTargetSize — TODO confirm.
  private final Map<Interval, String[]> intervalToPartitions;

  PartialRangeSegmentGenerateParallelIndexTaskRunner(
      TaskToolbox toolbox,
      String taskId,
      String groupId,
      ParallelIndexIngestionSpec ingestionSchema,
      Map<String, Object> context,
      IndexingServiceClient indexingServiceClient,
      Map<Interval, String[]> intervalToPartitions
  )
  {
    this(
        toolbox,
        taskId,
        groupId,
        ingestionSchema,
        context,
        indexingServiceClient,
        intervalToPartitions,
        null,
        null
    );
  }

  @VisibleForTesting
  PartialRangeSegmentGenerateParallelIndexTaskRunner(
      TaskToolbox toolbox,
      String taskId,
      String groupId,
      ParallelIndexIngestionSpec ingestionSchema,
      Map<String, Object> context,
      IndexingServiceClient indexingServiceClient,
      Map<Interval, String[]> intervalToPartitions,
      IndexTaskClientFactory<ParallelIndexSupervisorTaskClient> taskClientFactory,
      AppenderatorsManager appenderatorsManager
  )
  {
    super(toolbox, taskId, groupId, ingestionSchema, context, indexingServiceClient);
    this.taskClientFactory = taskClientFactory;
    this.appenderatorsManager = appenderatorsManager;
    this.intervalToPartitions = intervalToPartitions;
  }

  @Override
  public String getName()
  {
    return PartialRangeSegmentGenerateTask.TYPE;
  }

  @Override
  SubTaskSpec<PartialRangeSegmentGenerateTask> createSubTaskSpec(
      String id,
      String groupId,
      String supervisorTaskId,
      Map<String, Object> context,
      InputSplit split,
      ParallelIndexIngestionSpec subTaskIngestionSpec,
      IndexingServiceClient indexingServiceClient
  )
  {
    return new SubTaskSpec<PartialRangeSegmentGenerateTask>(
        id,
        groupId,
        supervisorTaskId,
        context,
        split
    )
    {
      @Override
      public PartialRangeSegmentGenerateTask newSubTask(int numAttempts)
      {
        return new PartialRangeSegmentGenerateTask(
            null,  // id: generated by the subtask itself
            groupId,
            null,  // taskResource: use default
            supervisorTaskId,
            numAttempts,
            subTaskIngestionSpec,
            context,
            intervalToPartitions,
            indexingServiceClient,
            taskClientFactory,
            appenderatorsManager
        );
      }
    };
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.indexing.common.task.batch.parallel;

import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Preconditions;
import org.apache.druid.client.indexing.IndexingServiceClient;
import org.apache.druid.indexer.partitions.PartitionsSpec;
import org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec;
import org.apache.druid.indexing.common.TaskToolbox;
import org.apache.druid.indexing.common.actions.TaskActionClient;
import org.apache.druid.indexing.common.task.IndexTaskClientFactory;
import org.apache.druid.indexing.common.task.IndexTaskSegmentAllocator;
import org.apache.druid.indexing.common.task.RangePartitionCachingLocalSegmentAllocator;
import org.apache.druid.indexing.common.task.TaskResource;
import org.apache.druid.indexing.common.task.batch.parallel.iterator.RangePartitionIndexTaskInputRowIteratorBuilder;
import org.apache.druid.indexing.worker.ShuffleDataSegmentPusher;
import org.apache.druid.segment.realtime.appenderator.AppenderatorsManager;
import org.apache.druid.timeline.DataSegment;
import org.joda.time.Interval;

import javax.annotation.Nullable;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/**
 * The worker task of {@link PartialRangeSegmentGenerateParallelIndexTaskRunner}. This task
 * partitions input data by ranges of the partition dimension specified in
 * {@link SingleDimensionPartitionsSpec}. Partitioned segments are stored in local storage using
 * {@link ShuffleDataSegmentPusher}.
 */
public class PartialRangeSegmentGenerateTask extends PartialSegmentGenerateTask<GeneratedGenericPartitionsReport>
{
  public static final String TYPE = "partial_range_index_generate";
  private static final String PROP_SPEC = "spec";

  private final String supervisorTaskId;
  private final int numAttempts;
  private final ParallelIndexIngestionSpec ingestionSchema;

  // Range partition boundaries per interval, supplied by the supervisor after the
  // dimension-distribution phase. NOTE(review): element type reconstructed as String[] — TODO confirm.
  private final Map<Interval, String[]> intervalToPartitions;

  @JsonCreator
  public PartialRangeSegmentGenerateTask(
      // id shouldn't be null except when this task is created by ParallelIndexSupervisorTask
      @JsonProperty("id") @Nullable String id,
      @JsonProperty("groupId") String groupId,
      @JsonProperty("resource") TaskResource taskResource,
      @JsonProperty("supervisorTaskId") String supervisorTaskId,
      @JsonProperty("numAttempts") int numAttempts, // zero-based counting
      @JsonProperty(PROP_SPEC) ParallelIndexIngestionSpec ingestionSchema,
      @JsonProperty("context") Map<String, Object> context,
      @JsonProperty("intervalToPartitions") Map<Interval, String[]> intervalToPartitions,
      @JacksonInject IndexingServiceClient indexingServiceClient,
      @JacksonInject IndexTaskClientFactory<ParallelIndexSupervisorTaskClient> taskClientFactory,
      @JacksonInject AppenderatorsManager appenderatorsManager
  )
  {
    super(
        getOrMakeId(id, TYPE, ingestionSchema.getDataSchema().getDataSource()),
        groupId,
        taskResource,
        supervisorTaskId,
        ingestionSchema,
        context,
        indexingServiceClient,
        taskClientFactory,
        appenderatorsManager,
        new RangePartitionIndexTaskInputRowIteratorBuilder(getPartitionDimension(ingestionSchema))
    );

    this.numAttempts = numAttempts;
    this.ingestionSchema = ingestionSchema;
    this.supervisorTaskId = supervisorTaskId;
    this.intervalToPartitions = intervalToPartitions;
  }

  /**
   * Extract and validate the partition dimension from the ingestion spec's partitions spec.
   *
   * @throws IllegalArgumentException if the partitions spec is not a {@link SingleDimensionPartitionsSpec}
   * @throws NullPointerException     if the partition dimension is not set
   */
  private static String getPartitionDimension(ParallelIndexIngestionSpec ingestionSpec)
  {
    PartitionsSpec partitionsSpec = ingestionSpec.getTuningConfig().getPartitionsSpec();
    Preconditions.checkArgument(
        partitionsSpec instanceof SingleDimensionPartitionsSpec,
        "%s partitionsSpec required",
        SingleDimensionPartitionsSpec.NAME
    );

    SingleDimensionPartitionsSpec singleDimPartitionsSpec = (SingleDimensionPartitionsSpec) partitionsSpec;
    String partitionDimension = singleDimPartitionsSpec.getPartitionDimension();
    Preconditions.checkNotNull(partitionDimension, "partitionDimension required");

    return partitionDimension;
  }

  @JsonProperty
  public int getNumAttempts()
  {
    return numAttempts;
  }

  @JsonProperty(PROP_SPEC)
  public ParallelIndexIngestionSpec getIngestionSchema()
  {
    return ingestionSchema;
  }

  @JsonProperty
  public String getSupervisorTaskId()
  {
    return supervisorTaskId;
  }

  @JsonProperty
  public Map<Interval, String[]> getIntervalToPartitions()
  {
    return intervalToPartitions;
  }

  @Override
  public String getType()
  {
    return TYPE;
  }

  @Override
  public boolean isReady(TaskActionClient taskActionClient)
  {
    // NOTE(review): always ready — locking appears to be handled by the supervisor; confirm.
    return true;
  }

  @Override
  IndexTaskSegmentAllocator createSegmentAllocator(TaskToolbox toolbox) throws IOException
  {
    // Allocates SingleDimensionShardSpecs using the precomputed range boundaries.
    return new RangePartitionCachingLocalSegmentAllocator(
        toolbox,
        getId(),
        getDataSource(),
        getPartitionDimension(ingestionSchema),
        intervalToPartitions
    );
  }

  @Override
  GeneratedGenericPartitionsReport createGeneratedPartitionsReport(TaskToolbox toolbox, List<DataSegment> segments)
  {
    List<GenericPartitionStat> partitionStats = segments.stream()
                                                        .map(segment -> createPartitionStat(toolbox, segment))
                                                        .collect(Collectors.toList());
    return new GeneratedGenericPartitionsReport(getId(), partitionStats);
  }

  /**
   * Describe a pushed partial segment so the supervisor can route mergers to this worker's
   * shuffle endpoint.
   */
  private GenericPartitionStat createPartitionStat(TaskToolbox toolbox, DataSegment segment)
  {
    return new GenericPartitionStat(
        toolbox.getTaskExecutorNode().getHost(),
        toolbox.getTaskExecutorNode().getPortToUse(),
        toolbox.getTaskExecutorNode().isEnableTlsPort(),
        segment.getInterval(),
        segment.getShardSpec(),
        null, // numRows is not supported yet
        null  // sizeBytes is not supported yet
    );
  }
}
a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/SubTaskReport.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/SubTaskReport.java index e60423533dfa..8cc6db91e94e 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/SubTaskReport.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/SubTaskReport.java @@ -30,7 +30,9 @@ @JsonTypeInfo(use = Id.NAME, property = "type", defaultImpl = PushedSegmentsReport.class) @JsonSubTypes(value = { @Type(name = PushedSegmentsReport.TYPE, value = PushedSegmentsReport.class), - @Type(name = GeneratedHashPartitionsReport.TYPE, value = GeneratedHashPartitionsReport.class) + @Type(name = GeneratedHashPartitionsReport.TYPE, value = GeneratedHashPartitionsReport.class), + @Type(name = DimensionDistributionReport.TYPE, value = DimensionDistributionReport.class), + @Type(name = GeneratedGenericPartitionsReport.TYPE, value = GeneratedGenericPartitionsReport.class) }) public interface SubTaskReport { diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringDistribution.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringDistribution.java new file mode 100644 index 000000000000..643a1a8276e8 --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringDistribution.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.common.task.batch.parallel.distribution; + +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; + +/** + * Counts frequencies of {@link String}s. + */ +@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "type") +@JsonSubTypes({ + @JsonSubTypes.Type(name = StringSketch.NAME, value = StringSketch.class) +}) +public interface StringDistribution +{ + /** + * Record occurence of {@link String} + */ + void put(String element); + + /** + * Record occurence of {@link String} if it will become the new minimum element. + */ + void putIfNewMin(String element); + + /** + * Record occurence of {@link String} if it will become the new maximum element; + */ + void putIfNewMax(String element); + + /** + * Split the distribution in the fewest number of evenly-sized partitions while honoring a max + * partition size. + * + * @return Array of elements that correspond to the endpoints of evenly-sized partitions of the + * sorted elements. + */ + String[] getEvenPartitionsByMaxSize(int maxSize); + + /** + * Split the distribution in the fewest number of evenly-sized partitions while honoring a target + * partition size (actual partition sizes may be slightly lower or higher). + * + * @return Array of elements that correspond to the endpoints of evenly-sized partitions of the + * sorted elements. 
+ */ + String[] getEvenPartitionsByTargetSize(int targetSize); +} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringDistributionMerger.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringDistributionMerger.java new file mode 100644 index 000000000000..f35fd33a792e --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringDistributionMerger.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.common.task.batch.parallel.distribution; + +/** + * Merges {@link StringDistribution}s. + */ +public interface StringDistributionMerger +{ + /** + * Merge distribution. + */ + void merge(StringDistribution distribution); + + /** + * @return Merged distributions. 
+ */ + StringDistribution getResult(); +} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketch.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketch.java new file mode 100644 index 000000000000..74a97b2d7537 --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketch.java @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel.distribution; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.databind.DeserializationContext; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.SerializerProvider; +import com.fasterxml.jackson.databind.annotation.JsonDeserialize; +import com.fasterxml.jackson.databind.annotation.JsonSerialize; +import com.fasterxml.jackson.databind.deser.std.StdDeserializer; +import com.fasterxml.jackson.databind.jsontype.TypeSerializer; +import com.fasterxml.jackson.databind.ser.std.StdSerializer; +import com.google.common.base.Preconditions; +import com.yahoo.memory.Memory; +import com.yahoo.sketches.ArrayOfStringsSerDe; +import com.yahoo.sketches.quantiles.ItemsSketch; + +import java.io.IOException; +import java.util.Comparator; + +/** + * Counts approximate frequencies of strings. + */ +@JsonSerialize(using = StringSketch.Jackson.Serializer.class) +@JsonDeserialize(using = StringSketch.Jackson.Deserializer.class) +public class StringSketch implements StringDistribution +{ + static final String NAME = "sketch"; + static final int SKETCH_K = 1 << 12; // smallest value with normalized rank error < 0.1%; retain up to ~86k elements + static final Comparator SKETCH_COMPARATOR = Comparator.naturalOrder(); + private static final ArrayOfStringsSerDe ARRAY_OF_STRINGS_SERDE = new ArrayOfStringsSerDe(); + + private final ItemsSketch delegate; + + public StringSketch() + { + this(ItemsSketch.getInstance(SKETCH_K, SKETCH_COMPARATOR)); + } + + StringSketch(ItemsSketch sketch) + { + this.delegate = sketch; + } + + @Override + public void put(String string) + { + delegate.update(string); + } + + @Override + public void putIfNewMin(String string) + { + String min = delegate.getMinValue(); + if (min == null || string.compareTo(min) < 0) { + delegate.update(string); + } + } + + @Override + public void 
putIfNewMax(String string) + { + String max = delegate.getMaxValue(); + if (max == null || string.compareTo(max) > 0) { + delegate.update(string); + } + } + + @Override + public String[] getEvenPartitionsByMaxSize(int maxSize) + { + Preconditions.checkArgument(maxSize > 0, "maxSize must be positive but is %s", maxSize); + long n = delegate.getN(); + double delta = delegate.getNormalizedRankError(true) * n; // account for approx distribution + int targetSize = Math.max(1, (int) Math.floor(maxSize - delta)); // floor() to increase chance below max size + int evenPartitionCount = (int) Math.ceil((double) n / targetSize); // ceil() to increase chance below max size + return getEventPartitionsByCount(Math.max(1, evenPartitionCount)); + } + + @Override + public String[] getEvenPartitionsByTargetSize(int targetSize) + { + Preconditions.checkArgument(targetSize > 0, "targetSize must be positive but is %s", targetSize); + long n = delegate.getN(); + int evenPartitionCount = Math.max(1, (int) Math.round((double) n / targetSize)); + return getEventPartitionsByCount(evenPartitionCount); + } + + private String[] getEventPartitionsByCount(int evenPartitionCount) + { + Preconditions.checkArgument( + evenPartitionCount > 0, + "evenPartitionCount must be positive but is %s", + evenPartitionCount + ); + String[] partitions = delegate.getQuantiles(evenPartitionCount + 1); // add 1 since this returns endpoints + return (partitions == null) ? 
new String[0] : partitions; + } + + @Override + public String toString() + { + return "StringSketch{" + + "delegate=" + delegate + + '}'; + } + + ItemsSketch getDelegate() + { + return delegate; + } + + private byte[] toByteArray() + { + return delegate.toByteArray(ARRAY_OF_STRINGS_SERDE); + } + + static class Jackson + { + private static final String FIELD_SKETCH = "sketch"; + + static class Serializer extends StdSerializer + { + Serializer() + { + super(StringSketch.class); + } + + @Override + public void serialize( + StringSketch stringSketch, + JsonGenerator jsonGenerator, + SerializerProvider serializerProvider + ) throws IOException + { + jsonGenerator.writeBinaryField(FIELD_SKETCH, stringSketch.toByteArray()); + } + + @Override + public void serializeWithType( + StringSketch value, + JsonGenerator gen, + SerializerProvider serializers, + TypeSerializer typeSer + ) throws IOException + { + typeSer.writeTypePrefixForObject(value, gen); + serialize(value, gen, serializers); + typeSer.writeTypeSuffixForObject(value, gen); + } + } + + static class Deserializer extends StdDeserializer + { + Deserializer() + { + super(StringSketch.class); + } + + @Override + public StringSketch deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) + throws IOException + { + JsonNode jsonNode = jsonParser.getCodec().readTree(jsonParser); + byte[] sketchBytes = jsonNode.get(FIELD_SKETCH).binaryValue(); + ItemsSketch sketch = ItemsSketch.getInstance( + Memory.wrap(sketchBytes), + SKETCH_COMPARATOR, + ARRAY_OF_STRINGS_SERDE + ); + return new StringSketch(sketch); + } + } + } +} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMerger.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMerger.java new file mode 100644 index 000000000000..f628f35c6694 --- /dev/null +++ 
b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMerger.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.common.task.batch.parallel.distribution; + +import com.yahoo.sketches.quantiles.ItemsUnion; + +/** + * Merges {@link StringSketch}es. 
+ */ +public class StringSketchMerger implements StringDistributionMerger +{ + private final ItemsUnion delegate; + + public StringSketchMerger() + { + delegate = ItemsUnion.getInstance(StringSketch.SKETCH_K, StringSketch.SKETCH_COMPARATOR); + } + + @Override + public void merge(StringDistribution stringDistribution) + { + if (!(stringDistribution instanceof StringSketch)) { + throw new IllegalArgumentException("Only merging StringSketch instances is currently supported"); + } + + StringSketch stringSketch = (StringSketch) stringDistribution; + delegate.update(stringSketch.getDelegate()); + } + + @Override + public StringDistribution getResult() + { + return new StringSketch(delegate.getResult()); + } +} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTuple.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTuple.java new file mode 100644 index 000000000000..1c7f5c3be12f --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTuple.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
/**
 * Immutable (timestamp, dimension value) pair, ordered by timestamp first and then by
 * dimension value.
 */
public class TimeDimTuple implements Comparable<TimeDimTuple>
{
  private final long timestamp;
  private final String dimensionValue;

  TimeDimTuple(long timestamp, String dimensionValue)
  {
    this.timestamp = timestamp;
    this.dimensionValue = dimensionValue;
  }

  public long getTimestamp()
  {
    return timestamp;
  }

  public String getDimensionValue()
  {
    return dimensionValue;
  }

  @Override
  public int compareTo(TimeDimTuple o)
  {
    int byTimestamp = Long.compare(timestamp, o.timestamp);
    return byTimestamp != 0 ? byTimestamp : dimensionValue.compareTo(o.dimensionValue);
  }

  @Override
  public boolean equals(Object o)
  {
    // Consistent with compareTo(): tuples are equal iff they compare as equal.
    return (o instanceof TimeDimTuple) && compareTo((TimeDimTuple) o) == 0;
  }

  @Override
  public int hashCode()
  {
    return Objects.hash(timestamp, dimensionValue);
  }

  @Override
  public String toString()
  {
    return "TimeDimTuple{" +
           "timestamp=" + timestamp +
           ", dimensionValue='" + dimensionValue + '\'' +
           '}';
  }
}
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.common.task.batch.parallel.distribution; + +import org.apache.druid.java.util.common.granularity.Granularity; +import org.joda.time.DateTime; + +/** + * Creates {@link TimeDimTuple}s with time stamp adjust according to a {@link Granularity}. + */ +public class TimeDimTupleFactory +{ + private final Granularity granularity; + + public TimeDimTupleFactory(Granularity granularity) + { + this.granularity = granularity; + } + + public TimeDimTuple createWithBucketedTimestamp(DateTime timestamp, String dimensionValue) + { + return new TimeDimTuple(getBucketTimestamp(timestamp), dimensionValue); + } + + private long getBucketTimestamp(DateTime dateTime) + { + return granularity.bucketStart(dateTime).getMillis(); + } +} + diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleFunnel.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleFunnel.java new file mode 100644 index 000000000000..050c903402a3 --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleFunnel.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.common.task.batch.parallel.distribution; + +import com.google.common.hash.Funnel; +import com.google.common.hash.PrimitiveSink; + +/** + * Utility class for adding {@link TimeDimTuple}s to a {@link com.google.common.hash.BloomFilter}. + */ +public enum TimeDimTupleFunnel implements Funnel +{ + INSTANCE; + + @Override + public void funnel(TimeDimTuple timeDimTuple, PrimitiveSink into) + { + into.putLong(timeDimTuple.getTimestamp()) + .putUnencodedChars(timeDimTuple.getDimensionValue()); + } +} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/DefaultIndexTaskInputRowIteratorBuilder.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/DefaultIndexTaskInputRowIteratorBuilder.java index 3a8ad8ab566c..b2a9463bf40f 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/DefaultIndexTaskInputRowIteratorBuilder.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/DefaultIndexTaskInputRowIteratorBuilder.java @@ -55,6 +55,8 @@ * If any of the handlers invoke their respective callback, the {@link HandlingInputRowIterator} will yield * a null 
{@link InputRow} next; otherwise, the next {@link InputRow} is yielded. * + * + * @see RangePartitionIndexTaskInputRowIteratorBuilder */ public class DefaultIndexTaskInputRowIteratorBuilder implements IndexTaskInputRowIteratorBuilder { diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionIndexTaskInputRowIteratorBuilder.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionIndexTaskInputRowIteratorBuilder.java new file mode 100644 index 000000000000..b2884b99d439 --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionIndexTaskInputRowIteratorBuilder.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel.iterator; + +import org.apache.druid.data.input.HandlingInputRowIterator; +import org.apache.druid.data.input.InputRow; +import org.apache.druid.indexing.common.task.IndexTask; +import org.apache.druid.java.util.common.parsers.CloseableIterator; +import org.apache.druid.java.util.common.parsers.ParseException; +import org.apache.druid.segment.indexing.granularity.GranularitySpec; + +import java.util.List; +import java.util.function.Consumer; + +/** + *
+ * Build an {@link HandlingInputRowIterator} for {@link IndexTask}s used for range partitioning. Each {@link
+ * InputRow} is processed by the following handlers, in order:
+ *
+ *   1. Null row: If {@link InputRow} is null, invoke the null row {@link Runnable} callback.
+ *
+ *   2. Invalid timestamp: If {@link InputRow} has an invalid timestamp, throw a {@link ParseException}.
+ *
+ *   3. Absent bucket interval: If {@link InputRow} has a timestamp that does not match the
+ *      {@link GranularitySpec} bucket intervals, invoke the absent bucket interval {@link Consumer}
+ *      callback.
+ *
+ *   4. Filter for rows with only a single dimension value count for the specified partition dimension.
+ *
+ * If any of the handlers invoke their respective callback, the {@link HandlingInputRowIterator} will yield
+ * a null {@link InputRow} next; otherwise, the next {@link InputRow} is yielded.
+ * 
+ * + * @see DefaultIndexTaskInputRowIteratorBuilder + */ +public class RangePartitionIndexTaskInputRowIteratorBuilder implements IndexTaskInputRowIteratorBuilder +{ + private final DefaultIndexTaskInputRowIteratorBuilder delegate; + + public RangePartitionIndexTaskInputRowIteratorBuilder(String partitionDimension) + { + delegate = new DefaultIndexTaskInputRowIteratorBuilder(); + delegate.appendInputRowHandler(createOnlySingleDimensionValueRowsHandler(partitionDimension)); + } + + @Override + public IndexTaskInputRowIteratorBuilder delegate(CloseableIterator inputRowIterator) + { + return delegate.delegate(inputRowIterator); + } + + @Override + public IndexTaskInputRowIteratorBuilder granularitySpec(GranularitySpec granularitySpec) + { + return delegate.granularitySpec(granularitySpec); + } + + @Override + public IndexTaskInputRowIteratorBuilder nullRowRunnable(Runnable nullRowRunnable) + { + return delegate.nullRowRunnable(nullRowRunnable); + } + + @Override + public IndexTaskInputRowIteratorBuilder absentBucketIntervalConsumer(Consumer absentBucketIntervalConsumer) + { + return delegate.absentBucketIntervalConsumer(absentBucketIntervalConsumer); + } + + @Override + public HandlingInputRowIterator build() + { + return delegate.build(); + } + + private static HandlingInputRowIterator.InputRowHandler createOnlySingleDimensionValueRowsHandler( + String partitionDimension + ) + { + return inputRow -> { + List dimensionValues = inputRow.getDimension(partitionDimension); + return dimensionValues.size() != 1; + }; + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IngestionTestBase.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IngestionTestBase.java index dcaffb395ded..fe6c534e837a 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IngestionTestBase.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IngestionTestBase.java @@ -23,6 +23,7 @@ 
import com.google.common.base.Optional; import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; +import org.apache.druid.common.config.NullHandling; import org.apache.druid.indexer.TaskStatus; import org.apache.druid.indexing.common.SegmentLoaderFactory; import org.apache.druid.indexing.common.SingleFileTaskReportFileWriter; @@ -80,6 +81,10 @@ public abstract class IngestionTestBase { + static { + NullHandling.initializeForTests(); + } + @Rule public TemporaryFolder temporaryFolder = new TemporaryFolder(); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java new file mode 100644 index 000000000000..86cb36403c25 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task; + +import com.google.common.collect.ImmutableMap; +import org.apache.druid.data.input.InputRow; +import org.apache.druid.indexing.common.TaskLock; +import org.apache.druid.indexing.common.TaskToolbox; +import org.apache.druid.indexing.common.actions.LockListAction; +import org.apache.druid.indexing.common.actions.TaskActionClient; +import org.apache.druid.java.util.common.DateTimes; +import org.apache.druid.java.util.common.Intervals; +import org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec; +import org.apache.druid.timeline.SegmentId; +import org.apache.druid.timeline.partition.SingleDimensionShardSpec; +import org.easymock.EasyMock; +import org.joda.time.Interval; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class RangePartitionCachingLocalSegmentAllocatorTest +{ + private static final String DATASOURCE = "datasource"; + private static final String TASKID = "taskid"; + private static final String PARTITION_DIMENSION = "dimension"; + private static final Interval INTERVAL_EMPTY = Intervals.utc(0, 1000); + private static final Interval INTERVAL_SINGLETON = Intervals.utc(1000, 2000); + private static final Interval INTERVAL_NORMAL = Intervals.utc(2000, 3000); + private static final Interval INTERVAL_FREQUENT_MID = Intervals.utc(3000, 4000); + private static final Interval INTERVAL_FREQUENT_MAX = Intervals.utc(5000, 6000); + private static final Map INTERVAL_TO_VERSION = ImmutableMap.of( + INTERVAL_EMPTY, "version-empty", + INTERVAL_SINGLETON, "version-singleton", + INTERVAL_NORMAL, "version-normal", + INTERVAL_FREQUENT_MID, "version-frequent-mid", + 
INTERVAL_FREQUENT_MAX, "version-frequent-max" + ); + private static final String PARTITION0 = "0"; + private static final String PARTITION5 = "5"; + private static final String PARTITION9 = "9"; + private static final String[] EMPTY_PARTITIONS = new String[]{}; + private static final String[] SINGLETON_PARTITIONS = new String[]{PARTITION0, PARTITION0}; + private static final String[] NORMAL_PARTITIONS = new String[]{PARTITION0, PARTITION5, PARTITION9}; + private static final String[] FREQUENT_MID_PARTITIONS = new String[]{PARTITION0, PARTITION5, PARTITION5, PARTITION9}; + private static final String[] FREQUENT_MAX_PARTITIONS = new String[]{PARTITION0, PARTITION5, PARTITION9, PARTITION9}; + + private static final Map INTERVAL_TO_PARTITONS = ImmutableMap.of( + INTERVAL_EMPTY, EMPTY_PARTITIONS, + INTERVAL_SINGLETON, SINGLETON_PARTITIONS, + INTERVAL_NORMAL, NORMAL_PARTITIONS, + INTERVAL_FREQUENT_MID, FREQUENT_MID_PARTITIONS, + INTERVAL_FREQUENT_MAX, FREQUENT_MAX_PARTITIONS + ); + + private RangePartitionCachingLocalSegmentAllocator target; + + @Rule + public ExpectedException exception = ExpectedException.none(); + + @Before + public void setup() throws IOException + { + TaskToolbox toolbox = createToolbox( + INTERVAL_TO_VERSION.keySet() + .stream() + .map(RangePartitionCachingLocalSegmentAllocatorTest::createTaskLock) + .collect(Collectors.toList()) + ); + target = new RangePartitionCachingLocalSegmentAllocator( + toolbox, + TASKID, + DATASOURCE, + PARTITION_DIMENSION, + INTERVAL_TO_PARTITONS + ); + } + + @Test + public void failsIfAllocateFromEmptyInterval() + { + int dummy = 0; + Interval interval = INTERVAL_EMPTY; + InputRow row = createInputRow(interval, PARTITION9); + + exception.expect(IllegalStateException.class); + exception.expectMessage("Failed to get shardSpec"); + + testAllocate(row, interval, dummy, null); + } + + @Test + public void allocatesCorrectShardSpecsForSingletonPartitions() + { + Interval interval = INTERVAL_SINGLETON; + InputRow row = 
createInputRow(interval, PARTITION9); + testAllocate(row, interval, 0, null); + } + + + @Test + public void allocatesCorrectShardSpecsForFirstPartition() + { + Interval interval = INTERVAL_NORMAL; + InputRow row = createInputRow(interval, PARTITION0); + testAllocate(row, interval, 0); + } + + @Test + public void allocatesCorrectShardSpecsForLastPartitionWithoutFrequentValue() + { + Interval interval = INTERVAL_NORMAL; + InputRow row = createInputRow(interval, PARTITION9); + testAllocate(row, interval, INTERVAL_TO_PARTITONS.get(interval).length - 2, null); + } + + @Test + public void allocatesCorrectShardSpecsForLPartitionWithFrequentMid() + { + Interval interval = INTERVAL_FREQUENT_MID; + InputRow row = createInputRow(interval, PARTITION9); + testAllocate(row, interval, INTERVAL_TO_PARTITONS.get(interval).length - 3, null); + } + + @Test + public void allocatesCorrectShardSpecsForLastPartitionWithFrequentMax() + { + Interval interval = INTERVAL_FREQUENT_MAX; + InputRow row = createInputRow(interval, PARTITION9); + testAllocate(row, interval, INTERVAL_TO_PARTITONS.get(interval).length - 2, null); + } + + private void testAllocate(InputRow row, Interval interval, int partitionNum) + { + testAllocate(row, interval, partitionNum, INTERVAL_TO_PARTITONS.get(interval)[partitionNum + 1]); + } + + private void testAllocate(InputRow row, Interval interval, int partitionNum, @Nullable String partitionEnd) + { + String sequenceName = target.getSequenceName(interval, row); + SegmentIdWithShardSpec segmentIdWithShardSpec = allocate(row, sequenceName); + + Assert.assertEquals( + SegmentId.of(DATASOURCE, interval, INTERVAL_TO_VERSION.get(interval), partitionNum), + segmentIdWithShardSpec.asSegmentId() + ); + SingleDimensionShardSpec shardSpec = (SingleDimensionShardSpec) segmentIdWithShardSpec.getShardSpec(); + Assert.assertEquals(PARTITION_DIMENSION, shardSpec.getDimension()); + Assert.assertEquals(partitionNum, shardSpec.getPartitionNum()); + String partitionStart = 
INTERVAL_TO_PARTITONS.get(interval)[partitionNum]; + Assert.assertEquals(partitionStart, shardSpec.getStart()); + Assert.assertEquals(partitionEnd, shardSpec.getEnd()); + } + + private SegmentIdWithShardSpec allocate(InputRow row, String sequenceName) + { + try { + return target.allocate(row, sequenceName, null, false); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + private static TaskToolbox createToolbox(List taskLocks) + { + TaskToolbox toolbox = EasyMock.mock(TaskToolbox.class); + EasyMock.expect(toolbox.getTaskActionClient()).andStubReturn(createTaskActionClient(taskLocks)); + EasyMock.replay(toolbox); + return toolbox; + } + + private static TaskActionClient createTaskActionClient(List taskLocks) + { + try { + TaskActionClient taskActionClient = EasyMock.mock(TaskActionClient.class); + EasyMock.expect(taskActionClient.submit(EasyMock.anyObject(LockListAction.class))).andStubReturn(taskLocks); + EasyMock.replay(taskActionClient); + return taskActionClient; + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + private static TaskLock createTaskLock(Interval interval) + { + TaskLock taskLock = EasyMock.mock(TaskLock.class); + EasyMock.expect(taskLock.getInterval()).andStubReturn(interval); + EasyMock.expect(taskLock.getVersion()).andStubReturn(INTERVAL_TO_VERSION.get(interval)); + EasyMock.replay(taskLock); + return taskLock; + } + + private static InputRow createInputRow(Interval interval, String dimensionValue) + { + long timestamp = interval.getStartMillis(); + InputRow inputRow = EasyMock.mock(InputRow.class); + EasyMock.expect(inputRow.getTimestamp()).andStubReturn(DateTimes.utc(timestamp)); + EasyMock.expect(inputRow.getTimestampFromEpoch()).andStubReturn(timestamp); + EasyMock.expect(inputRow.getDimension(PARTITION_DIMENSION)) + .andStubReturn(Collections.singletonList(dimensionValue)); + EasyMock.replay(inputRow); + return inputRow; + } +} diff --git 
a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/DimensionDistributionReportTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/DimensionDistributionReportTest.java new file mode 100644 index 000000000000..c23362f3e9c3 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/DimensionDistributionReportTest.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch; +import org.apache.druid.java.util.common.Intervals; +import org.apache.druid.segment.TestHelper; +import org.joda.time.Interval; +import org.junit.Before; +import org.junit.Test; + +import java.util.Collections; +import java.util.Map; + +public class DimensionDistributionReportTest +{ + private static final ObjectMapper OBJECT_MAPPER = ParallelIndexTestingFactory.createObjectMapper(); + + private DimensionDistributionReport target; + + @Before + public void setup() + { + Interval interval = Intervals.ETERNITY; + StringSketch sketch = new StringSketch(); + Map intervalToDistribution = Collections.singletonMap(interval, sketch); + String taskId = "abc"; + target = new DimensionDistributionReport(taskId, intervalToDistribution); + } + + @Test + public void serializesDeserializes() + { + TestHelper.testSerializesDeserializes(OBJECT_MAPPER, target); + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionLocationTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionLocationTest.java new file mode 100644 index 000000000000..956dbc8fd150 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionLocationTest.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.common.task.batch.parallel; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.druid.segment.TestHelper; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class GenericPartitionLocationTest +{ + private static final ObjectMapper OBJECT_MAPPER = ParallelIndexTestingFactory.createObjectMapper(); + + private GenericPartitionLocation target; + + @Before + public void setup() + { + target = new GenericPartitionLocation( + ParallelIndexTestingFactory.HOST, + ParallelIndexTestingFactory.PORT, + ParallelIndexTestingFactory.USE_HTTPS, + ParallelIndexTestingFactory.SUBTASK_ID, + ParallelIndexTestingFactory.INTERVAL, + ParallelIndexTestingFactory.HASH_BASED_NUMBERED_SHARD_SPEC + ); + } + + @Test + public void serializesDeserializes() + { + TestHelper.testSerializesDeserializes(OBJECT_MAPPER, target); + } + + @Test + public void hasPartitionIdThatMatchesShardSpec() + { + Assert.assertEquals(ParallelIndexTestingFactory.PARTITION_ID, target.getPartitionId()); + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStatTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStatTest.java new file mode 100644 index 000000000000..2bcac8edfd47 --- /dev/null +++ 
b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStatTest.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.common.task.batch.parallel; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.druid.segment.TestHelper; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class GenericPartitionStatTest +{ + private static final ObjectMapper OBJECT_MAPPER = ParallelIndexTestingFactory.createObjectMapper(); + + private GenericPartitionStat target; + + @Before + public void setup() + { + target = new GenericPartitionStat( + ParallelIndexTestingFactory.TASK_EXECUTOR_HOST, + ParallelIndexTestingFactory.TASK_EXECUTOR_PORT, + ParallelIndexTestingFactory.USE_HTTPS, + ParallelIndexTestingFactory.INTERVAL, + ParallelIndexTestingFactory.HASH_BASED_NUMBERED_SHARD_SPEC, + ParallelIndexTestingFactory.NUM_ROWS, + ParallelIndexTestingFactory.SIZE_BYTES + ); + } + + @Test + public void serializesDeserializes() + { + TestHelper.testSerializesDeserializes(OBJECT_MAPPER, target); + } + + @Test + public void hasPartitionIdThatMatchesSecondaryPartition() + { + 
Assert.assertEquals(target.getSecondaryPartition().getPartitionNum(), target.getPartitionId()); + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskSerdeTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskSerdeTest.java index 97c6954e9247..313a5ccad4ef 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskSerdeTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskSerdeTest.java @@ -153,14 +153,28 @@ public void forceGuaranteedRollupWithHashPartitionsValid() } @Test - public void forceGuaranteedRollupWithSingleDimPartitionsInvalid() + public void forceGuaranteedRollupWithSingleDimPartitionsMissingDimension() { expectedException.expect(IllegalStateException.class); expectedException.expectMessage( - "forceGuaranteedRollup is incompatible with partitionsSpec: single_dim partitions unsupported" + "forceGuaranteedRollup is incompatible with partitionsSpec: partitionDimension must be specified" ); new ParallelIndexSupervisorTaskBuilder() + .ingestionSpec( + new ParallelIndexIngestionSpecBuilder() + .forceGuaranteedRollup(true) + .partitionsSpec(new SingleDimensionPartitionsSpec(1, null, null, true)) + .inputIntervals(INTERVALS) + .build() + ) + .build(); + } + + @Test + public void forceGuaranteedRollupWithSingleDimPartitionsValid() + { + ParallelIndexSupervisorTask task = new ParallelIndexSupervisorTaskBuilder() .ingestionSpec( new ParallelIndexIngestionSpecBuilder() .forceGuaranteedRollup(true) @@ -169,6 +183,9 @@ public void forceGuaranteedRollupWithSingleDimPartitionsInvalid() .build() ) .build(); + + PartitionsSpec partitionsSpec = task.getIngestionSchema().getTuningConfig().getPartitionsSpec(); + Assert.assertThat(partitionsSpec, 
CoreMatchers.instanceOf(SingleDimensionPartitionsSpec.class)); } private static class ParallelIndexSupervisorTaskBuilder diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexTestingFactory.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexTestingFactory.java index 3d6e86aa01f2..a580ab6b8deb 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexTestingFactory.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexTestingFactory.java @@ -27,6 +27,7 @@ import org.apache.druid.data.input.InputFormat; import org.apache.druid.data.input.InputSource; import org.apache.druid.data.input.impl.DimensionsSpec; +import org.apache.druid.data.input.impl.JsonInputFormat; import org.apache.druid.data.input.impl.TimestampSpec; import org.apache.druid.indexer.partitions.HashedPartitionsSpec; import org.apache.druid.indexer.partitions.PartitionsSpec; @@ -44,6 +45,7 @@ import org.apache.druid.segment.realtime.appenderator.AppenderatorsManager; import org.apache.druid.segment.transform.TransformSpec; import org.apache.druid.timeline.partition.HashBasedNumberedShardSpec; +import org.easymock.EasyMock; import org.joda.time.Duration; import org.joda.time.Interval; @@ -229,7 +231,14 @@ SingleDimensionPartitionsSpec build() static IndexTaskClientFactory createTaskClientFactory() { - return TASK_CLIENT_FACTORY; + return (taskInfoProvider, callerId, numThreads, httpTimeout, numRetries) -> createTaskClient(); + } + + private static ParallelIndexSupervisorTaskClient createTaskClient() + { + ParallelIndexSupervisorTaskClient taskClient = EasyMock.niceMock(ParallelIndexSupervisorTaskClient.class); + EasyMock.replay(taskClient); + return taskClient; } static String createRow(long timestamp, Object dimensionValue) @@ -244,4 +253,9 @@ static String createRow(long timestamp, Object 
dimensionValue) throw new RuntimeException(e); } } + + static InputFormat getInputFormat() + { + return new JsonInputFormat(null, null); + } } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java new file mode 100644 index 000000000000..86bfc2e0e8c0 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java @@ -0,0 +1,470 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Joiner; +import com.google.common.collect.Iterables; +import org.apache.druid.data.input.InputFormat; +import org.apache.druid.data.input.InputSource; +import org.apache.druid.data.input.impl.InlineInputSource; +import org.apache.druid.indexer.TaskState; +import org.apache.druid.indexer.TaskStatus; +import org.apache.druid.indexer.partitions.DynamicPartitionsSpec; +import org.apache.druid.indexer.partitions.HashedPartitionsSpec; +import org.apache.druid.indexer.partitions.PartitionsSpec; +import org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec; +import org.apache.druid.indexing.common.TaskToolbox; +import org.apache.druid.indexing.common.task.IndexTaskClientFactory; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution; +import org.apache.druid.segment.TestHelper; +import org.apache.druid.segment.indexing.DataSchema; +import org.apache.druid.testing.junit.LoggerCaptureRule; +import org.apache.logging.log4j.core.LogEvent; +import org.easymock.Capture; +import org.easymock.EasyMock; +import org.hamcrest.Matchers; +import org.joda.time.Interval; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.runners.Enclosed; +import org.junit.rules.ExpectedException; +import org.junit.runner.RunWith; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +@RunWith(Enclosed.class) +public class PartialDimensionDistributionTaskTest +{ + private static final ObjectMapper OBJECT_MAPPER = ParallelIndexTestingFactory.createObjectMapper(); + private static final SingleDimensionPartitionsSpec SINGLE_DIM_PARTITIONS_SPEC = + new 
ParallelIndexTestingFactory.SingleDimensionPartitionsSpecBuilder().build(); + + public static class ConstructorTest + { + @Rule + public ExpectedException exception = ExpectedException.none(); + + @Test + public void requiresForceGuaranteedRollup() + { + exception.expect(IllegalArgumentException.class); + exception.expectMessage("forceGuaranteedRollup must be set"); + + ParallelIndexTuningConfig tuningConfig = new ParallelIndexTestingFactory.TuningConfigBuilder() + .forceGuaranteedRollup(false) + .partitionsSpec(new DynamicPartitionsSpec(null, null)) + .build(); + + new PartialDimensionDistributionTaskBuilder() + .tuningConfig(tuningConfig) + .build(); + } + + @Test + public void requiresSingleDimensionPartitions() + { + exception.expect(IllegalArgumentException.class); + exception.expectMessage("single_dim partitionsSpec required"); + + PartitionsSpec partitionsSpec = new HashedPartitionsSpec(null, 1, null); + ParallelIndexTuningConfig tuningConfig = + new ParallelIndexTestingFactory.TuningConfigBuilder().partitionsSpec(partitionsSpec).build(); + + new PartialDimensionDistributionTaskBuilder() + .tuningConfig(tuningConfig) + .build(); + } + + @Test + public void requiresGranularitySpecInputIntervals() + { + exception.expect(IllegalArgumentException.class); + exception.expectMessage("Missing intervals in granularitySpec"); + + DataSchema dataSchema = ParallelIndexTestingFactory.createDataSchema(Collections.emptyList()); + + new PartialDimensionDistributionTaskBuilder() + .dataSchema(dataSchema) + .build(); + } + + @Test + public void serializesDeserializes() + { + PartialDimensionDistributionTask task = new PartialDimensionDistributionTaskBuilder() + .build(); + TestHelper.testSerializesDeserializes(OBJECT_MAPPER, task); + } + + @Test + public void hasCorrectPrefixForAutomaticId() + { + PartialDimensionDistributionTask task = new PartialDimensionDistributionTaskBuilder() + .id(ParallelIndexTestingFactory.AUTOMATIC_ID) + .build(); + Assert.assertThat(task.getId(), 
Matchers.startsWith(PartialDimensionDistributionTask.TYPE)); + } + } + + public static class RunTaskTest + { + private static final TaskToolbox TASK_TOOLBOX = null; + + @Rule + public ExpectedException exception = ExpectedException.none(); + + @Rule + public LoggerCaptureRule logger = new LoggerCaptureRule(PartialDimensionDistributionTask.class); + + @Test + public void requiresPartitionDimension() throws Exception + { + exception.expect(IllegalArgumentException.class); + exception.expectMessage("partitionDimension must be specified"); + + ParallelIndexTuningConfig tuningConfig = new ParallelIndexTestingFactory.TuningConfigBuilder() + .partitionsSpec( + new ParallelIndexTestingFactory.SingleDimensionPartitionsSpecBuilder().partitionDimension(null).build() + ) + .build(); + PartialDimensionDistributionTask task = new PartialDimensionDistributionTaskBuilder() + .tuningConfig(tuningConfig) + .build(); + + task.runTask(TASK_TOOLBOX); + } + + @Test + public void logsParseExceptionsIfEnabled() throws Exception + { + long invalidTimestamp = Long.MAX_VALUE; + InputSource inlineInputSource = new InlineInputSource( + ParallelIndexTestingFactory.createRow(invalidTimestamp, "a") + ); + ParallelIndexTuningConfig tuningConfig = new ParallelIndexTestingFactory.TuningConfigBuilder() + .partitionsSpec(SINGLE_DIM_PARTITIONS_SPEC) + .logParseExceptions(true) + .build(); + PartialDimensionDistributionTask task = new PartialDimensionDistributionTaskBuilder() + .inputSource(inlineInputSource) + .tuningConfig(tuningConfig) + .taskClientFactory(ParallelIndexTestingFactory.createTaskClientFactory()) + .build(); + + task.runTask(TASK_TOOLBOX); + + List logEvents = logger.getLogEvents(); + Assert.assertEquals(1, logEvents.size()); + String logMessage = logEvents.get(0).getMessage().getFormattedMessage(); + Assert.assertThat(logMessage, Matchers.containsString("Encountered parse exception")); + } + + @Test + public void doesNotLogParseExceptionsIfDisabled() throws Exception + { + 
ParallelIndexTuningConfig tuningConfig = new ParallelIndexTestingFactory.TuningConfigBuilder() + .partitionsSpec(SINGLE_DIM_PARTITIONS_SPEC) + .logParseExceptions(false) + .build(); + PartialDimensionDistributionTask task = new PartialDimensionDistributionTaskBuilder() + .tuningConfig(tuningConfig) + .taskClientFactory(ParallelIndexTestingFactory.createTaskClientFactory()) + .build(); + + task.runTask(TASK_TOOLBOX); + + Assert.assertEquals(Collections.emptyList(), logger.getLogEvents()); + } + + @Test + public void failsWhenTooManyParseExceptions() throws Exception + { + ParallelIndexTuningConfig tuningConfig = new ParallelIndexTestingFactory.TuningConfigBuilder() + .partitionsSpec(SINGLE_DIM_PARTITIONS_SPEC) + .maxParseExceptions(0) + .build(); + PartialDimensionDistributionTask task = new PartialDimensionDistributionTaskBuilder() + .tuningConfig(tuningConfig) + .taskClientFactory(ParallelIndexTestingFactory.createTaskClientFactory()) + .build(); + + exception.expect(RuntimeException.class); + exception.expectMessage("Max parse exceptions exceeded"); + + task.runTask(TASK_TOOLBOX); + } + + @Test + public void skipsRowsWithMultipleDimensionValues() + { + InputSource inlineInputSource = new InlineInputSource( + ParallelIndexTestingFactory.createRow(0, Arrays.asList("a", "b")) + ); + PartialDimensionDistributionTaskBuilder taskBuilder = new PartialDimensionDistributionTaskBuilder() + .inputSource(inlineInputSource); + + DimensionDistributionReport report = runTask(taskBuilder); + + Map intervalToDistribution = report.getIntervalToDistribution(); + Assert.assertEquals(0, intervalToDistribution.size()); + } + + @Test + public void sendsCorrectReportWhenAssumeGroupedTrue() + { + long timestamp = 0; + String dimensionValue = "a"; + InputSource inlineInputSource = new InlineInputSource( + ParallelIndexTestingFactory.createRow(timestamp, dimensionValue) + + "\n" + ParallelIndexTestingFactory.createRow(timestamp + 1, dimensionValue) + ); + ParallelIndexTuningConfig 
tuningConfig = new ParallelIndexTestingFactory.TuningConfigBuilder() + .partitionsSpec( + new ParallelIndexTestingFactory.SingleDimensionPartitionsSpecBuilder().assumeGrouped(true).build() + ) + .build(); + PartialDimensionDistributionTaskBuilder taskBuilder = new PartialDimensionDistributionTaskBuilder() + .tuningConfig(tuningConfig) + .inputSource(inlineInputSource); + + DimensionDistributionReport report = runTask(taskBuilder); + + Assert.assertEquals(ParallelIndexTestingFactory.ID, report.getTaskId()); + Map intervalToDistribution = report.getIntervalToDistribution(); + StringDistribution distribution = Iterables.getOnlyElement(intervalToDistribution.values()); + Assert.assertNotNull(distribution); + String[] partitions = distribution.getEvenPartitionsByMaxSize(1); + Assert.assertEquals(3, partitions.length); + Assert.assertEquals(dimensionValue, partitions[0]); + Assert.assertEquals(dimensionValue, partitions[1]); + Assert.assertEquals(dimensionValue, partitions[2]); + } + + @Test + public void groupsRowsWhenAssumeGroupedFalse() + { + long timestamp = 0; + String dimensionValue = "a"; + InputSource inlineInputSource = new InlineInputSource( + ParallelIndexTestingFactory.createRow(timestamp, dimensionValue) + + "\n" + ParallelIndexTestingFactory.createRow(timestamp + 1, dimensionValue) + ); + ParallelIndexTuningConfig tuningConfig = new ParallelIndexTestingFactory.TuningConfigBuilder() + .partitionsSpec( + new ParallelIndexTestingFactory.SingleDimensionPartitionsSpecBuilder().assumeGrouped(false).build() + ) + .build(); + PartialDimensionDistributionTaskBuilder taskBuilder = new PartialDimensionDistributionTaskBuilder() + .tuningConfig(tuningConfig) + .inputSource(inlineInputSource); + + DimensionDistributionReport report = runTask(taskBuilder); + + Assert.assertEquals(ParallelIndexTestingFactory.ID, report.getTaskId()); + Map intervalToDistribution = report.getIntervalToDistribution(); + StringDistribution distribution = 
Iterables.getOnlyElement(intervalToDistribution.values()); + Assert.assertNotNull(distribution); + String[] partitions = distribution.getEvenPartitionsByMaxSize(1); + Assert.assertEquals(2, partitions.length); + Assert.assertEquals(dimensionValue, partitions[0]); + Assert.assertEquals(dimensionValue, partitions[1]); + } + + @Test + public void preservesMinAndMaxWhenAssumeGroupedFalse() + { + // Create a small bloom filter so that it saturates quickly + int smallBloomFilter = 1; + double manyFalsePositiveBloomFilter = 0.5; + int minBloomFilterBits = Long.SIZE; + + long timestamp = 0; + List dimensionValues = IntStream.range(0, minBloomFilterBits * 10) + .mapToObj(i -> String.format("%010d", i)) + .collect(Collectors.toCollection(ArrayList::new)); + String minDimensionValue = dimensionValues.get(0); + String maxDimensionValue = dimensionValues.get(dimensionValues.size() - 1); + List rows = dimensionValues.stream() + .map(d -> ParallelIndexTestingFactory.createRow(timestamp, d)) + .collect(Collectors.toList()); + Joiner joiner = Joiner.on("\n"); + InputSource inlineInputSource = new InlineInputSource( + joiner.join( + joiner.join(rows.subList(1, rows.size())), // saturate bloom filter first + rows.get(0), + rows.get(rows.size() - 1) + ) + ); + ParallelIndexTuningConfig tuningConfig = new ParallelIndexTestingFactory.TuningConfigBuilder() + .partitionsSpec( + new ParallelIndexTestingFactory.SingleDimensionPartitionsSpecBuilder().assumeGrouped(false).build() + ) + .build(); + DataSchema dataSchema = ParallelIndexTestingFactory.createDataSchema(ParallelIndexTestingFactory.INPUT_INTERVALS); + PartialDimensionDistributionTaskBuilder taskBuilder = new PartialDimensionDistributionTaskBuilder() + .tuningConfig(tuningConfig) + .dataSchema(dataSchema) + .inputSource(inlineInputSource) + .ungroupedRowDimValueFilterSupplier( + () -> new PartialDimensionDistributionTask.UngroupedRowDimensionValueFilter( + dataSchema.getGranularitySpec().getQueryGranularity(), + smallBloomFilter, + 
manyFalsePositiveBloomFilter + ) + ); + + DimensionDistributionReport report = runTask(taskBuilder); + + Assert.assertEquals(ParallelIndexTestingFactory.ID, report.getTaskId()); + Map intervalToDistribution = report.getIntervalToDistribution(); + StringDistribution distribution = Iterables.getOnlyElement(intervalToDistribution.values()); + Assert.assertNotNull(distribution); + String[] partitions = distribution.getEvenPartitionsByMaxSize(1); + Assert.assertEquals(minBloomFilterBits + 3, partitions.length); // 3 = min + max + exclusive endpoint + Assert.assertEquals(minDimensionValue, partitions[0]); + Assert.assertEquals(maxDimensionValue, partitions[partitions.length - 1]); + } + + @Test + public void returnsSuccessIfNoExceptions() throws Exception + { + PartialDimensionDistributionTask task = new PartialDimensionDistributionTaskBuilder() + .taskClientFactory(ParallelIndexTestingFactory.createTaskClientFactory()) + .build(); + + TaskStatus taskStatus = task.runTask(TASK_TOOLBOX); + + Assert.assertEquals(ParallelIndexTestingFactory.ID, taskStatus.getId()); + Assert.assertEquals(TaskState.SUCCESS, taskStatus.getStatusCode()); + } + + private static DimensionDistributionReport runTask(PartialDimensionDistributionTaskBuilder taskBuilder) + { + Capture reportCapture = Capture.newInstance(); + ParallelIndexSupervisorTaskClient taskClient = EasyMock.mock(ParallelIndexSupervisorTaskClient.class); + taskClient.report(EasyMock.eq(ParallelIndexTestingFactory.SUPERVISOR_TASK_ID), EasyMock.capture(reportCapture)); + EasyMock.replay(taskClient); + + try { + taskBuilder.taskClientFactory((taskInfoProvider, callerId, numThreads, httpTimeout, numRetries) -> taskClient) + .build() + .runTask(TASK_TOOLBOX); + } + catch (Exception e) { + throw new RuntimeException(e); + } + + return (DimensionDistributionReport) reportCapture.getValue(); + } + } + + private static class PartialDimensionDistributionTaskBuilder + { + private static final InputFormat INPUT_FORMAT = 
ParallelIndexTestingFactory.getInputFormat(); + + private String id = ParallelIndexTestingFactory.ID; + private InputSource inputSource = new InlineInputSource("row-with-invalid-timestamp"); + private ParallelIndexTuningConfig tuningConfig = new ParallelIndexTestingFactory.TuningConfigBuilder() + .partitionsSpec(new ParallelIndexTestingFactory.SingleDimensionPartitionsSpecBuilder().build()) + .build(); + private DataSchema dataSchema = + ParallelIndexTestingFactory.createDataSchema(ParallelIndexTestingFactory.INPUT_INTERVALS); + private IndexTaskClientFactory taskClientFactory = + ParallelIndexTestingFactory.TASK_CLIENT_FACTORY; + private Supplier + ungroupedRowDimValueFilterSupplier = null; + + @SuppressWarnings("SameParameterValue") + PartialDimensionDistributionTaskBuilder id(String id) + { + this.id = id; + return this; + } + + PartialDimensionDistributionTaskBuilder inputSource(InputSource inputSource) + { + this.inputSource = inputSource; + return this; + } + + PartialDimensionDistributionTaskBuilder tuningConfig(ParallelIndexTuningConfig tuningConfig) + { + this.tuningConfig = tuningConfig; + return this; + } + + PartialDimensionDistributionTaskBuilder dataSchema(DataSchema dataSchema) + { + this.dataSchema = dataSchema; + return this; + } + + PartialDimensionDistributionTaskBuilder taskClientFactory( + IndexTaskClientFactory taskClientFactory + ) + { + this.taskClientFactory = taskClientFactory; + return this; + } + + PartialDimensionDistributionTaskBuilder ungroupedRowDimValueFilterSupplier( + Supplier ungroupedRowDimValueFilterSupplier + ) + { + this.ungroupedRowDimValueFilterSupplier = ungroupedRowDimValueFilterSupplier; + return this; + } + + PartialDimensionDistributionTask build() + { + ParallelIndexIngestionSpec ingestionSpec = + ParallelIndexTestingFactory.createIngestionSpec(inputSource, INPUT_FORMAT, tuningConfig, dataSchema); + + Supplier supplier = + ungroupedRowDimValueFilterSupplier == null + ? 
() -> new PartialDimensionDistributionTask.UngroupedRowDimensionValueFilter( + dataSchema.getGranularitySpec().getQueryGranularity() + ) + : ungroupedRowDimValueFilterSupplier; + + return new PartialDimensionDistributionTask( + id, + ParallelIndexTestingFactory.GROUP_ID, + ParallelIndexTestingFactory.TASK_RESOURCE, + ParallelIndexTestingFactory.SUPERVISOR_TASK_ID, + ParallelIndexTestingFactory.NUM_ATTEMPTS, + ingestionSpec, + ParallelIndexTestingFactory.CONTEXT, + ParallelIndexTestingFactory.INDEXING_SERVICE_CLIENT, + taskClientFactory, + supplier + ); + } + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeIOConfigTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeIOConfigTest.java new file mode 100644 index 000000000000..c96adb89a755 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeIOConfigTest.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.druid.segment.TestHelper; +import org.junit.Before; +import org.junit.Test; + +import java.util.Collections; + +public class PartialGenericSegmentMergeIOConfigTest +{ + private static final ObjectMapper OBJECT_MAPPER = ParallelIndexTestingFactory.createObjectMapper(); + private static final GenericPartitionLocation GENERIC_PARTITION_LOCATION = new GenericPartitionLocation( + ParallelIndexTestingFactory.HOST, + ParallelIndexTestingFactory.PORT, + ParallelIndexTestingFactory.USE_HTTPS, + ParallelIndexTestingFactory.SUBTASK_ID, + ParallelIndexTestingFactory.INTERVAL, + ParallelIndexTestingFactory.HASH_BASED_NUMBERED_SHARD_SPEC + ); + + private PartialGenericSegmentMergeIOConfig target; + + @Before + public void setup() + { + target = new PartialGenericSegmentMergeIOConfig(Collections.singletonList(GENERIC_PARTITION_LOCATION)); + } + + @Test + public void serializesDeserializes() + { + TestHelper.testSerializesDeserializes(OBJECT_MAPPER, target); + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeIngestionSpecTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeIngestionSpecTest.java new file mode 100644 index 000000000000..c30cc9ee3b29 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeIngestionSpecTest.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.common.task.batch.parallel; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.druid.indexer.partitions.HashedPartitionsSpec; +import org.apache.druid.segment.TestHelper; +import org.junit.Before; +import org.junit.Test; + +import java.util.Collections; + +public class PartialGenericSegmentMergeIngestionSpecTest +{ + private static final ObjectMapper OBJECT_MAPPER = ParallelIndexTestingFactory.createObjectMapper(); + private static final GenericPartitionLocation GENERIC_PARTITION_LOCATION = new GenericPartitionLocation( + ParallelIndexTestingFactory.HOST, + ParallelIndexTestingFactory.PORT, + ParallelIndexTestingFactory.USE_HTTPS, + ParallelIndexTestingFactory.SUBTASK_ID, + ParallelIndexTestingFactory.INTERVAL, + ParallelIndexTestingFactory.HASH_BASED_NUMBERED_SHARD_SPEC + ); + private static final PartialGenericSegmentMergeIOConfig IO_CONFIG = + new PartialGenericSegmentMergeIOConfig(Collections.singletonList(GENERIC_PARTITION_LOCATION)); + private static final HashedPartitionsSpec PARTITIONS_SPEC = new HashedPartitionsSpec( + null, + 1, + Collections.emptyList() + ); + + private PartialGenericSegmentMergeIngestionSpec target; + + @Before + public void setup() + { + target = new PartialGenericSegmentMergeIngestionSpec( + ParallelIndexTestingFactory.createDataSchema(ParallelIndexTestingFactory.INPUT_INTERVALS), + 
IO_CONFIG, + new ParallelIndexTestingFactory.TuningConfigBuilder() + .partitionsSpec(PARTITIONS_SPEC) + .build() + ); + } + + @Test + public void serializesDeserializes() + { + TestHelper.testSerializesDeserializes(OBJECT_MAPPER, target); + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeTaskTest.java new file mode 100644 index 000000000000..69403bc14414 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeTaskTest.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.druid.indexer.partitions.HashedPartitionsSpec; +import org.apache.druid.segment.TestHelper; +import org.hamcrest.Matchers; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.util.Collections; + +public class PartialGenericSegmentMergeTaskTest +{ + private static final ObjectMapper OBJECT_MAPPER = ParallelIndexTestingFactory.createObjectMapper(); + private static final GenericPartitionLocation GENERIC_PARTITION_LOCATION = new GenericPartitionLocation( + ParallelIndexTestingFactory.HOST, + ParallelIndexTestingFactory.PORT, + ParallelIndexTestingFactory.USE_HTTPS, + ParallelIndexTestingFactory.SUBTASK_ID, + ParallelIndexTestingFactory.INTERVAL, + ParallelIndexTestingFactory.HASH_BASED_NUMBERED_SHARD_SPEC + ); + private static final PartialGenericSegmentMergeIOConfig IO_CONFIG = + new PartialGenericSegmentMergeIOConfig(Collections.singletonList(GENERIC_PARTITION_LOCATION)); + private static final HashedPartitionsSpec PARTITIONS_SPEC = new HashedPartitionsSpec( + null, + 1, + Collections.emptyList() + ); + private static final PartialGenericSegmentMergeIngestionSpec INGESTION_SPEC = + new PartialGenericSegmentMergeIngestionSpec( + ParallelIndexTestingFactory.createDataSchema(ParallelIndexTestingFactory.INPUT_INTERVALS), + IO_CONFIG, + new ParallelIndexTestingFactory.TuningConfigBuilder() + .partitionsSpec(PARTITIONS_SPEC) + .build() + ); + + private PartialGenericSegmentMergeTask target; + + @Before + public void setup() + { + target = new PartialGenericSegmentMergeTask( + ParallelIndexTestingFactory.AUTOMATIC_ID, + ParallelIndexTestingFactory.GROUP_ID, + ParallelIndexTestingFactory.TASK_RESOURCE, + ParallelIndexTestingFactory.SUPERVISOR_TASK_ID, + ParallelIndexTestingFactory.NUM_ATTEMPTS, + INGESTION_SPEC, + ParallelIndexTestingFactory.CONTEXT, + 
ParallelIndexTestingFactory.INDEXING_SERVICE_CLIENT, + ParallelIndexTestingFactory.TASK_CLIENT_FACTORY, + ParallelIndexTestingFactory.SHUFFLE_CLIENT + ); + } + + @Test + public void serializesDeserializes() + { + TestHelper.testSerializesDeserializes(OBJECT_MAPPER, target); + } + + @Test + public void hasCorrectPrefixForAutomaticId() + { + String id = target.getId(); + Assert.assertThat(id, Matchers.startsWith(PartialGenericSegmentMergeTask.TYPE)); + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTaskTest.java new file mode 100644 index 000000000000..67a4919cd9df --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTaskTest.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.druid.data.input.InputFormat; +import org.apache.druid.data.input.InputSource; +import org.apache.druid.data.input.impl.InlineInputSource; +import org.apache.druid.indexer.partitions.DynamicPartitionsSpec; +import org.apache.druid.indexer.partitions.HashedPartitionsSpec; +import org.apache.druid.indexer.partitions.PartitionsSpec; +import org.apache.druid.indexing.common.task.IndexTaskClientFactory; +import org.apache.druid.segment.TestHelper; +import org.apache.druid.segment.indexing.DataSchema; +import org.hamcrest.Matchers; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import java.util.Collections; + +public class PartialRangeSegmentGenerateTaskTest +{ + private static final ObjectMapper OBJECT_MAPPER = ParallelIndexTestingFactory.createObjectMapper(); + + @Rule + public ExpectedException exception = ExpectedException.none(); + + @Test + public void requiresForceGuaranteedRollup() + { + exception.expect(IllegalArgumentException.class); + exception.expectMessage("single_dim partitionsSpec required"); + + ParallelIndexTuningConfig tuningConfig = new ParallelIndexTestingFactory.TuningConfigBuilder() + .forceGuaranteedRollup(false) + .partitionsSpec(new DynamicPartitionsSpec(null, null)) + .build(); + + new PartialRangeSegmentGenerateTaskBuilder() + .tuningConfig(tuningConfig) + .build(); + } + + @Test + public void requiresSingleDimensionPartitions() + { + exception.expect(IllegalArgumentException.class); + exception.expectMessage("single_dim partitionsSpec required"); + + PartitionsSpec partitionsSpec = new HashedPartitionsSpec(null, 1, null); + ParallelIndexTuningConfig tuningConfig = + new ParallelIndexTestingFactory.TuningConfigBuilder().partitionsSpec(partitionsSpec).build(); + + new PartialRangeSegmentGenerateTaskBuilder() + 
.tuningConfig(tuningConfig) + .build(); + } + + @Test + public void requiresGranularitySpecInputIntervals() + { + exception.expect(IllegalArgumentException.class); + exception.expectMessage("Missing intervals in granularitySpec"); + + DataSchema dataSchema = ParallelIndexTestingFactory.createDataSchema(Collections.emptyList()); + + new PartialRangeSegmentGenerateTaskBuilder() + .dataSchema(dataSchema) + .build(); + } + + @Test + public void serializesDeserializes() + { + PartialRangeSegmentGenerateTask task = new PartialRangeSegmentGenerateTaskBuilder().build(); + TestHelper.testSerializesDeserializes(OBJECT_MAPPER, task); + } + + @Test + public void hasCorrectPrefixForAutomaticId() + { + PartialRangeSegmentGenerateTask task = new PartialRangeSegmentGenerateTaskBuilder().build(); + Assert.assertThat(task.getId(), Matchers.startsWith(PartialRangeSegmentGenerateTask.TYPE)); + } + + private static class PartialRangeSegmentGenerateTaskBuilder + { + private static final InputSource INPUT_SOURCE = new InlineInputSource("data"); + private static final InputFormat INPUT_FORMAT = ParallelIndexTestingFactory.getInputFormat(); + + private final IndexTaskClientFactory taskClientFactory = + ParallelIndexTestingFactory.TASK_CLIENT_FACTORY; + + private ParallelIndexTuningConfig tuningConfig = new ParallelIndexTestingFactory.TuningConfigBuilder() + .partitionsSpec(new ParallelIndexTestingFactory.SingleDimensionPartitionsSpecBuilder().build()) + .build(); + private DataSchema dataSchema = + ParallelIndexTestingFactory.createDataSchema(ParallelIndexTestingFactory.INPUT_INTERVALS); + + PartialRangeSegmentGenerateTaskBuilder tuningConfig(ParallelIndexTuningConfig tuningConfig) + { + this.tuningConfig = tuningConfig; + return this; + } + + PartialRangeSegmentGenerateTaskBuilder dataSchema(DataSchema dataSchema) + { + this.dataSchema = dataSchema; + return this; + } + + PartialRangeSegmentGenerateTask build() + { + ParallelIndexIngestionSpec ingestionSpec = + 
ParallelIndexTestingFactory.createIngestionSpec(INPUT_SOURCE, INPUT_FORMAT, tuningConfig, dataSchema); + + return new PartialRangeSegmentGenerateTask( + ParallelIndexTestingFactory.AUTOMATIC_ID, + ParallelIndexTestingFactory.GROUP_ID, + ParallelIndexTestingFactory.TASK_RESOURCE, + ParallelIndexTestingFactory.SUPERVISOR_TASK_ID, + ParallelIndexTestingFactory.NUM_ATTEMPTS, + ingestionSpec, + ParallelIndexTestingFactory.CONTEXT, + Collections.emptyMap(), + ParallelIndexTestingFactory.INDEXING_SERVICE_CLIENT, + taskClientFactory, + ParallelIndexTestingFactory.APPENDERATORS_MANAGER + ); + } + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java new file mode 100644 index 000000000000..26814d2c3040 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java @@ -0,0 +1,472 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel; + +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.HashMultimap; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Multimap; +import com.google.common.collect.SetMultimap; +import org.apache.druid.client.indexing.IndexingServiceClient; +import org.apache.druid.data.input.InputSplit; +import org.apache.druid.data.input.impl.CSVParseSpec; +import org.apache.druid.data.input.impl.DimensionsSpec; +import org.apache.druid.data.input.impl.ParseSpec; +import org.apache.druid.data.input.impl.TimestampSpec; +import org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec; +import org.apache.druid.indexing.common.LockGranularity; +import org.apache.druid.indexing.common.TaskToolbox; +import org.apache.druid.indexing.common.task.IndexTaskClientFactory; +import org.apache.druid.indexing.common.task.TaskResource; +import org.apache.druid.indexing.common.task.TestAppenderatorsManager; +import org.apache.druid.java.util.common.ISE; +import org.apache.druid.java.util.common.Intervals; +import org.apache.druid.java.util.common.StringUtils; +import org.apache.druid.java.util.common.guava.Comparators; +import org.apache.druid.query.scan.ScanResultValue; +import org.apache.druid.timeline.DataSegment; +import org.apache.druid.timeline.partition.SingleDimensionShardSpec; +import org.hamcrest.Matchers; +import org.joda.time.Interval; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import javax.annotation.Nullable; +import java.io.File; +import java.io.IOException; +import java.io.Writer; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.LinkedHashMap; 
+import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; +import java.util.stream.Collectors; + +@RunWith(Parameterized.class) +public class RangePartitionMultiPhaseParallelIndexingTest extends AbstractMultiPhaseParallelIndexingTest +{ + private static final int NUM_FILE = 10; + private static final int NUM_ROW = 20; + private static final int NUM_DAY = 2; + private static final int NUM_PARTITION = 2; + private static final int YEAR = 2017; + private static final String DIM1 = "dim1"; + private static final String DIM2 = "dim2"; + private static final List DIMS = ImmutableList.of(DIM1, DIM2); + private static final String TEST_FILE_NAME_PREFIX = "test_"; + private static final ParseSpec PARSE_SPEC = new CSVParseSpec( + new TimestampSpec( + "ts", + "auto", + null + ), + new DimensionsSpec( + DimensionsSpec.getDefaultSchemas(Arrays.asList("ts", DIM1, DIM2)), + new ArrayList<>(), + new ArrayList<>() + ), + null, + Arrays.asList("ts", DIM1, DIM2, "val"), + false, + 0 + ); + + @Parameterized.Parameters(name = "{0}, useInputFormatApi={1}") + public static Iterable constructorFeeder() + { + return ImmutableList.of( + new Object[]{LockGranularity.TIME_CHUNK, false}, + new Object[]{LockGranularity.TIME_CHUNK, true}, + new Object[]{LockGranularity.SEGMENT, true} + ); + } + + private File inputDir; + private SetMultimap intervalToDim1; + + public RangePartitionMultiPhaseParallelIndexingTest(LockGranularity lockGranularity, boolean useInputFormatApi) + { + super(lockGranularity, useInputFormatApi); + } + + @Override + @Before + public void setup() throws IOException + { + super.setup(); + inputDir = temporaryFolder.newFolder("data"); + intervalToDim1 = createInputFiles(inputDir); + } + + private static SetMultimap createInputFiles(File inputDir) throws IOException + { + SetMultimap intervalToDim1 = HashMultimap.create(); + + for (int fileIndex = 0; fileIndex < NUM_FILE; fileIndex++) { + Path path = new 
File(inputDir, TEST_FILE_NAME_PREFIX + fileIndex).toPath(); + try (final Writer writer = Files.newBufferedWriter(path, StandardCharsets.UTF_8)) { + for (int i = 0; i < (NUM_ROW / NUM_DAY); i++) { + for (int d = 0; d < NUM_DAY; d++) { + writeRow(writer, i + d, fileIndex + d, intervalToDim1); + } + } + } + } + + return intervalToDim1; + } + + private static void writeRow(Writer writer, int day, int fileIndex, Multimap intervalToDim1) + throws IOException + { + Interval interval = Intervals.of("%s-12-%d/%s-12-%d", YEAR, day + 1, YEAR, day + 2); + String startDate = interval.getStart().toString("y-M-d"); + String dim1Value = String.valueOf(fileIndex + 10); + writer.write(StringUtils.format("%s,%s,%d th test file\n", startDate, dim1Value, fileIndex)); + intervalToDim1.put(interval, dim1Value); + } + + @Test + public void createsCorrectRangePartitions() throws Exception + { + int targetRowsPerSegment = NUM_ROW / NUM_DAY / NUM_PARTITION; + final Set publishedSegments = runTestTask( + PARSE_SPEC, + Intervals.of("%s/%s", YEAR, YEAR + 1), + inputDir, + TEST_FILE_NAME_PREFIX + "*", + new SingleDimensionPartitionsSpec( + targetRowsPerSegment, + null, + DIM1, + false + ) + ); + assertRangePartitions(publishedSegments); + } + + private void assertRangePartitions(Set publishedSegments) throws IOException + { + Multimap intervalToSegments = ArrayListMultimap.create(); + publishedSegments.forEach(s -> intervalToSegments.put(s.getInterval(), s)); + + SortedSet publishedIntervals = new TreeSet<>(Comparators.intervalsByStartThenEnd()); + publishedIntervals.addAll(intervalToSegments.keySet()); + assertHasExpectedIntervals(publishedIntervals); + + Interval firstInterval = publishedIntervals.first(); + Interval lastInterval = publishedIntervals.last(); + File tempSegmentDir = temporaryFolder.newFolder(); + + intervalToSegments.asMap().forEach((interval, segments) -> { + assertNumPartition(interval, segments, firstInterval, lastInterval); + + List allValues = new ArrayList<>(NUM_ROW); + 
for (DataSegment segment : segments) { + List values = getColumnValues(segment, tempSegmentDir); + assertValuesInRange(values, segment); + allValues.addAll(values); + } + + assertIntervalHasAllExpectedValues(interval, allValues); + }); + } + + private void assertHasExpectedIntervals(Set publishedSegmentIntervals) + { + Assert.assertEquals(intervalToDim1.keySet(), publishedSegmentIntervals); + } + + private static void assertNumPartition( + Interval interval, + Collection segments, + Interval firstInterval, + Interval lastInterval + ) + { + int expectedNumPartition = NUM_PARTITION; + if (interval.equals(firstInterval) || interval.equals(lastInterval)) { + expectedNumPartition -= 1; + } + expectedNumPartition *= NUM_DAY; + Assert.assertEquals(expectedNumPartition, segments.size()); + } + + private List getColumnValues(DataSegment segment, File tempDir) + { + List results = querySegment(segment, DIMS, tempDir); + Assert.assertEquals(1, results.size()); + List> rows = (List>) results.get(0).getEvents(); + return rows.stream() + .map(row -> row.get(DIM1)) + .collect(Collectors.toList()); + } + + private static void assertValuesInRange(List values, DataSegment segment) + { + SingleDimensionShardSpec shardSpec = (SingleDimensionShardSpec) segment.getShardSpec(); + String start = shardSpec.getStart(); + Assert.assertNotNull(start); + String end = shardSpec.getEnd(); + + for (String value : values) { + Assert.assertThat(value.compareTo(start), Matchers.greaterThanOrEqualTo(0)); + + if (end != null) { + Assert.assertThat(value.compareTo(end), Matchers.lessThan(0)); + } + } + } + + private void assertIntervalHasAllExpectedValues(Interval interval, List actualValues) + { + List expectedValues = new ArrayList<>(intervalToDim1.get(interval)); + Assert.assertEquals(expectedValues.size(), actualValues.size()); + Collections.sort(expectedValues); + Collections.sort(actualValues); + Assert.assertEquals(expectedValues, actualValues); + } + + @Override + ParallelIndexSupervisorTask 
createParallelIndexSupervisorTask( + String id, + TaskResource taskResource, + ParallelIndexIngestionSpec ingestionSchema, + Map context, + IndexingServiceClient indexingServiceClient + ) + { + return new TestSupervisorTask(id, taskResource, ingestionSchema, context, indexingServiceClient); + } + + private static class TestSupervisorTask extends TestParallelIndexSupervisorTask + { + TestSupervisorTask( + String id, + TaskResource taskResource, + ParallelIndexIngestionSpec ingestionSchema, + Map context, + IndexingServiceClient indexingServiceClient + ) + { + super(id, taskResource, ingestionSchema, context, indexingServiceClient); + } + + @Override + PartialDimensionDistributionParallelIndexTaskRunner createPartialDimensionDistributionRunner(TaskToolbox toolbox) + { + return new TestPartialDimensionDistributionRunner(toolbox, this, getIndexingServiceClient()); + } + + @Override + PartialRangeSegmentGenerateParallelIndexTaskRunner createPartialRangeSegmentGenerateRunner( + TaskToolbox toolbox, + Map intervalToPartitions + ) + { + return new TestPartialRangeSegmentGenerateRunner( + toolbox, + this, + getIndexingServiceClient(), + intervalToPartitions + ); + } + + @Override + public PartialGenericSegmentMergeParallelIndexTaskRunner createPartialGenericSegmentMergeRunner( + TaskToolbox toolbox, + List ioConfigs + ) + { + return new TestPartialGenericSegmentMergeParallelIndexTaskRunner( + toolbox, + this, + ioConfigs, + getIndexingServiceClient() + ); + } + } + + private static class TestPartialDimensionDistributionRunner + extends PartialDimensionDistributionParallelIndexTaskRunner + { + private TestPartialDimensionDistributionRunner( + TaskToolbox toolbox, + ParallelIndexSupervisorTask supervisorTask, + IndexingServiceClient indexingServiceClient + ) + { + super( + toolbox, + supervisorTask.getId(), + supervisorTask.getGroupId(), + supervisorTask.getIngestionSchema(), + supervisorTask.getContext(), + indexingServiceClient, + new 
LocalParallelIndexTaskClientFactory(supervisorTask) + ); + } + } + + private static class TestPartialRangeSegmentGenerateRunner extends PartialRangeSegmentGenerateParallelIndexTaskRunner + { + private TestPartialRangeSegmentGenerateRunner( + TaskToolbox toolbox, + ParallelIndexSupervisorTask supervisorTask, + IndexingServiceClient indexingServiceClient, + Map intervalToPartitions + ) + { + super( + toolbox, + supervisorTask.getId(), + supervisorTask.getGroupId(), + supervisorTask.getIngestionSchema(), + supervisorTask.getContext(), + indexingServiceClient, + intervalToPartitions, + new LocalParallelIndexTaskClientFactory(supervisorTask), + new TestAppenderatorsManager() + ); + } + } + + + private static class TestPartialGenericSegmentMergeParallelIndexTaskRunner + extends PartialGenericSegmentMergeParallelIndexTaskRunner + { + private final ParallelIndexSupervisorTask supervisorTask; + + private TestPartialGenericSegmentMergeParallelIndexTaskRunner( + TaskToolbox toolbox, + ParallelIndexSupervisorTask supervisorTask, + List mergeIOConfigs, + IndexingServiceClient indexingServiceClient + ) + { + super( + toolbox, + supervisorTask.getId(), + supervisorTask.getGroupId(), + supervisorTask.getIngestionSchema().getDataSchema(), + mergeIOConfigs, + supervisorTask.getIngestionSchema().getTuningConfig(), + supervisorTask.getContext(), + indexingServiceClient + ); + this.supervisorTask = supervisorTask; + } + + @Override + SubTaskSpec newTaskSpec(PartialGenericSegmentMergeIOConfig ioConfig) + { + final PartialGenericSegmentMergeIngestionSpec ingestionSpec = + new PartialGenericSegmentMergeIngestionSpec( + supervisorTask.getIngestionSchema().getDataSchema(), + ioConfig, + getTuningConfig() + ); + return new SubTaskSpec( + getTaskId() + "_" + getAndIncrementNextSpecId(), + getGroupId(), + getTaskId(), + getContext(), + new InputSplit<>(ioConfig.getPartitionLocations()) + ) + { + @Override + public PartialGenericSegmentMergeTask newSubTask(int numAttempts) + { + return new 
TestPartialGenericSegmentMergeTask( + null, + getGroupId(), + null, + getSupervisorTaskId(), + numAttempts, + ingestionSpec, + getContext(), + getIndexingServiceClient(), + new LocalParallelIndexTaskClientFactory(supervisorTask), + getToolbox() + ); + } + }; + } + } + + private static class TestPartialGenericSegmentMergeTask extends PartialGenericSegmentMergeTask + { + private final TaskToolbox toolbox; + + private TestPartialGenericSegmentMergeTask( + @Nullable String id, + String groupId, + TaskResource taskResource, + String supervisorTaskId, + int numAttempts, + PartialGenericSegmentMergeIngestionSpec ingestionSchema, + Map context, + IndexingServiceClient indexingServiceClient, + IndexTaskClientFactory taskClientFactory, + TaskToolbox toolbox + ) + { + super( + id, + groupId, + taskResource, + supervisorTaskId, + numAttempts, + ingestionSchema, + context, + indexingServiceClient, + taskClientFactory, + null + ); + this.toolbox = toolbox; + } + + @Override + File fetchSegmentFile(File partitionDir, GenericPartitionLocation location) + { + final File zippedFile = toolbox.getIntermediaryDataManager().findPartitionFile( + getSupervisorTaskId(), + location.getSubTaskId(), + location.getInterval(), + location.getPartitionId() + ); + if (zippedFile == null) { + throw new ISE("Can't find segment file for location[%s]", location); + } + return zippedFile; + } + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMergerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMergerTest.java new file mode 100644 index 000000000000..5a39b585a849 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMergerTest.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.common.task.batch.parallel.distribution; + +import org.easymock.EasyMock; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +public class StringSketchMergerTest +{ + private StringSketchMerger target; + + @Rule + public ExpectedException exception = ExpectedException.none(); + + @Before + public void setup() + { + target = new StringSketchMerger(); + } + + @Test + public void requiresStringSketch() + { + StringDistribution distribution = EasyMock.mock(StringDistribution.class); + + exception.expect(IllegalArgumentException.class); + exception.expectMessage("Only merging StringSketch instances is currently supported"); + + target.merge(distribution); + } + + @Test + public void mergesCorrectly() + { + String string1 = "a"; + StringSketch sketch1 = new StringSketch(); + sketch1.put(string1); + + String string2 = "mn"; + StringSketch sketch2 = new StringSketch(); + sketch2.put(string2); + + String string3 = "z"; + StringSketch sketch3 = new StringSketch(); + sketch3.put(string3); + + target.merge(sketch2); + target.merge(sketch1); + target.merge(sketch3); + StringDistribution merged = target.getResult(); + + String[] partitions = 
merged.getEvenPartitionsByMaxSize(1); + Assert.assertEquals(4, partitions.length); + Assert.assertEquals(string1, partitions[0]); // min + Assert.assertEquals(string2, partitions[1]); // median + Assert.assertEquals(string3, partitions[2]); // max + Assert.assertEquals(string3, partitions[3]); // max + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java new file mode 100644 index 000000000000..0a0559e6049d --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java @@ -0,0 +1,379 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel.distribution; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.yahoo.sketches.quantiles.ItemsSketch; +import org.apache.druid.jackson.JacksonModule; +import org.apache.druid.segment.TestHelper; +import org.hamcrest.Matchers; +import org.hamcrest.number.IsCloseTo; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.runners.Enclosed; +import org.junit.rules.ExpectedException; +import org.junit.runner.RunWith; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.StringJoiner; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +@RunWith(Enclosed.class) +public class StringSketchTest +{ + private static final int FACTOR = 2; + private static final int NUM_STRING = StringSketch.SKETCH_K * FACTOR; + private static final double DELTA = ItemsSketch.getNormalizedRankError(StringSketch.SKETCH_K, true) * NUM_STRING; + private static final List STRINGS = IntStream.range(0, NUM_STRING) + .mapToObj(i -> String.format("%010d", i)) + .collect(Collectors.toCollection(ArrayList::new)); + private static final String MIN_STRING = STRINGS.get(0); + private static final String MAX_STRING = STRINGS.get(NUM_STRING - 1); + + static { + ItemsSketch.rand.setSeed(0); // make sketches deterministic for testing + } + + public static class SerializationDeserializationTest + { + private static final ObjectMapper OBJECT_MAPPER = new JacksonModule().smileMapper(); + + @Test + public void serializesDeserializes() + { + StringSketch target = new StringSketch(); + target.put(MIN_STRING); + target.put(MAX_STRING); + TestHelper.testSerializesDeserializes(OBJECT_MAPPER, target); + } + } + + public static class PutTest + { + private StringSketch target; + + @Before + public void setup() + { + target = new StringSketch(); + } + + @Test + public void putIfNewMin() + { + String 
value = MAX_STRING; + Assert.assertEquals(0, getCount()); + + target.putIfNewMin(value); + Assert.assertEquals(1, getCount()); + + target.putIfNewMin(value); + Assert.assertEquals(1, getCount()); + Assert.assertEquals(value, target.getDelegate().getMinValue()); + Assert.assertEquals(value, target.getDelegate().getMaxValue()); + + target.putIfNewMin(MIN_STRING); + Assert.assertEquals(2, getCount()); + Assert.assertEquals(MIN_STRING, target.getDelegate().getMinValue()); + Assert.assertEquals(MAX_STRING, target.getDelegate().getMaxValue()); + } + + @Test + public void putIfNewMax() + { + String value = MIN_STRING; + Assert.assertEquals(0, getCount()); + + target.putIfNewMax(value); + Assert.assertEquals(1, getCount()); + + target.putIfNewMax(value); + Assert.assertEquals(1, getCount()); + Assert.assertEquals(value, target.getDelegate().getMinValue()); + Assert.assertEquals(value, target.getDelegate().getMaxValue()); + + target.putIfNewMax(MAX_STRING); + Assert.assertEquals(2, getCount()); + Assert.assertEquals(MIN_STRING, target.getDelegate().getMinValue()); + Assert.assertEquals(MAX_STRING, target.getDelegate().getMaxValue()); + } + + private long getCount() + { + return target.getDelegate().getN(); + } + } + + @RunWith(Enclosed.class) + public static class PartitionTest + { + private static final StringSketch SKETCH; + + static { + SKETCH = new StringSketch(); + STRINGS.forEach(SKETCH::put); + } + + public static class TargetSizeTest + { + @Rule + public ExpectedException exception = ExpectedException.none(); + + @Test + public void requiresPositiveSize() + { + exception.expect(IllegalArgumentException.class); + exception.expectMessage("targetSize must be positive but is 0"); + + SKETCH.getEvenPartitionsByTargetSize(0); + } + + @Test + public void handlesEmptySketch() + { + StringSketch sketch = new StringSketch(); + String[] partitions = sketch.getEvenPartitionsByTargetSize(1); + Assert.assertEquals(0, partitions.length); + } + + @Test + public void 
handlesSingletonSketch() + { + String value = MIN_STRING; + StringSketch sketch = new StringSketch(); + sketch.put(value); + String[] partitions = sketch.getEvenPartitionsByTargetSize(1); + Assert.assertEquals(2, partitions.length); + Assert.assertEquals(value, partitions[0]); + Assert.assertEquals(value, partitions[1]); + } + + @Test + public void handlesMinimumSize() + { + String[] partitions = SKETCH.getEvenPartitionsByTargetSize(1); + assertMaxNumberOfPartitions(partitions); + } + + @Test + public void handlesUnevenPartitions() + { + List targetSizes = Arrays.asList(127, 257, 509, 1021, 2039, 4093); + targetSizes.forEach(TargetSizeTest::testHandlesUnevenPartitions); + } + + private static void testHandlesUnevenPartitions(int targetSize) + { + String[] partitions = SKETCH.getEvenPartitionsByTargetSize(targetSize); + + assertFirstAndLastPartitionsCorrect(partitions); + + String partitionsString = PartitionTest.toString(partitions); + int expectedHighPartitionCount = (int) Math.ceil((double) NUM_STRING / targetSize); + int expectedLowPartitionCount = expectedHighPartitionCount - 1; + Assert.assertThat( + "targetSize=" + targetSize + " " + partitionsString, + partitions.length, + Matchers.lessThanOrEqualTo(expectedHighPartitionCount + 1) + ); + Assert.assertThat( + "targetSize=" + targetSize + " " + partitionsString, + partitions.length, + Matchers.greaterThanOrEqualTo(expectedLowPartitionCount + 1) + ); + + int previous = 0; + for (int i = 1; i < partitions.length; i++) { + int current = Integer.parseInt(partitions[i]); + int size = current - previous; + Assert.assertThat( + getErrMsgPrefix(targetSize, i) + partitionsString, + (double) size, + IsCloseTo.closeTo(targetSize, Math.ceil(DELTA) * 2) + ); + previous = current; + } + } + + @Test + public void handlesSinglePartition() + { + String[] partitions = SKETCH.getEvenPartitionsByTargetSize(NUM_STRING); + assertSinglePartition(partitions); + } + + @Test + public void handlesOversizedPartition() + { + String[] 
partitions = SKETCH.getEvenPartitionsByTargetSize(Integer.MAX_VALUE); + assertSinglePartition(partitions); + } + } + + public static class MaxSizeTest + { + @Rule + public ExpectedException exception = ExpectedException.none(); + + @Test + public void requiresPositiveSize() + { + exception.expect(IllegalArgumentException.class); + exception.expectMessage("maxSize must be positive but is 0"); + + SKETCH.getEvenPartitionsByMaxSize(0); + } + + @Test + public void handlesEmptySketch() + { + StringSketch sketch = new StringSketch(); + String[] partitions = sketch.getEvenPartitionsByMaxSize(1); + Assert.assertEquals(0, partitions.length); + } + + @Test + public void handlesSingletonSketch() + { + String value = MIN_STRING; + StringSketch sketch = new StringSketch(); + sketch.put(value); + String[] partitions = sketch.getEvenPartitionsByMaxSize(1); + Assert.assertEquals(2, partitions.length); + Assert.assertEquals(value, partitions[0]); + Assert.assertEquals(value, partitions[1]); + } + + @Test + public void handlesMinimimumSize() + { + String[] partitions = SKETCH.getEvenPartitionsByMaxSize(1); + assertMaxNumberOfPartitions(partitions); + } + + @Test + public void handlesUnevenPartitions() + { + List maxSizes = Arrays.asList(509, 1021, 2039, 4093); + maxSizes.forEach(MaxSizeTest::testHandlesUnevenPartitions); + } + + private static void testHandlesUnevenPartitions(int maxSize) + { + String[] partitions = SKETCH.getEvenPartitionsByMaxSize(maxSize); + + assertFirstAndLastPartitionsCorrect(partitions); + + String partitionsString = PartitionTest.toString(partitions); + long expectedPartitionCount = (long) Math.ceil((double) NUM_STRING / maxSize); + Assert.assertEquals( + "maxSize=" + maxSize + " " + partitionsString, + expectedPartitionCount + 1, + partitions.length + ); + + double minSize = (double) NUM_STRING / expectedPartitionCount - DELTA; + + int previous = 0; + for (int i = 1; i < partitions.length; i++) { + int current = Integer.parseInt(partitions[i]); + int size = 
current - previous; + Assert.assertThat( + getErrMsgPrefix(maxSize, i) + partitionsString, + size, + Matchers.lessThanOrEqualTo(maxSize) + ); + Assert.assertThat( + getErrMsgPrefix(maxSize, i) + partitionsString, + (double) size, + Matchers.greaterThanOrEqualTo(minSize) + ); + previous = current; + } + } + + @Test + public void handlesSinglePartition() + { + String[] partitions = SKETCH.getEvenPartitionsByMaxSize((int) Math.ceil(NUM_STRING + DELTA)); + assertSinglePartition(partitions); + } + + @Test + public void handlesOversizedPartition() + { + String[] partitions = SKETCH.getEvenPartitionsByMaxSize(Integer.MAX_VALUE); + assertSinglePartition(partitions); + } + } + + private static void assertMaxNumberOfPartitions(String[] partitions) + { + String partitionsString = toString(partitions); + + Assert.assertEquals(partitionsString, NUM_STRING + 1, partitions.length); + assertFirstAndLastPartitionsCorrect(partitions); + + int previous = 0; + for (int i = 1; i < partitions.length; i++) { + int current = Integer.parseInt(partitions[i]); + Assert.assertEquals( + getErrMsgPrefix(1, i) + partitionsString, + 1, + current - previous, + FACTOR + ); + previous = current; + } + } + + private static void assertSinglePartition(String[] partitions) + { + Assert.assertEquals(2, partitions.length); + assertFirstAndLastPartitionsCorrect(partitions); + } + + private static void assertFirstAndLastPartitionsCorrect(String[] partitions) + { + Assert.assertEquals(MIN_STRING, partitions[0]); + Assert.assertEquals(MAX_STRING, partitions[partitions.length - 1]); + } + + private static String getErrMsgPrefix(int size, int i) + { + return "size=" + size + " i=" + i + " of "; + } + + private static String toString(String[] partitions) + { + String prefix = "partitions[" + partitions.length + "]="; + StringJoiner sj = new StringJoiner(" ", prefix, "]"); + for (int i = 0; i < partitions.length; i++) { + sj.add("[" + i + "]=" + partitions[i]); + } + return sj.toString(); + } + } +} diff --git 
a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleFactoryTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleFactoryTest.java new file mode 100644 index 000000000000..4d0b0795b822 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleFactoryTest.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel.distribution; + +import org.apache.druid.java.util.common.DateTimes; +import org.apache.druid.java.util.common.granularity.Granularities; +import org.apache.druid.java.util.common.granularity.Granularity; +import org.joda.time.DateTime; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TimeDimTupleFactoryTest +{ + private static final Granularity GRANULARITY = Granularities.SECOND; + private static final DateTime TIMESTAMP = DateTimes.utc(0); + private static final String DIMENSION_VALUE = "abc"; + + private TimeDimTupleFactory target; + + @Before + public void setup() + { + target = new TimeDimTupleFactory(GRANULARITY); + } + + @Test + public void adjustsTimestamps() + { + TimeDimTuple timeDimTuple = target.createWithBucketedTimestamp(TIMESTAMP, DIMENSION_VALUE); + Assert.assertEquals(TIMESTAMP.getMillis(), timeDimTuple.getTimestamp()); + + TimeDimTuple timeDimTuple_plus_1msec = target.createWithBucketedTimestamp(TIMESTAMP.plus(1), DIMENSION_VALUE); + Assert.assertEquals(TIMESTAMP.getMillis(), timeDimTuple_plus_1msec.getTimestamp()); + + TimeDimTuple timeDimTuple_plus_999msec = target.createWithBucketedTimestamp(TIMESTAMP.plus(999), DIMENSION_VALUE); + Assert.assertEquals(TIMESTAMP.getMillis(), timeDimTuple_plus_999msec.getTimestamp()); + + TimeDimTuple timeDimTuple_plus_1sec = target.createWithBucketedTimestamp(TIMESTAMP.plus(1000), DIMENSION_VALUE); + Assert.assertEquals(TIMESTAMP.getMillis() + 1000, timeDimTuple_plus_1sec.getTimestamp()); + } + + @Test + public void setsDimensionValue() + { + TimeDimTuple timeDimTuple = target.createWithBucketedTimestamp(TIMESTAMP, DIMENSION_VALUE); + Assert.assertEquals(DIMENSION_VALUE, timeDimTuple.getDimensionValue()); + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleFunnelTest.java 
b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleFunnelTest.java new file mode 100644 index 000000000000..87e9f46d2a6e --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleFunnelTest.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel.distribution; + +import com.google.common.hash.BloomFilter; +import org.junit.Assert; +import org.junit.Test; + +public class TimeDimTupleFunnelTest +{ + @Test + public void worksWithBloomFilter() + { + TimeDimTuple tuple = new TimeDimTuple(1000, "a"); + BloomFilter bloomFilter = BloomFilter.create(TimeDimTupleFunnel.INSTANCE, 10); + Assert.assertFalse(bloomFilter.mightContain(tuple)); + bloomFilter.put(tuple); + Assert.assertTrue(bloomFilter.mightContain(tuple)); + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleTest.java new file mode 100644 index 000000000000..0570a030e330 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/TimeDimTupleTest.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel.distribution; + +import org.hamcrest.Matchers; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TimeDimTupleTest +{ + private static final long TIMESTAMP = 1000; + private static final String DIMENSION1 = "a"; + private static final String DIMENSION2 = "m"; + private static final String DIMENSION3 = "z"; + + private TimeDimTuple target; + + @Before + public void setup() + { + target = new TimeDimTuple(TIMESTAMP, DIMENSION2); + } + + @Test + public void comparesCorrectlyToSmallerTimestamp() + { + Assert.assertThat(target.compareTo(new TimeDimTuple(TIMESTAMP - 1, DIMENSION2)), Matchers.greaterThan(0)); + } + + @Test + public void comparesCorrectlyToSmallerDimension() + { + Assert.assertThat(target.compareTo(new TimeDimTuple(TIMESTAMP, DIMENSION1)), Matchers.greaterThan(0)); + } + + @Test + public void comparesCorrectlyToEqual() + { + Assert.assertEquals(0, target.compareTo(new TimeDimTuple(TIMESTAMP, DIMENSION2))); + } + + @Test + public void comparesCorrectlyToBiggerTimestamp() + { + Assert.assertThat(target.compareTo(new TimeDimTuple(TIMESTAMP + 1, DIMENSION2)), Matchers.lessThan(0)); + } + + @Test + public void comparesCorrectlyToBiggerDimension() + { + Assert.assertThat(target.compareTo(new TimeDimTuple(TIMESTAMP, DIMENSION3)), Matchers.lessThan(0)); + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/IndexTaskInputRowIteratorBuilderTestingFactory.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/IndexTaskInputRowIteratorBuilderTestingFactory.java index 754742fe3780..628a5b008e3c 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/IndexTaskInputRowIteratorBuilderTestingFactory.java +++ 
b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/IndexTaskInputRowIteratorBuilderTestingFactory.java @@ -75,6 +75,7 @@ public boolean hasNext() return true; } + @SuppressWarnings("IteratorNextCanNotThrowNoSuchElementException") @Override public InputRow next() { diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionTaskInputRowIteratorBuilderTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionTaskInputRowIteratorBuilderTest.java new file mode 100644 index 000000000000..6093d0d0eb46 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionTaskInputRowIteratorBuilderTest.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel.iterator; + +import org.apache.druid.data.input.InputRow; +import org.apache.druid.java.util.common.parsers.CloseableIterator; +import org.apache.druid.segment.indexing.granularity.GranularitySpec; +import org.joda.time.DateTime; +import org.junit.Assert; +import org.junit.Test; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class RangePartitionTaskInputRowIteratorBuilderTest +{ + private static final IndexTaskInputRowIteratorBuilderTestingFactory.HandlerTester HANDLER_TESTER = + IndexTaskInputRowIteratorBuilderTestingFactory.createHandlerTester(() -> new RangePartitionIndexTaskInputRowIteratorBuilder(IndexTaskInputRowIteratorBuilderTestingFactory.DIMENSION)); + private static final InputRow NO_NEXT_INPUT_ROW = null; + + @Test + public void invokesDimensionValueCountFilterLast() + { + DateTime timestamp = IndexTaskInputRowIteratorBuilderTestingFactory.TIMESTAMP; + List multipleDimensionValues = Arrays.asList("multiple", "dimension", "values"); + InputRow inputRow = IndexTaskInputRowIteratorBuilderTestingFactory.createInputRow(timestamp, multipleDimensionValues); + CloseableIterator inputRowIterator = IndexTaskInputRowIteratorBuilderTestingFactory.createInputRowIterator(inputRow); + GranularitySpec granularitySpec = IndexTaskInputRowIteratorBuilderTestingFactory.createGranularitySpec(timestamp, IndexTaskInputRowIteratorBuilderTestingFactory.PRESENT_BUCKET_INTERVAL_OPT); + + List handlerInvocationHistory = HANDLER_TESTER.invokeHandlers( + inputRowIterator, + granularitySpec, + NO_NEXT_INPUT_ROW + ); + + Assert.assertEquals(Collections.emptyList(), handlerInvocationHistory); + } + + @Test + public void doesNotInvokeHandlersIfRowValid() + { + DateTime timestamp = IndexTaskInputRowIteratorBuilderTestingFactory.TIMESTAMP; + List singleDimensionValue = Collections.singletonList("single-dimension-value"); + InputRow inputRow = 
IndexTaskInputRowIteratorBuilderTestingFactory.createInputRow(timestamp, singleDimensionValue); + CloseableIterator inputRowIterator = IndexTaskInputRowIteratorBuilderTestingFactory.createInputRowIterator(inputRow); + GranularitySpec granularitySpec = IndexTaskInputRowIteratorBuilderTestingFactory.createGranularitySpec(timestamp, IndexTaskInputRowIteratorBuilderTestingFactory.PRESENT_BUCKET_INTERVAL_OPT); + + List handlerInvocationHistory = HANDLER_TESTER.invokeHandlers( + inputRowIterator, + granularitySpec, + inputRow + ); + + Assert.assertEquals(Collections.emptyList(), handlerInvocationHistory); + } +} diff --git a/integration-tests/src/test/java/org/apache/druid/tests/indexer/AbstractITBatchIndexTest.java b/integration-tests/src/test/java/org/apache/druid/tests/indexer/AbstractITBatchIndexTest.java index 2a7e0f5956f2..af1b2a40084e 100644 --- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/AbstractITBatchIndexTest.java +++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/AbstractITBatchIndexTest.java @@ -21,8 +21,11 @@ import com.google.inject.Inject; import org.apache.commons.io.IOUtils; +import org.apache.druid.indexing.common.task.batch.parallel.PartialDimensionDistributionTask; +import org.apache.druid.indexing.common.task.batch.parallel.PartialGenericSegmentMergeTask; import org.apache.druid.indexing.common.task.batch.parallel.PartialHashSegmentGenerateTask; import org.apache.druid.indexing.common.task.batch.parallel.PartialHashSegmentMergeTask; +import org.apache.druid.indexing.common.task.batch.parallel.PartialRangeSegmentGenerateTask; import org.apache.druid.indexing.common.task.batch.parallel.SinglePhaseSubTask; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.Intervals; @@ -260,7 +263,10 @@ private long countCompleteSubTasks(final String dataSource, final boolean perfec return t.getType().equals(SinglePhaseSubTask.TYPE); } else { return 
t.getType().equalsIgnoreCase(PartialHashSegmentGenerateTask.TYPE) - || t.getType().equalsIgnoreCase(PartialHashSegmentMergeTask.TYPE); + || t.getType().equalsIgnoreCase(PartialHashSegmentMergeTask.TYPE) + || t.getType().equalsIgnoreCase(PartialDimensionDistributionTask.TYPE) + || t.getType().equalsIgnoreCase(PartialRangeSegmentGenerateTask.TYPE) + || t.getType().equalsIgnoreCase(PartialGenericSegmentMergeTask.TYPE); } }) .count(); diff --git a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITParallelIndexTest.java b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITParallelIndexTest.java index be99de14933f..58c0270d98c9 100644 --- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITParallelIndexTest.java +++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITParallelIndexTest.java @@ -23,6 +23,7 @@ import org.apache.druid.indexer.partitions.DynamicPartitionsSpec; import org.apache.druid.indexer.partitions.HashedPartitionsSpec; import org.apache.druid.indexer.partitions.PartitionsSpec; +import org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.testing.guice.DruidTestModuleFactory; import org.apache.druid.tests.TestNGGroup; @@ -50,7 +51,8 @@ public static Object[][] resources() { return new Object[][]{ {new DynamicPartitionsSpec(null, null)}, - {new HashedPartitionsSpec(null, 2, null)} + {new HashedPartitionsSpec(null, 2, null)}, + {new SingleDimensionPartitionsSpec(2, null, "namespace", false)}, }; } From 1b72540d72a3b53ba3f8044504d2522265354117 Mon Sep 17 00:00:00 2001 From: Chi Cao Minh Date: Fri, 22 Nov 2019 08:36:54 -0800 Subject: [PATCH 02/17] Fix dependencies & forbidden apis --- extensions-core/datasketches/pom.xml | 5 ----- indexing-service/pom.xml | 13 +++++++++++++ .../PartialDimensionDistributionTaskTest.java | 3 ++- .../parallel/distribution/StringSketchTest.java | 3 ++- pom.xml | 5 +++++ 5 files 
changed, 22 insertions(+), 7 deletions(-) diff --git a/extensions-core/datasketches/pom.xml b/extensions-core/datasketches/pom.xml index cf2a3e1c8d38..97e2da9c5603 100644 --- a/extensions-core/datasketches/pom.xml +++ b/extensions-core/datasketches/pom.xml @@ -34,10 +34,6 @@ ../../pom.xml - - 0.12.2 - - com.yahoo.datasketches @@ -52,7 +48,6 @@ com.yahoo.datasketches memory - ${datasketches.memory.version} org.apache.calcite diff --git a/indexing-service/pom.xml b/indexing-service/pom.xml index c55e2de46827..63c0e99e727c 100644 --- a/indexing-service/pom.xml +++ b/indexing-service/pom.xml @@ -199,11 +199,24 @@ it.unimi.dsi fastutil + + org.apache.logging.log4j + log4j-core + + + org.apache.logging.log4j + log4j-api + com.yahoo.datasketches sketches-core provided + + com.yahoo.datasketches + memory + provided + diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java index 86bfc2e0e8c0..6e44472a4dc0 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java @@ -34,6 +34,7 @@ import org.apache.druid.indexing.common.TaskToolbox; import org.apache.druid.indexing.common.task.IndexTaskClientFactory; import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.segment.TestHelper; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.testing.junit.LoggerCaptureRule; @@ -306,7 +307,7 @@ public void preservesMinAndMaxWhenAssumeGroupedFalse() long timestamp = 0; List dimensionValues = IntStream.range(0, minBloomFilterBits * 10) - 
.mapToObj(i -> String.format("%010d", i)) + .mapToObj(i -> StringUtils.format("%010d", i)) .collect(Collectors.toCollection(ArrayList::new)); String minDimensionValue = dimensionValues.get(0); String maxDimensionValue = dimensionValues.get(dimensionValues.size() - 1); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java index 0a0559e6049d..c5d84d231636 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java @@ -22,6 +22,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.yahoo.sketches.quantiles.ItemsSketch; import org.apache.druid.jackson.JacksonModule; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.segment.TestHelper; import org.hamcrest.Matchers; import org.hamcrest.number.IsCloseTo; @@ -47,7 +48,7 @@ public class StringSketchTest private static final int NUM_STRING = StringSketch.SKETCH_K * FACTOR; private static final double DELTA = ItemsSketch.getNormalizedRankError(StringSketch.SKETCH_K, true) * NUM_STRING; private static final List STRINGS = IntStream.range(0, NUM_STRING) - .mapToObj(i -> String.format("%010d", i)) + .mapToObj(i -> StringUtils.format("%010d", i)) .collect(Collectors.toCollection(ArrayList::new)); private static final String MIN_STRING = STRINGS.get(0); private static final String MAX_STRING = STRINGS.get(NUM_STRING - 1); diff --git a/pom.xml b/pom.xml index 5ee169b44a8e..d1d38f5a74d6 100644 --- a/pom.xml +++ b/pom.xml @@ -987,6 +987,11 @@ sketches-core 0.13.4 + + com.yahoo.datasketches + memory + 0.12.2 + org.apache.calcite From a2f4877933836cf67710a7717efab60a30f3d323 Mon Sep 17 00:00:00 2001 From: 
Chi Cao Minh Date: Fri, 22 Nov 2019 17:32:52 -0800 Subject: [PATCH 03/17] Fixes for integration test --- .../task/CachingLocalSegmentAllocator.java | 18 +++++++++++------- ...hPartitionCachingLocalSegmentAllocator.java | 2 ++ .../druid/indexing/common/task/IndexTask.java | 2 +- ...ePartitionCachingLocalSegmentAllocator.java | 2 ++ .../parallel/ParallelIndexSupervisorTask.java | 4 ++-- .../PartialHashSegmentGenerateTask.java | 1 + .../PartialRangeSegmentGenerateTask.java | 1 + ...titionCachingLocalSegmentAllocatorTest.java | 2 ++ ...titionCachingLocalSegmentAllocatorTest.java | 2 ++ 9 files changed, 24 insertions(+), 10 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CachingLocalSegmentAllocator.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CachingLocalSegmentAllocator.java index fbb9081aafa3..279786472e41 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CachingLocalSegmentAllocator.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CachingLocalSegmentAllocator.java @@ -23,6 +23,7 @@ import org.apache.druid.indexing.common.TaskLock; import org.apache.druid.indexing.common.TaskToolbox; import org.apache.druid.indexing.common.actions.LockListAction; +import org.apache.druid.indexing.common.actions.SurrogateAction; import org.apache.druid.indexing.common.task.IndexTask.ShardSpecs; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.StringUtils; @@ -56,6 +57,7 @@ interface IntervalToSegmentIdsCreator { /** * @param versionFinder Returns the version for the specified interval + * * @return Information for segment preallocation */ Map> create(Function versionFinder); @@ -64,19 +66,21 @@ interface IntervalToSegmentIdsCreator CachingLocalSegmentAllocator( TaskToolbox toolbox, String taskId, + String supervisorTaskId, IntervalToSegmentIdsCreator intervalToSegmentIdsCreator ) throws IOException { 
this.taskId = taskId; this.sequenceNameToSegmentId = new HashMap<>(); - final Map intervalToVersion = toolbox.getTaskActionClient() - .submit(new LockListAction()) - .stream() - .collect(Collectors.toMap( - TaskLock::getInterval, - TaskLock::getVersion - )); + final Map intervalToVersion = + toolbox.getTaskActionClient() + .submit(new SurrogateAction<>(supervisorTaskId, new LockListAction())) + .stream() + .collect(Collectors.toMap( + TaskLock::getInterval, + TaskLock::getVersion + )); Function versionFinder = interval -> findVersion(intervalToVersion, interval); final Map> intervalToIds = intervalToSegmentIdsCreator.create(versionFinder); diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/HashPartitionCachingLocalSegmentAllocator.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/HashPartitionCachingLocalSegmentAllocator.java index 9640ed461358..fa54a76295ca 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/HashPartitionCachingLocalSegmentAllocator.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/HashPartitionCachingLocalSegmentAllocator.java @@ -51,6 +51,7 @@ public class HashPartitionCachingLocalSegmentAllocator implements IndexTaskSegme public HashPartitionCachingLocalSegmentAllocator( TaskToolbox toolbox, String taskId, + String supervisorTaskId, String dataSource, Map> allocateSpec ) throws IOException @@ -62,6 +63,7 @@ public HashPartitionCachingLocalSegmentAllocator( this.delegate = new CachingLocalSegmentAllocator( toolbox, taskId, + supervisorTaskId, this::getIntervalToSegmentIds ); } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/IndexTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/IndexTask.java index 094a713cde16..4fa9fa095139 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/IndexTask.java +++ 
b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/IndexTask.java @@ -828,7 +828,7 @@ private IndexTaskSegmentAllocator createSegmentAllocator( // We use the timeChunk lock and don't have to ask the overlord to create segmentIds. // Instead, a local allocator is used. if (isGuaranteedRollup(ingestionSchema.ioConfig, ingestionSchema.tuningConfig)) { - return new HashPartitionCachingLocalSegmentAllocator(toolbox, getId(), getDataSource(), allocateSpec); + return new HashPartitionCachingLocalSegmentAllocator(toolbox, getId(), getId(), getDataSource(), allocateSpec); } else { return new LocalSegmentAllocator(toolbox, getId(), getDataSource(), dataSchema.getGranularitySpec()); } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java index 15c9b56c60d5..d8b8ff25493e 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java @@ -52,6 +52,7 @@ public class RangePartitionCachingLocalSegmentAllocator implements IndexTaskSegm public RangePartitionCachingLocalSegmentAllocator( TaskToolbox toolbox, String taskId, + String supervisorTaskId, String dataSource, String partitionDimension, Map intervalsToPartitions @@ -64,6 +65,7 @@ public RangePartitionCachingLocalSegmentAllocator( this.delegate = new CachingLocalSegmentAllocator( toolbox, taskId, + supervisorTaskId, this::getIntervalToSegmentIds ); } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java index d2e94a675e9b..e63d2b9cd67a 
100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java @@ -633,8 +633,8 @@ private static void assertDataSketchesAvailable() //noinspection ResultOfObjectAllocationIgnored new StringSketch(); } - catch (Exception e) { - throw new ISE(e, "DataSketches is unvailable. Try loading the druid-datasketches extension."); + catch (Throwable t) { + throw new ISE(t, "DataSketches is unvailable. Try adding the druid-datasketches extension to the classpath."); } } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentGenerateTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentGenerateTask.java index d7f886207719..7b6f70b0efd6 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentGenerateTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentGenerateTask.java @@ -130,6 +130,7 @@ IndexTaskSegmentAllocator createSegmentAllocator(TaskToolbox toolbox) throws IOE return new HashPartitionCachingLocalSegmentAllocator( toolbox, getId(), + supervisorTaskId, getDataSource(), createShardSpecs() ); diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java index 8956fbee2195..5b8e67d7266d 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java @@ -153,6 
+153,7 @@ IndexTaskSegmentAllocator createSegmentAllocator(TaskToolbox toolbox) throws IOE return new RangePartitionCachingLocalSegmentAllocator( toolbox, getId(), + supervisorTaskId, getDataSource(), getPartitionDimension(ingestionSchema), intervalToPartitions diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java index 86cb36403c25..a3f4e771abf0 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java @@ -50,6 +50,7 @@ public class RangePartitionCachingLocalSegmentAllocatorTest { private static final String DATASOURCE = "datasource"; private static final String TASKID = "taskid"; + private static final String SUPERVISOR_TASKID = "supervisor-taskid"; private static final String PARTITION_DIMENSION = "dimension"; private static final Interval INTERVAL_EMPTY = Intervals.utc(0, 1000); private static final Interval INTERVAL_SINGLETON = Intervals.utc(1000, 2000); @@ -97,6 +98,7 @@ public void setup() throws IOException target = new RangePartitionCachingLocalSegmentAllocator( toolbox, TASKID, + SUPERVISOR_TASKID, DATASOURCE, PARTITION_DIMENSION, INTERVAL_TO_PARTITONS diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/HashPartitionCachingLocalSegmentAllocatorTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/HashPartitionCachingLocalSegmentAllocatorTest.java index 5b60bdf7a610..e82101d7386a 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/HashPartitionCachingLocalSegmentAllocatorTest.java +++ 
b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/HashPartitionCachingLocalSegmentAllocatorTest.java @@ -53,6 +53,7 @@ public class HashPartitionCachingLocalSegmentAllocatorTest private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final String DATASOURCE = "datasource"; private static final String TASKID = "taskid"; + private static final String SUPERVISOR_TASKID = "supervisor-taskid"; private static final Interval INTERVAL = Intervals.utc(0, 1000); private static final String VERSION = "version"; private static final String DIMENSION = "dim"; @@ -76,6 +77,7 @@ public void setup() throws IOException target = new HashPartitionCachingLocalSegmentAllocator( toolbox, TASKID, + SUPERVISOR_TASKID, DATASOURCE, ALLOCATE_SPEC ); From 6211c4150f1cc58cc8450c867f087a2fb258f203 Mon Sep 17 00:00:00 2001 From: Chi Cao Minh Date: Sun, 1 Dec 2019 22:08:03 -0800 Subject: [PATCH 04/17] Address review comments --- .../partition/SingleDimensionShardSpec.java | 23 ++++++ docs/ingestion/native-batch.md | 6 +- indexing-service/pom.xml | 1 + ...> CachingLocalSegmentAllocatorHelper.java} | 4 +- ...PartitionCachingLocalSegmentAllocator.java | 4 +- ...PartitionCachingLocalSegmentAllocator.java | 46 ++++------- ...=> GeneratedPartitionsMetadataReport.java} | 15 ++-- .../parallel/GeneratedPartitionsReport.java | 2 +- .../parallel/ParallelIndexSupervisorTask.java | 58 +++++++------- .../PartialDimensionDistributionTask.java | 49 ++++++++---- ...icSegmentMergeParallelIndexTaskRunner.java | 11 ++- .../PartialGenericSegmentMergeTask.java | 28 +++++-- .../parallel/PartialHashSegmentMergeTask.java | 4 +- ...egmentGenerateParallelIndexTaskRunner.java | 9 ++- .../PartialRangeSegmentGenerateTask.java | 30 +++---- .../parallel/PartialSegmentMergeTask.java | 2 +- ...titionStat.java => PartitionMetadata.java} | 13 +-- .../task/batch/parallel/SubTaskReport.java | 2 +- .../parallel/distribution/Partitions.java | 44 ++++++++++ 
.../distribution/StringDistribution.java | 10 +-- .../parallel/distribution/StringSketch.java | 12 +-- .../common/task/IngestionTestBase.java | 8 +- ...itionCachingLocalSegmentAllocatorTest.java | 51 ++++++++---- .../PartialDimensionDistributionTaskTest.java | 58 +++++++++----- ...atTest.java => PartitionMetadataTest.java} | 6 +- ...rtitionMultiPhaseParallelIndexingTest.java | 6 +- .../parallel/distribution/PartitionsTest.java | 60 ++++++++++++++ .../distribution/StringSketchMergerTest.java | 12 +-- .../distribution/StringSketchTest.java | 80 +++++++++---------- 29 files changed, 417 insertions(+), 237 deletions(-) rename indexing-service/src/main/java/org/apache/druid/indexing/common/task/{CachingLocalSegmentAllocator.java => CachingLocalSegmentAllocatorHelper.java} (97%) rename indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/{GeneratedGenericPartitionsReport.java => GeneratedPartitionsMetadataReport.java} (65%) rename indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/{GenericPartitionStat.java => PartitionMetadata.java} (82%) create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/Partitions.java rename indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/{GenericPartitionStatTest.java => PartitionMetadataTest.java} (94%) create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionsTest.java diff --git a/core/src/main/java/org/apache/druid/timeline/partition/SingleDimensionShardSpec.java b/core/src/main/java/org/apache/druid/timeline/partition/SingleDimensionShardSpec.java index 968a1d74cc98..9db390c462fe 100644 --- a/core/src/main/java/org/apache/druid/timeline/partition/SingleDimensionShardSpec.java +++ b/core/src/main/java/org/apache/druid/timeline/partition/SingleDimensionShardSpec.java @@ -31,6 +31,7 @@ import javax.annotation.Nullable; 
import java.util.List; import java.util.Map; +import java.util.Objects; /** * {@link ShardSpec} for range partitioning based on a single dimension @@ -184,4 +185,26 @@ public String toString() ", partitionNum=" + partitionNum + '}'; } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + SingleDimensionShardSpec that = (SingleDimensionShardSpec) o; + return partitionNum == that.partitionNum && + Objects.equals(dimension, that.dimension) && + Objects.equals(start, that.start) && + Objects.equals(end, that.end); + } + + @Override + public int hashCode() + { + return Objects.hash(dimension, start, end, partitionNum); + } } diff --git a/docs/ingestion/native-batch.md b/docs/ingestion/native-batch.md index e11986ed7a49..e9d1d4082c07 100644 --- a/docs/ingestion/native-batch.md +++ b/docs/ingestion/native-batch.md @@ -246,8 +246,8 @@ You should use different partitionsSpec depending on the [rollup mode](../ingest For perfect rollup, you should use either `hashed` (partitioning based on the hash of dimensions in each row) or `single_dim` (based on ranges of a single dimension. For best-effort rollup, you should use `dynamic`. -Hashed partitioning is recommended in most cases, as it will improve indexing performance and create more uniformly -sized data segments relative to single-dimension or dynamic partitioning. +For perfect rollup, `ashed partitioning is recommended in most cases, as it will improve indexing +performance and create more uniformly sized data segments relative to single-dimension partitioning. #### Hash-based partitioning @@ -266,9 +266,9 @@ sized data segments relative to single-dimension or dynamic partitioning. |property|description|default|required?| |--------|-----------|-------|---------| |type|This should always be `single_dim`|none|yes| +|partitionDimension|The dimension to partition on. 
Only rows with a single dimension value will be included.|none|yes| |targetRowsPerSegment|Target number of rows to include in a partition, should be a number that targets segments of 500MB\~1GB.|none|either this or `maxRowsPerSegment`| |maxRowsPerSegment|Maximum number of rows to include in a partition. Defaults to 50% larger than the `targetRowsPerSegment`.|none|either this or `targetRowsPerSegment`| -|partitionDimension|The dimension to partition on.|none|yes| |assumeGrouped|Assume that input data has already been grouped on time and dimensions. Ingestion will run faster, but may choose sub-optimal partitions if this assumption is violated.|false|no| #### Dynamic partitioning diff --git a/indexing-service/pom.xml b/indexing-service/pom.xml index 63c0e99e727c..9b3b0089cba0 100644 --- a/indexing-service/pom.xml +++ b/indexing-service/pom.xml @@ -208,6 +208,7 @@ log4j-api + com.yahoo.datasketches sketches-core provided diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CachingLocalSegmentAllocator.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CachingLocalSegmentAllocatorHelper.java similarity index 97% rename from indexing-service/src/main/java/org/apache/druid/indexing/common/task/CachingLocalSegmentAllocator.java rename to indexing-service/src/main/java/org/apache/druid/indexing/common/task/CachingLocalSegmentAllocatorHelper.java index 279786472e41..1963fb4c2fdc 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CachingLocalSegmentAllocator.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CachingLocalSegmentAllocatorHelper.java @@ -46,7 +46,7 @@ * @see HashPartitionCachingLocalSegmentAllocator * @see RangePartitionCachingLocalSegmentAllocator */ -class CachingLocalSegmentAllocator implements IndexTaskSegmentAllocator +class CachingLocalSegmentAllocatorHelper implements IndexTaskSegmentAllocator { private final String taskId; private final Map 
sequenceNameToSegmentId; @@ -63,7 +63,7 @@ interface IntervalToSegmentIdsCreator Map> create(Function versionFinder); } - CachingLocalSegmentAllocator( + CachingLocalSegmentAllocatorHelper( TaskToolbox toolbox, String taskId, String supervisorTaskId, diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/HashPartitionCachingLocalSegmentAllocator.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/HashPartitionCachingLocalSegmentAllocator.java index fa54a76295ca..1c1736930603 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/HashPartitionCachingLocalSegmentAllocator.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/HashPartitionCachingLocalSegmentAllocator.java @@ -39,7 +39,7 @@ /** * Allocates all necessary hash-partitioned segments locally at the beginning and reuses them. * - * @see CachingLocalSegmentAllocator + * @see CachingLocalSegmentAllocatorHelper */ public class HashPartitionCachingLocalSegmentAllocator implements IndexTaskSegmentAllocator { @@ -60,7 +60,7 @@ public HashPartitionCachingLocalSegmentAllocator( this.dataSource = dataSource; this.allocateSpec = allocateSpec; - this.delegate = new CachingLocalSegmentAllocator( + this.delegate = new CachingLocalSegmentAllocatorHelper( toolbox, taskId, supervisorTaskId, diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java index d8b8ff25493e..3ef5bd28328c 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java @@ -22,6 +22,7 @@ import com.google.common.collect.Maps; import org.apache.druid.data.input.InputRow; import 
org.apache.druid.indexing.common.TaskToolbox; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.Partitions; import org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec; import org.apache.druid.timeline.partition.SingleDimensionShardSpec; import org.joda.time.Interval; @@ -29,7 +30,6 @@ import javax.annotation.Nullable; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; @@ -40,13 +40,13 @@ /** * Allocates all necessary range-partitioned segments locally at the beginning and reuses them. * - * @see CachingLocalSegmentAllocator + * @see CachingLocalSegmentAllocatorHelper */ public class RangePartitionCachingLocalSegmentAllocator implements IndexTaskSegmentAllocator { private final String dataSource; private final String partitionDimension; - private final Map intervalsToPartitions; + private final Map intervalsToPartitions; private final IndexTaskSegmentAllocator delegate; public RangePartitionCachingLocalSegmentAllocator( @@ -55,14 +55,14 @@ public RangePartitionCachingLocalSegmentAllocator( String supervisorTaskId, String dataSource, String partitionDimension, - Map intervalsToPartitions + Map intervalsToPartitions ) throws IOException { this.dataSource = dataSource; this.partitionDimension = partitionDimension; this.intervalsToPartitions = intervalsToPartitions; - this.delegate = new CachingLocalSegmentAllocator( + this.delegate = new CachingLocalSegmentAllocatorHelper( toolbox, taskId, supervisorTaskId, @@ -86,36 +86,23 @@ private Map> getIntervalToSegmentIds(Func return intervalToSegmentIds; } + /** + * Translate {@link org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution} partititions + * into the corresponding {@link org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec} with segment id. 
+ */ private List translatePartitions( Interval interval, - String[] partitions, + Partitions partitions, Function versionFinder ) { - if (partitions.length == 0) { + if (partitions.isEmpty()) { return Collections.emptyList(); } - String[] uniquePartitions = Arrays.stream(partitions).distinct().toArray(String[]::new); + String[] uniquePartitions = partitions.stream().distinct().toArray(String[]::new); int numUniquePartition = uniquePartitions.length; - if (numUniquePartition == 1) { - return Collections.singletonList( - createLastSegmentIdWithShardSpec( - interval, - versionFinder.apply(interval), - uniquePartitions[0], - 0 - ) - ); - } - - if (isLastPartitionOnlyMaxValue(partitions)) { - // The last partition only contains the max value. A shard that just contains the max value is likely to be - // small, so combine it with the second to last one. - numUniquePartition -= 1; - } - List segmentIds = IntStream.range(0, numUniquePartition - 1) .mapToObj(i -> createSegmentIdWithShardSpec( @@ -138,13 +125,6 @@ private List translatePartitions( return segmentIds; } - private boolean isLastPartitionOnlyMaxValue(String[] partitions) - { - String lastPartition = partitions[partitions.length - 1]; - String secondToLastPartition = partitions[partitions.length - 2]; - return !lastPartition.equals(secondToLastPartition); - } - private SegmentIdWithShardSpec createLastSegmentIdWithShardSpec( Interval interval, String version, @@ -163,6 +143,8 @@ private SegmentIdWithShardSpec createSegmentIdWithShardSpec( int partitionNum ) { + // The shardSpec created here will be reused in PartialGenericSegmentMergeTask. This is ok because + // all PartialSegmentGenerateTasks create the same set of segmentIds (and thus shardSpecs). 
return new SegmentIdWithShardSpec( dataSource, interval, diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedGenericPartitionsReport.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedPartitionsMetadataReport.java similarity index 65% rename from indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedGenericPartitionsReport.java rename to indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedPartitionsMetadataReport.java index 0f6570505003..9b50f9f7f37d 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedGenericPartitionsReport.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedPartitionsMetadataReport.java @@ -25,20 +25,19 @@ import java.util.List; /** - * Report containing the {@link GenericPartitionStat}s created by a {@link PartialSegmentGenerateTask}. - * This report is collected by {@link ParallelIndexSupervisorTask} and - * used to generate {@link PartialGenericSegmentMergeIOConfig}. + * Report containing the {@link PartitionMetadata}s created by a {@link PartialSegmentGenerateTask}. This report is + * collected by {@link ParallelIndexSupervisorTask} and used to generate {@link PartialGenericSegmentMergeIOConfig}. 
*/ -class GeneratedGenericPartitionsReport extends GeneratedPartitionsReport implements SubTaskReport +class GeneratedPartitionsMetadataReport extends GeneratedPartitionsReport implements SubTaskReport { - public static final String TYPE = "generated_generic_partitions"; + public static final String TYPE = "generated_partitions_metadata"; @JsonCreator - GeneratedGenericPartitionsReport( + GeneratedPartitionsMetadataReport( @JsonProperty("taskId") String taskId, - @JsonProperty("partitionStats") List partitionStats + @JsonProperty("partitionStats") List partitionMetadata ) { - super(taskId, partitionStats); + super(taskId, partitionMetadata); } } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedPartitionsReport.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedPartitionsReport.java index 23449dcefeee..bfe8cef79c3f 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedPartitionsReport.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedPartitionsReport.java @@ -27,7 +27,7 @@ /** * Report containing the {@link PartitionStat}s created by a {@link PartialSegmentGenerateTask}. * This report is collected by {@link ParallelIndexSupervisorTask} and - * used to generate {@link PartialHashSegmentMergeIOConfig}. + * used to generate {@link PartialSegmentMergeIOConfig}. 
*/ abstract class GeneratedPartitionsReport implements SubTaskReport { diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java index e63d2b9cd67a..84133a857157 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java @@ -36,7 +36,6 @@ import org.apache.druid.data.input.impl.InputRowParser; import org.apache.druid.indexer.TaskState; import org.apache.druid.indexer.TaskStatus; -import org.apache.druid.indexer.partitions.HashedPartitionsSpec; import org.apache.druid.indexer.partitions.PartitionsSpec; import org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec; import org.apache.druid.indexing.appenderator.ActionBasedUsedSegmentChecker; @@ -59,6 +58,7 @@ import org.apache.druid.indexing.common.task.TaskResource; import org.apache.druid.indexing.common.task.Tasks; import org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexTaskRunner.SubTaskSpecStatus; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.Partitions; import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution; import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistributionMerger; import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch; @@ -323,7 +323,7 @@ PartialDimensionDistributionParallelIndexTaskRunner createPartialDimensionDistri @VisibleForTesting PartialRangeSegmentGenerateParallelIndexTaskRunner createPartialRangeSegmentGenerateRunner( TaskToolbox toolbox, - Map intervalToPartitions + Map intervalToPartitions ) { return new PartialRangeSegmentGenerateParallelIndexTaskRunner( @@ 
-379,6 +379,22 @@ public boolean isReady(TaskActionClient taskActionClient) throws Exception return determineLockGranularityAndTryLock(taskActionClient, ingestionSchema.getDataSchema().getGranularitySpec()); } + private boolean useRangePartitions() + { + return (ingestionSchema.getTuningConfig().getGivenOrDefaultPartitionsSpec() instanceof SingleDimensionPartitionsSpec); + } + + private static void assertDataSketchesAvailable() + { + try { + //noinspection ResultOfObjectAllocationIgnored + new StringSketch(); + } + catch (NoClassDefFoundError e) { + throw new ISE(e, "DataSketches is unvailable. Try adding the druid-datasketches extension to the classpath."); + } + } + @Override public List findSegmentsToLock(TaskActionClient taskActionClient, List intervals) throws IOException @@ -528,14 +544,9 @@ private TaskStatus runSinglePhaseParallel(TaskToolbox toolbox) throws Exception */ private TaskStatus runMultiPhaseParallel(TaskToolbox toolbox) throws Exception { - return useHashPartitions() - ? runHashPartitionMultiPhaseParallel(toolbox) - : runRangePartitionMultiPhaseParallel(toolbox); - } - - private boolean useHashPartitions() - { - return (ingestionSchema.getTuningConfig().getGivenOrDefaultPartitionsSpec() instanceof HashedPartitionsSpec); + return useRangePartitions() + ? 
runRangePartitionMultiPhaseParallel(toolbox) + : runHashPartitionMultiPhaseParallel(toolbox); } private TaskStatus runHashPartitionMultiPhaseParallel(TaskToolbox toolbox) throws Exception @@ -576,8 +587,6 @@ private TaskStatus runHashPartitionMultiPhaseParallel(TaskToolbox toolbox) throw private TaskStatus runRangePartitionMultiPhaseParallel(TaskToolbox toolbox) throws Exception { - assertDataSketchesAvailable(); - ParallelIndexTaskRunner distributionRunner = createRunner( toolbox, @@ -589,7 +598,7 @@ private TaskStatus runRangePartitionMultiPhaseParallel(TaskToolbox toolbox) thro return TaskStatus.failure(getId()); } - Map intervalToPartitions = + Map intervalToPartitions = determineAllRangePartitions(distributionRunner.getReports().values()); if (intervalToPartitions.isEmpty()) { @@ -599,7 +608,7 @@ private TaskStatus runRangePartitionMultiPhaseParallel(TaskToolbox toolbox) thro return TaskStatus.success(getId(), msg); } - ParallelIndexTaskRunner> indexingRunner = + ParallelIndexTaskRunner> indexingRunner = createRunner(toolbox, tb -> createPartialRangeSegmentGenerateRunner(tb, intervalToPartitions)); TaskState indexingState = runNextPhase(indexingRunner); @@ -627,18 +636,7 @@ private TaskStatus runRangePartitionMultiPhaseParallel(TaskToolbox toolbox) thro return TaskStatus.fromCode(getId(), mergeState); } - private static void assertDataSketchesAvailable() - { - try { - //noinspection ResultOfObjectAllocationIgnored - new StringSketch(); - } - catch (Throwable t) { - throw new ISE(t, "DataSketches is unvailable. 
Try adding the druid-datasketches extension to the classpath."); - } - } - - private Map determineAllRangePartitions(Collection reports) + private Map determineAllRangePartitions(Collection reports) { Multimap intervalToDistributions = ArrayListMultimap.create(); reports.forEach(report -> { @@ -649,7 +647,7 @@ private Map determineAllRangePartitions(Collection distributions) + private Partitions determineRangePartition(Collection distributions) { StringDistributionMerger distributionMerger = new StringSketchMerger(); distributions.forEach(distributionMerger::merge); @@ -658,7 +656,7 @@ private String[] determineRangePartition(Collection distribu SingleDimensionPartitionsSpec partitionsSpec = (SingleDimensionPartitionsSpec) ingestionSchema.getTuningConfig().getGivenOrDefaultPartitionsSpec(); - final String[] partitions; + final Partitions partitions; Integer targetRowsPerSegment = partitionsSpec.getTargetRowsPerSegment(); if (targetRowsPerSegment == null) { partitions = mergedDistribution.getEvenPartitionsByMaxSize(partitionsSpec.getMaxRowsPerSegment()); @@ -688,10 +686,10 @@ private static Map, List> groupHa } private static Map, List> groupGenericPartitionLocationsPerPartition( - Map> subTaskIdToReport + Map> subTaskIdToReport ) { - BiFunction createPartitionLocationFunction = + BiFunction createPartitionLocationFunction = (subtaskId, partitionStat) -> new GenericPartitionLocation( partitionStat.getTaskExecutorHost(), diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java index 508b5c8615ed..a50239362181 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java 
@@ -24,6 +24,7 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; +import com.google.common.collect.Iterables; import com.google.common.hash.BloomFilter; import org.apache.druid.client.indexing.IndexingServiceClient; import org.apache.druid.data.input.HandlingInputRowIterator; @@ -202,7 +203,9 @@ public TaskStatus runTask(TaskToolbox toolbox) throws Exception List metricsNames = Arrays.stream(dataSchema.getAggregators()) .map(AggregatorFactory::getName) .collect(Collectors.toList()); - InputFormat inputFormat = ParallelIndexSupervisorTask.getInputFormat(ingestionSchema); + InputFormat inputFormat = inputSource.needsFormat() + ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) + : null; InputSourceReader inputSourceReader = dataSchema.getTransformSpec().decorate( inputSource.reader( new InputRowSchema( @@ -211,7 +214,7 @@ public TaskStatus runTask(TaskToolbox toolbox) throws Exception metricsNames ), inputFormat, - null + toolbox.getIndexingTmpDir() ) ); @@ -244,16 +247,16 @@ private Map determineDistribution( String partitionDimension, boolean isAssumeGrouped, boolean isLogParseExceptions, - long maxParseExceptions + int maxParseExceptions ) { Map intervalToDistribution = new HashMap<>(); DimensionValueFilter dimValueFilter = - isAssumeGrouped + isAssumeGrouped && granularitySpec.isRollup() ? 
new GroupedRowDimensionValueFilter() : ungroupedRowDimValueFilterSupplier.get(); - long numParseExceptions = 0; + int numParseExceptions = 0; while (inputRowIterator.hasNext()) { try { @@ -272,7 +275,7 @@ private Map determineDistribution( String dimensionValue = dimValueFilter.accept( interval, timestamp, - inputRow.getDimension(partitionDimension).get(0) + Iterables.getOnlyElement(inputRow.getDimension(partitionDimension)) ); if (dimensionValue != null) { @@ -319,7 +322,7 @@ private interface DimensionValueFilter * @return Dimension value if it should be accepted, else null */ @Nullable - String accept(Interval interval, DateTime timestamp, String dimesionValue); + String accept(Interval interval, DateTime timestamp, String dimensionValue); /** * @return Minimum dimension value for each interval processed so far. @@ -332,6 +335,10 @@ private interface DimensionValueFilter Map getIntervalToMaxDimensionValue(); } + /** + * Filters out reoccurrences of rows that have timestamps with the same query granularity and dimension value. + * Approximate matching is used, so there is a small probability that rows that are not reoccurences are discarded. 
+ */ @VisibleForTesting static class UngroupedRowDimensionValueFilter implements DimensionValueFilter { @@ -419,18 +426,30 @@ public String accept(Interval interval, DateTime timestamp, String dimensionValu private void updateMinDimensionValue(Interval interval, String dimensionValue) { - String minDimensionValue = intervalToMinDimensionValue.get(interval); - if (minDimensionValue == null || dimensionValue.compareTo(minDimensionValue) < 0) { - intervalToMinDimensionValue.put(interval, dimensionValue); - } + intervalToMinDimensionValue.compute( + interval, + (intervalKey, currentMinValue) -> { + if (currentMinValue == null || dimensionValue.compareTo(currentMinValue) < 0) { + return dimensionValue; + } else { + return currentMinValue; + } + } + ); } private void updateMaxDimensionValue(Interval interval, String dimensionValue) { - String maxDimensionValue = intervalToMaxDimensionValue.get(interval); - if (maxDimensionValue == null || dimensionValue.compareTo(maxDimensionValue) > 0) { - intervalToMaxDimensionValue.put(interval, dimensionValue); - } + intervalToMaxDimensionValue.compute( + interval, + (intervalKey, currentMaxValue) -> { + if (currentMaxValue == null || dimensionValue.compareTo(currentMaxValue) > 0) { + return dimensionValue; + } else { + return currentMaxValue; + } + } + ); } @Override diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeParallelIndexTaskRunner.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeParallelIndexTaskRunner.java index e53b1d22451a..de9810342113 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeParallelIndexTaskRunner.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeParallelIndexTaskRunner.java @@ -76,12 +76,11 @@ int getTotalNumSubTasks() 
@VisibleForTesting SubTaskSpec newTaskSpec(PartialGenericSegmentMergeIOConfig ioConfig) { - final PartialGenericSegmentMergeIngestionSpec ingestionSpec = - new PartialGenericSegmentMergeIngestionSpec( - dataSchema, - ioConfig, - getTuningConfig() - ); + final PartialGenericSegmentMergeIngestionSpec ingestionSpec = new PartialGenericSegmentMergeIngestionSpec( + dataSchema, + ioConfig, + getTuningConfig() + ); return new SubTaskSpec( getTaskId() + "_" + getAndIncrementNextSpecId(), getGroupId(), diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeTask.java index 559a9b5317ef..0c369bf0f106 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeTask.java @@ -22,6 +22,7 @@ import com.fasterxml.jackson.annotation.JacksonInject; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.Preconditions; import com.google.common.collect.HashBasedTable; import com.google.common.collect.Table; import org.apache.druid.client.indexing.IndexingServiceClient; @@ -45,7 +46,7 @@ public class PartialGenericSegmentMergeTask extends PartialSegmentMergeTask createIntervalAndIntegerToShardSpec; + private final Table intervalAndIntegerToShardSpec; @JsonCreator public PartialGenericSegmentMergeTask( @@ -78,7 +79,7 @@ public PartialGenericSegmentMergeTask( ); this.ingestionSchema = ingestionSchema; - this.createIntervalAndIntegerToShardSpec = createIntervalAndIntegerToShardSpec( + this.intervalAndIntegerToShardSpec = createIntervalAndIntegerToShardSpec( ingestionSchema.getIOConfig().getPartitionLocations() ); } @@ -90,7 +91,18 @@ 
private static Table createIntervalAndIntegerToSha Table intervalAndIntegerToShardSpec = HashBasedTable.create(); partitionLocations.forEach( - p -> intervalAndIntegerToShardSpec.put(p.getInterval(), p.getPartitionId(), p.getShardSpec()) + p -> { + ShardSpec currShardSpec = intervalAndIntegerToShardSpec.get(p.getInterval(), p.getPartitionId()); + Preconditions.checkArgument( + currShardSpec == null || p.getShardSpec().equals(currShardSpec), + "interval %s, partitionId %s mismatched shard specs: %s", + p.getInterval(), + p.getPartitionId(), + partitionLocations + ); + + intervalAndIntegerToShardSpec.put(p.getInterval(), p.getPartitionId(), p.getShardSpec()); + } ); return intervalAndIntegerToShardSpec; @@ -109,8 +121,14 @@ public String getType() } @Override - ShardSpec createShardSpec(TaskToolbox toolbox, Interval interval, int partitionNum) + ShardSpec createShardSpec(TaskToolbox toolbox, Interval interval, int partitionId) { - return createIntervalAndIntegerToShardSpec.get(interval, partitionNum); + return Preconditions.checkNotNull( + intervalAndIntegerToShardSpec.get(interval, partitionId), + "no shard spec exists for interval %s, partitionId %s: %s", + interval, + partitionId, + intervalAndIntegerToShardSpec.rowMap() + ); } } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentMergeTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentMergeTask.java index fa23eed2d1a5..157f5e943e7b 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentMergeTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentMergeTask.java @@ -102,10 +102,10 @@ public String getType() } @Override - HashBasedNumberedShardSpec createShardSpec(TaskToolbox toolbox, Interval interval, int partitionNum) + HashBasedNumberedShardSpec
createShardSpec(TaskToolbox toolbox, Interval interval, int partitionId) { return new HashBasedNumberedShardSpec( - partitionNum, + partitionId, Preconditions.checkNotNull(partitionsSpec.getNumShards(), "numShards"), partitionsSpec.getPartitionDimensions(), toolbox.getJsonMapper() diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateParallelIndexTaskRunner.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateParallelIndexTaskRunner.java index 57002a8311c4..06f6ddb1d2b9 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateParallelIndexTaskRunner.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateParallelIndexTaskRunner.java @@ -24,6 +24,7 @@ import org.apache.druid.data.input.InputSplit; import org.apache.druid.indexing.common.TaskToolbox; import org.apache.druid.indexing.common.task.IndexTaskClientFactory; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.Partitions; import org.apache.druid.segment.realtime.appenderator.AppenderatorsManager; import org.joda.time.Interval; @@ -35,11 +36,11 @@ * @see PartialHashSegmentMergeParallelIndexTaskRunner */ class PartialRangeSegmentGenerateParallelIndexTaskRunner - extends InputSourceSplitParallelIndexTaskRunner> + extends InputSourceSplitParallelIndexTaskRunner> { private final IndexTaskClientFactory taskClientFactory; private final AppenderatorsManager appenderatorsManager; - private final Map intervalToPartitions; + private final Map intervalToPartitions; PartialRangeSegmentGenerateParallelIndexTaskRunner( TaskToolbox toolbox, @@ -48,7 +49,7 @@ class PartialRangeSegmentGenerateParallelIndexTaskRunner ParallelIndexIngestionSpec ingestionSchema, Map context, IndexingServiceClient indexingServiceClient, - Map 
intervalToPartitions + Map intervalToPartitions ) { this( @@ -72,7 +73,7 @@ class PartialRangeSegmentGenerateParallelIndexTaskRunner ParallelIndexIngestionSpec ingestionSchema, Map context, IndexingServiceClient indexingServiceClient, - Map intervalToPartitions, + Map intervalToPartitions, IndexTaskClientFactory taskClientFactory, AppenderatorsManager appenderatorsManager ) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java index 5b8e67d7266d..00ec70f22ef9 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java @@ -32,6 +32,7 @@ import org.apache.druid.indexing.common.task.IndexTaskSegmentAllocator; import org.apache.druid.indexing.common.task.RangePartitionCachingLocalSegmentAllocator; import org.apache.druid.indexing.common.task.TaskResource; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.Partitions; import org.apache.druid.indexing.common.task.batch.parallel.iterator.RangePartitionIndexTaskInputRowIteratorBuilder; import org.apache.druid.indexing.worker.ShuffleDataSegmentPusher; import org.apache.druid.segment.realtime.appenderator.AppenderatorsManager; @@ -45,12 +46,11 @@ import java.util.stream.Collectors; /** - * The worker task of {@link PartialRangeSegmentGenerateParallelIndexTaskRunner}. This task - * partitions input data by ranges of the partition dimension specified in - * {@link SingleDimensionPartitionsSpec}. Partitioned segments are stored in local storage using - * {@link ShuffleDataSegmentPusher}. + * The worker task of {@link PartialRangeSegmentGenerateParallelIndexTaskRunner}. 
This task partitions input data by + * ranges of the partition dimension specified in {@link SingleDimensionPartitionsSpec}. Partitioned segments are stored + * in local storage using {@link ShuffleDataSegmentPusher}. */ -public class PartialRangeSegmentGenerateTask extends PartialSegmentGenerateTask +public class PartialRangeSegmentGenerateTask extends PartialSegmentGenerateTask { public static final String TYPE = "partial_range_index_generate"; private static final String PROP_SPEC = "spec"; @@ -58,7 +58,7 @@ public class PartialRangeSegmentGenerateTask extends PartialSegmentGenerateTask< private final String supervisorTaskId; private final int numAttempts; private final ParallelIndexIngestionSpec ingestionSchema; - private final Map intervalToPartitions; + private final Map intervalToPartitions; @JsonCreator public PartialRangeSegmentGenerateTask( @@ -70,7 +70,7 @@ public PartialRangeSegmentGenerateTask( @JsonProperty("numAttempts") int numAttempts, // zero-based counting @JsonProperty(PROP_SPEC) ParallelIndexIngestionSpec ingestionSchema, @JsonProperty("context") Map context, - @JsonProperty("intervalToPartitions") Map intervalToPartitions, + @JsonProperty("intervalToPartitions") Map intervalToPartitions, @JacksonInject IndexingServiceClient indexingServiceClient, @JacksonInject IndexTaskClientFactory taskClientFactory, @JacksonInject AppenderatorsManager appenderatorsManager @@ -130,7 +130,7 @@ public String getSupervisorTaskId() } @JsonProperty - public Map getIntervalToPartitions() + public Map getIntervalToPartitions() { return intervalToPartitions; } @@ -161,17 +161,17 @@ IndexTaskSegmentAllocator createSegmentAllocator(TaskToolbox toolbox) throws IOE } @Override - GeneratedGenericPartitionsReport createGeneratedPartitionsReport(TaskToolbox toolbox, List segments) + GeneratedPartitionsMetadataReport createGeneratedPartitionsReport(TaskToolbox toolbox, List segments) { - List partitionStats = segments.stream() - .map(segment -> createPartitionStat(toolbox, 
segment)) - .collect(Collectors.toList()); - return new GeneratedGenericPartitionsReport(getId(), partitionStats); + List partitionsMetadata = segments.stream() + .map(segment -> createPartitionStat(toolbox, segment)) + .collect(Collectors.toList()); + return new GeneratedPartitionsMetadataReport(getId(), partitionsMetadata); } - private GenericPartitionStat createPartitionStat(TaskToolbox toolbox, DataSegment segment) + private PartitionMetadata createPartitionStat(TaskToolbox toolbox, DataSegment segment) { - return new GenericPartitionStat( + return new PartitionMetadata( toolbox.getTaskExecutorNode().getHost(), toolbox.getTaskExecutorNode().getPortToUse(), toolbox.getTaskExecutorNode().isEnableTlsPort(), diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialSegmentMergeTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialSegmentMergeTask.java index ea0ac936925c..495a7008565c 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialSegmentMergeTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialSegmentMergeTask.java @@ -281,7 +281,7 @@ File fetchSegmentFile(File partitionDir, P location) throws IOException /** * Create a {@link ShardSpec} suitable for the desired secondary partitioning strategy. 
*/ - abstract S createShardSpec(TaskToolbox toolbox, Interval interval, int partitionNum); + abstract S createShardSpec(TaskToolbox toolbox, Interval interval, int partitionId); private Set mergeAndPushSegments( TaskToolbox toolbox, diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStat.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartitionMetadata.java similarity index 82% rename from indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStat.java rename to indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartitionMetadata.java index 04a98c284476..e8f5c4a5503a 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStat.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartitionMetadata.java @@ -28,11 +28,12 @@ import java.util.Objects; /** - * Statistics about a partition created by {@link PartialSegmentGenerateTask}. Each partition is a set of data - * of the same time chunk (primary partition key) and the same {@link ShardSpec} (secondary partition key). This class - * holds the statistics of a single partition created by a task. + * Partition description ({@link ShardSpec}) and statistics created by {@link PartialSegmentGenerateTask}. Each + * partition is a set of data of the same time chunk (primary partition key) and the same {@link ShardSpec} (secondary + * partition key). The {@link ShardSpec} is later used by {@link PartialGenericSegmentMergeTask} to merge the partial + * segments. 
*/ -public class GenericPartitionStat extends PartitionStat +public class PartitionMetadata extends PartitionStat { private static final String PROP_SHARD_SPEC = "shardSpec"; @@ -40,7 +41,7 @@ public class GenericPartitionStat extends PartitionStat private final ShardSpec shardSpec; @JsonCreator - public GenericPartitionStat( + public PartitionMetadata( @JsonProperty("taskExecutorHost") String taskExecutorHost, @JsonProperty("taskExecutorPort") int taskExecutorPort, @JsonProperty("useHttps") boolean useHttps, @@ -79,7 +80,7 @@ public boolean equals(Object o) if (!super.equals(o)) { return false; } - GenericPartitionStat that = (GenericPartitionStat) o; + PartitionMetadata that = (PartitionMetadata) o; return Objects.equals(shardSpec, that.shardSpec); } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/SubTaskReport.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/SubTaskReport.java index 8cc6db91e94e..564b3af8ab6f 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/SubTaskReport.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/SubTaskReport.java @@ -32,7 +32,7 @@ @Type(name = PushedSegmentsReport.TYPE, value = PushedSegmentsReport.class), @Type(name = GeneratedHashPartitionsReport.TYPE, value = GeneratedHashPartitionsReport.class), @Type(name = DimensionDistributionReport.TYPE, value = DimensionDistributionReport.class), - @Type(name = GeneratedGenericPartitionsReport.TYPE, value = GeneratedGenericPartitionsReport.class) + @Type(name = GeneratedPartitionsMetadataReport.TYPE, value = GeneratedPartitionsMetadataReport.class) }) public interface SubTaskReport { diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/Partitions.java 
b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/Partitions.java new file mode 100644 index 000000000000..d3f967923471 --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/Partitions.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.common.task.batch.parallel.distribution; + +import com.google.common.collect.ForwardingList; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * Convenience wrapper to make code more readable. + */ +public class Partitions extends ForwardingList implements List +{ + private final List delegate; + + public Partitions(String... 
partitions) + { + delegate = ImmutableList.copyOf(partitions); + } + + @Override + protected List delegate() + { + return delegate; + } +} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringDistribution.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringDistribution.java index 643a1a8276e8..116dea63f2c2 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringDistribution.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringDistribution.java @@ -32,17 +32,17 @@ public interface StringDistribution { /** - * Record occurence of {@link String} + * Record occurrence of {@link String} */ void put(String element); /** - * Record occurence of {@link String} if it will become the new minimum element. + * Record occurrence of {@link String} if it will become the new minimum element. */ void putIfNewMin(String element); /** - * Record occurence of {@link String} if it will become the new maximum element; + * Record occurrence of {@link String} if it will become the new maximum element; */ void putIfNewMax(String element); @@ -53,7 +53,7 @@ public interface StringDistribution * @return Array of elements that correspond to the endpoints of evenly-sized partitions of the * sorted elements. */ - String[] getEvenPartitionsByMaxSize(int maxSize); + Partitions getEvenPartitionsByMaxSize(int maxSize); /** * Split the distribution in the fewest number of evenly-sized partitions while honoring a target @@ -62,5 +62,5 @@ public interface StringDistribution * @return Array of elements that correspond to the endpoints of evenly-sized partitions of the * sorted elements. 
*/ - String[] getEvenPartitionsByTargetSize(int targetSize); + Partitions getEvenPartitionsByTargetSize(int targetSize); } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketch.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketch.java index 74a97b2d7537..7fb71a10e539 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketch.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketch.java @@ -86,26 +86,26 @@ public void putIfNewMax(String string) } @Override - public String[] getEvenPartitionsByMaxSize(int maxSize) + public Partitions getEvenPartitionsByMaxSize(int maxSize) { Preconditions.checkArgument(maxSize > 0, "maxSize must be positive but is %s", maxSize); long n = delegate.getN(); double delta = delegate.getNormalizedRankError(true) * n; // account for approx distribution int targetSize = Math.max(1, (int) Math.floor(maxSize - delta)); // floor() to increase chance below max size int evenPartitionCount = (int) Math.ceil((double) n / targetSize); // ceil() to increase chance below max size - return getEventPartitionsByCount(Math.max(1, evenPartitionCount)); + return getEvenPartitionsByCount(Math.max(1, evenPartitionCount)); } @Override - public String[] getEvenPartitionsByTargetSize(int targetSize) + public Partitions getEvenPartitionsByTargetSize(int targetSize) { Preconditions.checkArgument(targetSize > 0, "targetSize must be positive but is %s", targetSize); long n = delegate.getN(); int evenPartitionCount = Math.max(1, (int) Math.round((double) n / targetSize)); - return getEventPartitionsByCount(evenPartitionCount); + return getEvenPartitionsByCount(evenPartitionCount); } - private String[] getEventPartitionsByCount(int evenPartitionCount) + private Partitions getEvenPartitionsByCount(int 
evenPartitionCount) { Preconditions.checkArgument( evenPartitionCount > 0, @@ -113,7 +113,7 @@ private String[] getEventPartitionsByCount(int evenPartitionCount) evenPartitionCount ); String[] partitions = delegate.getQuantiles(evenPartitionCount + 1); // add 1 since this returns endpoints - return (partitions == null) ? new String[0] : partitions; + return new Partitions((partitions == null) ? new String[0] : partitions); } @Override diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IngestionTestBase.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IngestionTestBase.java index fe6c534e837a..e87e0f796177 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IngestionTestBase.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IngestionTestBase.java @@ -23,7 +23,6 @@ import com.google.common.base.Optional; import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; -import org.apache.druid.common.config.NullHandling; import org.apache.druid.indexer.TaskStatus; import org.apache.druid.indexing.common.SegmentLoaderFactory; import org.apache.druid.indexing.common.SingleFileTaskReportFileWriter; @@ -63,6 +62,7 @@ import org.apache.druid.segment.loading.SegmentLoader; import org.apache.druid.server.DruidNode; import org.apache.druid.server.metrics.NoopServiceEmitter; +import org.apache.druid.testing.InitializedNullHandlingTest; import org.apache.druid.timeline.DataSegment; import org.junit.After; import org.junit.Before; @@ -79,12 +79,8 @@ import java.util.Set; import java.util.concurrent.Executor; -public abstract class IngestionTestBase +public abstract class IngestionTestBase extends InitializedNullHandlingTest { - static { - NullHandling.initializeForTests(); - } - @Rule public TemporaryFolder temporaryFolder = new TemporaryFolder(); diff --git 
a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java index a3f4e771abf0..b4ab9f77a30d 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java @@ -25,6 +25,7 @@ import org.apache.druid.indexing.common.TaskToolbox; import org.apache.druid.indexing.common.actions.LockListAction; import org.apache.druid.indexing.common.actions.TaskActionClient; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.Partitions; import org.apache.druid.java.util.common.DateTimes; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec; @@ -67,13 +68,23 @@ public class RangePartitionCachingLocalSegmentAllocatorTest private static final String PARTITION0 = "0"; private static final String PARTITION5 = "5"; private static final String PARTITION9 = "9"; - private static final String[] EMPTY_PARTITIONS = new String[]{}; - private static final String[] SINGLETON_PARTITIONS = new String[]{PARTITION0, PARTITION0}; - private static final String[] NORMAL_PARTITIONS = new String[]{PARTITION0, PARTITION5, PARTITION9}; - private static final String[] FREQUENT_MID_PARTITIONS = new String[]{PARTITION0, PARTITION5, PARTITION5, PARTITION9}; - private static final String[] FREQUENT_MAX_PARTITIONS = new String[]{PARTITION0, PARTITION5, PARTITION9, PARTITION9}; + private static final Partitions EMPTY_PARTITIONS = new Partitions(); + private static final Partitions SINGLETON_PARTITIONS = new Partitions(PARTITION0, PARTITION0); + private static final Partitions NORMAL_PARTITIONS = new Partitions(PARTITION0, PARTITION5, 
PARTITION9); + private static final Partitions FREQUENT_MID_PARTITIONS = new Partitions( + PARTITION0, + PARTITION5, + PARTITION5, + PARTITION9 + ); + private static final Partitions FREQUENT_MAX_PARTITIONS = new Partitions( + PARTITION0, + PARTITION5, + PARTITION9, + PARTITION9 + ); - private static final Map INTERVAL_TO_PARTITONS = ImmutableMap.of( + private static final Map INTERVAL_TO_PARTITONS = ImmutableMap.of( INTERVAL_EMPTY, EMPTY_PARTITIONS, INTERVAL_SINGLETON, SINGLETON_PARTITIONS, INTERVAL_NORMAL, NORMAL_PARTITIONS, @@ -108,14 +119,14 @@ public void setup() throws IOException @Test public void failsIfAllocateFromEmptyInterval() { - int dummy = 0; Interval interval = INTERVAL_EMPTY; InputRow row = createInputRow(interval, PARTITION9); exception.expect(IllegalStateException.class); exception.expectMessage("Failed to get shardSpec"); - testAllocate(row, interval, dummy, null); + String sequenceName = target.getSequenceName(interval, row); + allocate(row, sequenceName); } @Test @@ -140,15 +151,16 @@ public void allocatesCorrectShardSpecsForLastPartitionWithoutFrequentValue() { Interval interval = INTERVAL_NORMAL; InputRow row = createInputRow(interval, PARTITION9); - testAllocate(row, interval, INTERVAL_TO_PARTITONS.get(interval).length - 2, null); + testAllocate(row, interval, INTERVAL_TO_PARTITONS.get(interval).size() - 1, null); } @Test - public void allocatesCorrectShardSpecsForLPartitionWithFrequentMid() + public void allocatesCorrectShardSpecsForLastPartitionWithFrequentMid() { Interval interval = INTERVAL_FREQUENT_MID; InputRow row = createInputRow(interval, PARTITION9); - testAllocate(row, interval, INTERVAL_TO_PARTITONS.get(interval).length - 3, null); + Partitions partitions = INTERVAL_TO_PARTITONS.get(interval); + testAllocate(row, interval, partitions.size() - 2, partitions.get(partitions.size() - 1), null); } @Test @@ -156,15 +168,27 @@ public void allocatesCorrectShardSpecsForLastPartitionWithFrequentMax() { Interval interval = 
INTERVAL_FREQUENT_MAX; InputRow row = createInputRow(interval, PARTITION9); - testAllocate(row, interval, INTERVAL_TO_PARTITONS.get(interval).length - 2, null); + testAllocate(row, interval, INTERVAL_TO_PARTITONS.get(interval).size() - 2, null); } + @SuppressWarnings("SameParameterValue") private void testAllocate(InputRow row, Interval interval, int partitionNum) { - testAllocate(row, interval, partitionNum, INTERVAL_TO_PARTITONS.get(interval)[partitionNum + 1]); + testAllocate(row, interval, partitionNum, INTERVAL_TO_PARTITONS.get(interval).get(partitionNum + 1)); } private void testAllocate(InputRow row, Interval interval, int partitionNum, @Nullable String partitionEnd) + { + testAllocate(row, interval, partitionNum, INTERVAL_TO_PARTITONS.get(interval).get(partitionNum), partitionEnd); + } + + private void testAllocate( + InputRow row, + Interval interval, + int partitionNum, + String partitionStart, + @Nullable String partitionEnd + ) { String sequenceName = target.getSequenceName(interval, row); SegmentIdWithShardSpec segmentIdWithShardSpec = allocate(row, sequenceName); @@ -176,7 +200,6 @@ private void testAllocate(InputRow row, Interval interval, int partitionNum, @Nu SingleDimensionShardSpec shardSpec = (SingleDimensionShardSpec) segmentIdWithShardSpec.getShardSpec(); Assert.assertEquals(PARTITION_DIMENSION, shardSpec.getDimension()); Assert.assertEquals(partitionNum, shardSpec.getPartitionNum()); - String partitionStart = INTERVAL_TO_PARTITONS.get(interval)[partitionNum]; Assert.assertEquals(partitionStart, shardSpec.getStart()); Assert.assertEquals(partitionEnd, shardSpec.getEnd()); } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java index 6e44472a4dc0..e627068e0e92 100644 --- 
a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java @@ -33,6 +33,7 @@ import org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec; import org.apache.druid.indexing.common.TaskToolbox; import org.apache.druid.indexing.common.task.IndexTaskClientFactory; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.Partitions; import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.segment.TestHelper; @@ -44,10 +45,12 @@ import org.hamcrest.Matchers; import org.joda.time.Interval; import org.junit.Assert; +import org.junit.Before; import org.junit.Rule; import org.junit.Test; import org.junit.experimental.runners.Enclosed; import org.junit.rules.ExpectedException; +import org.junit.rules.TemporaryFolder; import org.junit.runner.RunWith; import java.util.ArrayList; @@ -135,14 +138,25 @@ public void hasCorrectPrefixForAutomaticId() public static class RunTaskTest { - private static final TaskToolbox TASK_TOOLBOX = null; - @Rule public ExpectedException exception = ExpectedException.none(); + @Rule + public TemporaryFolder temporaryFolder = new TemporaryFolder(); + @Rule public LoggerCaptureRule logger = new LoggerCaptureRule(PartialDimensionDistributionTask.class); + private TaskToolbox taskToolbox; + + @Before + public void setup() + { + taskToolbox = EasyMock.mock(TaskToolbox.class); + EasyMock.expect(taskToolbox.getIndexingTmpDir()).andStubReturn(temporaryFolder.getRoot()); + EasyMock.replay(taskToolbox); + } + @Test public void requiresPartitionDimension() throws Exception { @@ -158,7 +172,7 @@ public void requiresPartitionDimension() throws Exception .tuningConfig(tuningConfig) .build(); - task.runTask(TASK_TOOLBOX); + 
task.runTask(taskToolbox); } @Test @@ -178,7 +192,7 @@ public void logsParseExceptionsIfEnabled() throws Exception .taskClientFactory(ParallelIndexTestingFactory.createTaskClientFactory()) .build(); - task.runTask(TASK_TOOLBOX); + task.runTask(taskToolbox); List logEvents = logger.getLogEvents(); Assert.assertEquals(1, logEvents.size()); @@ -198,7 +212,7 @@ public void doesNotLogParseExceptionsIfDisabled() throws Exception .taskClientFactory(ParallelIndexTestingFactory.createTaskClientFactory()) .build(); - task.runTask(TASK_TOOLBOX); + task.runTask(taskToolbox); Assert.assertEquals(Collections.emptyList(), logger.getLogEvents()); } @@ -218,7 +232,7 @@ public void failsWhenTooManyParseExceptions() throws Exception exception.expect(RuntimeException.class); exception.expectMessage("Max parse exceptions exceeded"); - task.runTask(TASK_TOOLBOX); + task.runTask(taskToolbox); } @Test @@ -260,11 +274,11 @@ public void sendsCorrectReportWhenAssumeGroupedTrue() Map intervalToDistribution = report.getIntervalToDistribution(); StringDistribution distribution = Iterables.getOnlyElement(intervalToDistribution.values()); Assert.assertNotNull(distribution); - String[] partitions = distribution.getEvenPartitionsByMaxSize(1); - Assert.assertEquals(3, partitions.length); - Assert.assertEquals(dimensionValue, partitions[0]); - Assert.assertEquals(dimensionValue, partitions[1]); - Assert.assertEquals(dimensionValue, partitions[2]); + Partitions partitions = distribution.getEvenPartitionsByMaxSize(1); + Assert.assertEquals(3, partitions.size()); + Assert.assertEquals(dimensionValue, partitions.get(0)); + Assert.assertEquals(dimensionValue, partitions.get(1)); + Assert.assertEquals(dimensionValue, partitions.get(2)); } @Test @@ -291,10 +305,10 @@ public void groupsRowsWhenAssumeGroupedFalse() Map intervalToDistribution = report.getIntervalToDistribution(); StringDistribution distribution = Iterables.getOnlyElement(intervalToDistribution.values()); Assert.assertNotNull(distribution); - 
String[] partitions = distribution.getEvenPartitionsByMaxSize(1); - Assert.assertEquals(2, partitions.length); - Assert.assertEquals(dimensionValue, partitions[0]); - Assert.assertEquals(dimensionValue, partitions[1]); + Partitions partitions = distribution.getEvenPartitionsByMaxSize(1); + Assert.assertEquals(2, partitions.size()); + Assert.assertEquals(dimensionValue, partitions.get(0)); + Assert.assertEquals(dimensionValue, partitions.get(1)); } @Test @@ -346,10 +360,10 @@ public void preservesMinAndMaxWhenAssumeGroupedFalse() Map intervalToDistribution = report.getIntervalToDistribution(); StringDistribution distribution = Iterables.getOnlyElement(intervalToDistribution.values()); Assert.assertNotNull(distribution); - String[] partitions = distribution.getEvenPartitionsByMaxSize(1); - Assert.assertEquals(minBloomFilterBits + 3, partitions.length); // 3 = min + max + exclusive endpoint - Assert.assertEquals(minDimensionValue, partitions[0]); - Assert.assertEquals(maxDimensionValue, partitions[partitions.length - 1]); + Partitions partitions = distribution.getEvenPartitionsByMaxSize(1); + Assert.assertEquals(minBloomFilterBits + 3, partitions.size()); // 3 = min + max + exclusive endpoint + Assert.assertEquals(minDimensionValue, partitions.get(0)); + Assert.assertEquals(maxDimensionValue, partitions.get(partitions.size() - 1)); } @Test @@ -359,13 +373,13 @@ public void returnsSuccessIfNoExceptions() throws Exception .taskClientFactory(ParallelIndexTestingFactory.createTaskClientFactory()) .build(); - TaskStatus taskStatus = task.runTask(TASK_TOOLBOX); + TaskStatus taskStatus = task.runTask(taskToolbox); Assert.assertEquals(ParallelIndexTestingFactory.ID, taskStatus.getId()); Assert.assertEquals(TaskState.SUCCESS, taskStatus.getStatusCode()); } - private static DimensionDistributionReport runTask(PartialDimensionDistributionTaskBuilder taskBuilder) + private DimensionDistributionReport runTask(PartialDimensionDistributionTaskBuilder taskBuilder) { Capture 
reportCapture = Capture.newInstance(); ParallelIndexSupervisorTaskClient taskClient = EasyMock.mock(ParallelIndexSupervisorTaskClient.class); @@ -375,7 +389,7 @@ private static DimensionDistributionReport runTask(PartialDimensionDistributionT try { taskBuilder.taskClientFactory((taskInfoProvider, callerId, numThreads, httpTimeout, numRetries) -> taskClient) .build() - .runTask(TASK_TOOLBOX); + .runTask(taskToolbox); } catch (Exception e) { throw new RuntimeException(e); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStatTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartitionMetadataTest.java similarity index 94% rename from indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStatTest.java rename to indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartitionMetadataTest.java index 2bcac8edfd47..3deb64d391e7 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStatTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartitionMetadataTest.java @@ -25,16 +25,16 @@ import org.junit.Before; import org.junit.Test; -public class GenericPartitionStatTest +public class PartitionMetadataTest { private static final ObjectMapper OBJECT_MAPPER = ParallelIndexTestingFactory.createObjectMapper(); - private GenericPartitionStat target; + private PartitionMetadata target; @Before public void setup() { - target = new GenericPartitionStat( + target = new PartitionMetadata( ParallelIndexTestingFactory.TASK_EXECUTOR_HOST, ParallelIndexTestingFactory.TASK_EXECUTOR_PORT, ParallelIndexTestingFactory.USE_HTTPS, diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java 
b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java index 26814d2c3040..0ab776864251 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java @@ -36,6 +36,7 @@ import org.apache.druid.indexing.common.task.IndexTaskClientFactory; import org.apache.druid.indexing.common.task.TaskResource; import org.apache.druid.indexing.common.task.TestAppenderatorsManager; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.Partitions; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.StringUtils; @@ -217,6 +218,7 @@ private static void assertNumPartition( expectedNumPartition -= 1; } expectedNumPartition *= NUM_DAY; + expectedNumPartition += 1; // max dimension value has its own partition Assert.assertEquals(expectedNumPartition, segments.size()); } @@ -289,7 +291,7 @@ PartialDimensionDistributionParallelIndexTaskRunner createPartialDimensionDistri @Override PartialRangeSegmentGenerateParallelIndexTaskRunner createPartialRangeSegmentGenerateRunner( TaskToolbox toolbox, - Map intervalToPartitions + Map intervalToPartitions ) { return new TestPartialRangeSegmentGenerateRunner( @@ -342,7 +344,7 @@ private TestPartialRangeSegmentGenerateRunner( TaskToolbox toolbox, ParallelIndexSupervisorTask supervisorTask, IndexingServiceClient indexingServiceClient, - Map intervalToPartitions + Map intervalToPartitions ) { super( diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionsTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionsTest.java new 
file mode 100644 index 000000000000..6564ec8db368 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionsTest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.common.task.batch.parallel.distribution; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.util.Arrays; + +public class PartitionsTest +{ + private Partitions target; + private String[] values; + + @Before + public void setup() + { + values = new String[]{"a", "b"}; + target = new Partitions(values); + } + + @Test + public void hasCorrectValues() + { + Assert.assertEquals(Arrays.asList(values), target); + } + + @Test(expected = UnsupportedOperationException.class) + public void isImmutable() + { + target.add("should fail"); + } + + @Test + public void cannotBeIndirectlyModified() + { + String[] originalValues = Arrays.copyOf(values, values.length); + values[0] = "changed"; + Assert.assertEquals(Arrays.asList(originalValues), target); + Assert.assertNotEquals(Arrays.asList(values), target); + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMergerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMergerTest.java index 5a39b585a849..9ca6c07b8835 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMergerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMergerTest.java @@ -70,11 +70,11 @@ public void mergesCorrectly() target.merge(sketch3); StringDistribution merged = target.getResult(); - String[] partitions = merged.getEvenPartitionsByMaxSize(1); - Assert.assertEquals(4, partitions.length); - Assert.assertEquals(string1, partitions[0]); // min - Assert.assertEquals(string2, partitions[1]); // median - Assert.assertEquals(string3, partitions[2]); // max - Assert.assertEquals(string3, partitions[3]); // max + Partitions partitions = merged.getEvenPartitionsByMaxSize(1); + Assert.assertEquals(4, 
partitions.size()); + Assert.assertEquals(string1, partitions.get(0)); // min + Assert.assertEquals(string2, partitions.get(1)); // median + Assert.assertEquals(string3, partitions.get(2)); // max + Assert.assertEquals(string3, partitions.get(3)); // max } } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java index c5d84d231636..d49868187e6a 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java @@ -155,8 +155,8 @@ public void requiresPositiveSize() public void handlesEmptySketch() { StringSketch sketch = new StringSketch(); - String[] partitions = sketch.getEvenPartitionsByTargetSize(1); - Assert.assertEquals(0, partitions.length); + Partitions partitions = sketch.getEvenPartitionsByTargetSize(1); + Assert.assertEquals(0, partitions.size()); } @Test @@ -165,16 +165,16 @@ public void handlesSingletonSketch() String value = MIN_STRING; StringSketch sketch = new StringSketch(); sketch.put(value); - String[] partitions = sketch.getEvenPartitionsByTargetSize(1); - Assert.assertEquals(2, partitions.length); - Assert.assertEquals(value, partitions[0]); - Assert.assertEquals(value, partitions[1]); + Partitions partitions = sketch.getEvenPartitionsByTargetSize(1); + Assert.assertEquals(2, partitions.size()); + Assert.assertEquals(value, partitions.get(0)); + Assert.assertEquals(value, partitions.get(1)); } @Test public void handlesMinimimumSize() { - String[] partitions = SKETCH.getEvenPartitionsByTargetSize(1); + Partitions partitions = SKETCH.getEvenPartitionsByTargetSize(1); assertMaxNumberOfPartitions(partitions); } @@ -187,7 +187,7 @@ public void handlesUnevenPartitions() 
private static void testHandlesUnevenPartitions(int targetSize) { - String[] partitions = SKETCH.getEvenPartitionsByTargetSize(targetSize); + Partitions partitions = SKETCH.getEvenPartitionsByTargetSize(targetSize); assertFirstAndLastPartitionsCorrect(partitions); @@ -196,18 +196,18 @@ private static void testHandlesUnevenPartitions(int targetSize) int expectedLowPartitionCount = expectedHighPartitionCount - 1; Assert.assertThat( "targetSize=" + targetSize + " " + partitionsString, - partitions.length, + partitions.size(), Matchers.lessThanOrEqualTo(expectedHighPartitionCount + 1) ); Assert.assertThat( "targetSize=" + targetSize + " " + partitionsString, - partitions.length, + partitions.size(), Matchers.greaterThanOrEqualTo(expectedLowPartitionCount + 1) ); int previous = 0; - for (int i = 1; i < partitions.length; i++) { - int current = Integer.parseInt(partitions[i]); + for (int i = 1; i < partitions.size(); i++) { + int current = Integer.parseInt(partitions.get(i)); int size = current - previous; Assert.assertThat( getErrMsgPrefix(targetSize, i) + partitionsString, @@ -221,14 +221,14 @@ private static void testHandlesUnevenPartitions(int targetSize) @Test public void handlesSinglePartition() { - String[] partitions = SKETCH.getEvenPartitionsByTargetSize(NUM_STRING); + Partitions partitions = SKETCH.getEvenPartitionsByTargetSize(NUM_STRING); assertSinglePartition(partitions); } @Test public void handlesOversizedPartition() { - String[] partitions = SKETCH.getEvenPartitionsByTargetSize(Integer.MAX_VALUE); + Partitions partitions = SKETCH.getEvenPartitionsByTargetSize(Integer.MAX_VALUE); assertSinglePartition(partitions); } } @@ -251,8 +251,8 @@ public void requiresPositiveSize() public void handlesEmptySketch() { StringSketch sketch = new StringSketch(); - String[] partitions = sketch.getEvenPartitionsByMaxSize(1); - Assert.assertEquals(0, partitions.length); + Partitions partitions = sketch.getEvenPartitionsByMaxSize(1); + Assert.assertEquals(0, 
partitions.size()); } @Test @@ -261,16 +261,16 @@ public void handlesSingletonSketch() String value = MIN_STRING; StringSketch sketch = new StringSketch(); sketch.put(value); - String[] partitions = sketch.getEvenPartitionsByMaxSize(1); - Assert.assertEquals(2, partitions.length); - Assert.assertEquals(value, partitions[0]); - Assert.assertEquals(value, partitions[1]); + Partitions partitions = sketch.getEvenPartitionsByMaxSize(1); + Assert.assertEquals(2, partitions.size()); + Assert.assertEquals(value, partitions.get(0)); + Assert.assertEquals(value, partitions.get(1)); } @Test public void handlesMinimimumSize() { - String[] partitions = SKETCH.getEvenPartitionsByMaxSize(1); + Partitions partitions = SKETCH.getEvenPartitionsByMaxSize(1); assertMaxNumberOfPartitions(partitions); } @@ -283,7 +283,7 @@ public void handlesUnevenPartitions() private static void testHandlesUnevenPartitions(int maxSize) { - String[] partitions = SKETCH.getEvenPartitionsByMaxSize(maxSize); + Partitions partitions = SKETCH.getEvenPartitionsByMaxSize(maxSize); assertFirstAndLastPartitionsCorrect(partitions); @@ -292,14 +292,14 @@ private static void testHandlesUnevenPartitions(int maxSize) Assert.assertEquals( "maxSize=" + maxSize + " " + partitionsString, expectedPartitionCount + 1, - partitions.length + partitions.size() ); double minSize = (double) NUM_STRING / expectedPartitionCount - DELTA; int previous = 0; - for (int i = 1; i < partitions.length; i++) { - int current = Integer.parseInt(partitions[i]); + for (int i = 1; i < partitions.size(); i++) { + int current = Integer.parseInt(partitions.get(i)); int size = current - previous; Assert.assertThat( getErrMsgPrefix(maxSize, i) + partitionsString, @@ -318,28 +318,28 @@ private static void testHandlesUnevenPartitions(int maxSize) @Test public void handlesSinglePartition() { - String[] partitions = SKETCH.getEvenPartitionsByMaxSize((int) Math.ceil(NUM_STRING + DELTA)); + Partitions partitions = SKETCH.getEvenPartitionsByMaxSize((int) 
Math.ceil(NUM_STRING + DELTA)); assertSinglePartition(partitions); } @Test public void handlesOversizedPartition() { - String[] partitions = SKETCH.getEvenPartitionsByMaxSize(Integer.MAX_VALUE); + Partitions partitions = SKETCH.getEvenPartitionsByMaxSize(Integer.MAX_VALUE); assertSinglePartition(partitions); } } - private static void assertMaxNumberOfPartitions(String[] partitions) + private static void assertMaxNumberOfPartitions(Partitions partitions) { String partitionsString = toString(partitions); - Assert.assertEquals(partitionsString, NUM_STRING + 1, partitions.length); + Assert.assertEquals(partitionsString, NUM_STRING + 1, partitions.size()); assertFirstAndLastPartitionsCorrect(partitions); int previous = 0; - for (int i = 1; i < partitions.length; i++) { - int current = Integer.parseInt(partitions[i]); + for (int i = 1; i < partitions.size(); i++) { + int current = Integer.parseInt(partitions.get(i)); Assert.assertEquals( getErrMsgPrefix(1, i) + partitionsString, 1, @@ -350,16 +350,16 @@ private static void assertMaxNumberOfPartitions(String[] partitions) } } - private static void assertSinglePartition(String[] partitions) + private static void assertSinglePartition(Partitions partitions) { - Assert.assertEquals(2, partitions.length); + Assert.assertEquals(2, partitions.size()); assertFirstAndLastPartitionsCorrect(partitions); } - private static void assertFirstAndLastPartitionsCorrect(String[] partitions) + private static void assertFirstAndLastPartitionsCorrect(Partitions partitions) { - Assert.assertEquals(MIN_STRING, partitions[0]); - Assert.assertEquals(MAX_STRING, partitions[partitions.length - 1]); + Assert.assertEquals(MIN_STRING, partitions.get(0)); + Assert.assertEquals(MAX_STRING, partitions.get(partitions.size() - 1)); } private static String getErrMsgPrefix(int size, int i) @@ -367,12 +367,12 @@ private static String getErrMsgPrefix(int size, int i) return "size=" + size + " i=" + i + " of "; } - private static String toString(String[] 
partitions) + private static String toString(Partitions partitions) { - String prefix = "partitions[" + partitions.length + "]="; + String prefix = "partitions[" + partitions.size() + "]="; StringJoiner sj = new StringJoiner(" ", prefix, "]"); - for (int i = 0; i < partitions.length; i++) { - sj.add("[" + i + "]=" + partitions[i]); + for (int i = 0; i < partitions.size(); i++) { + sj.add("[" + i + "]=" + partitions.get(i)); } return sj.toString(); } From 15d35228ffa0d5f07d5864f3a0960458728e432a Mon Sep 17 00:00:00 2001 From: Chi Cao Minh Date: Tue, 3 Dec 2019 12:03:05 -0800 Subject: [PATCH 05/17] Fix docs, strict compile, sketch check, rollup check --- docs/ingestion/native-batch.md | 4 +-- .../parallel/ParallelIndexSupervisorTask.java | 3 ++ .../PartialDimensionDistributionTask.java | 32 +++++++++++-------- .../PartialGenericSegmentMergeTask.java | 4 +-- .../PartialDimensionDistributionTaskTest.java | 22 ++++++------- 5 files changed, 36 insertions(+), 29 deletions(-) diff --git a/docs/ingestion/native-batch.md b/docs/ingestion/native-batch.md index 24e2f479ca98..0202e937bfea 100644 --- a/docs/ingestion/native-batch.md +++ b/docs/ingestion/native-batch.md @@ -246,7 +246,7 @@ You should use different partitionsSpec depending on the [rollup mode](../ingest For perfect rollup, you should use either `hashed` (partitioning based on the hash of dimensions in each row) or `single_dim` (based on ranges of a single dimension. For best-effort rollup, you should use `dynamic`. -For perfect rollup, `ashed partitioning is recommended in most cases, as it will improve indexing +For perfect rollup, `hashed` partitioning is recommended in most cases, as it will improve indexing performance and create more uniformly sized data segments relative to single-dimension partitioning. 
#### Hash-based partitioning @@ -261,7 +261,7 @@ performance and create more uniformly sized data segments relative to single-dim > Single-dimension range partitioning currently requires the > [druid-datasketches](../development/extensions-core/datasketches-extension.md) -> extension to be loaded. +> extension to be added to the classpath. |property|description|default|required?| |--------|-----------|-------|---------| diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java index 84133a857157..dd13cbf2b21e 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java @@ -376,6 +376,9 @@ PartialGenericSegmentMergeParallelIndexTaskRunner createPartialGenericSegmentMer @Override public boolean isReady(TaskActionClient taskActionClient) throws Exception { + if (useRangePartitions()) { + assertDataSketchesAvailable(); + } return determineLockGranularityAndTryLock(taskActionClient, ingestionSchema.getDataSchema().getGranularitySpec()); } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java index a50239362181..7b013b8cb926 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java @@ -82,7 +82,7 @@ public class PartialDimensionDistributionTask extends PerfectRollupWorkerTask private final 
IndexTaskClientFactory taskClientFactory; // For testing - private final Supplier ungroupedRowDimValueFilterSupplier; + private final Supplier dedupRowDimValueFilterSupplier; @JsonCreator PartialDimensionDistributionTask( @@ -108,7 +108,7 @@ public class PartialDimensionDistributionTask extends PerfectRollupWorkerTask context, indexingServiceClient, taskClientFactory, - () -> new UngroupedRowDimensionValueFilter( + () -> new DedupRowDimensionValueFilter( ingestionSchema.getDataSchema().getGranularitySpec().getQueryGranularity() ) ); @@ -125,7 +125,7 @@ public class PartialDimensionDistributionTask extends PerfectRollupWorkerTask final Map context, IndexingServiceClient indexingServiceClient, IndexTaskClientFactory taskClientFactory, - Supplier ungroupedRowDimValueFilterSupplier + Supplier dedupRowDimValueFilterSupplier ) { super( @@ -148,7 +148,7 @@ public class PartialDimensionDistributionTask extends PerfectRollupWorkerTask this.supervisorTaskId = supervisorTaskId; this.indexingServiceClient = indexingServiceClient; this.taskClientFactory = taskClientFactory; - this.ungroupedRowDimValueFilterSupplier = ungroupedRowDimValueFilterSupplier; + this.dedupRowDimValueFilterSupplier = dedupRowDimValueFilterSupplier; } @JsonProperty @@ -252,9 +252,9 @@ private Map determineDistribution( { Map intervalToDistribution = new HashMap<>(); DimensionValueFilter dimValueFilter = - isAssumeGrouped && granularitySpec.isRollup() - ? new GroupedRowDimensionValueFilter() - : ungroupedRowDimValueFilterSupplier.get(); + !isAssumeGrouped && granularitySpec.isRollup() + ? dedupRowDimValueFilterSupplier.get() + : new PassthroughRowDimensionValueFilter(); int numParseExceptions = 0; @@ -340,7 +340,7 @@ private interface DimensionValueFilter * Approximate matching is used, so there is a small probability that rows that are not reoccurences are discarded. 
*/ @VisibleForTesting - static class UngroupedRowDimensionValueFilter implements DimensionValueFilter + static class DedupRowDimensionValueFilter implements DimensionValueFilter { // A bloom filter is used to approximately group rows by query granularity. These values assume // time chunks have fewer than BLOOM_FILTER_EXPECTED_INSERTIONS rows. With the below values, the @@ -351,23 +351,23 @@ static class UngroupedRowDimensionValueFilter implements DimensionValueFilter private static final int BLOOM_FILTER_EXPECTED_INSERTIONS = 100_000_000; private static final double BLOOM_FILTER_EXPECTED_FALSE_POSITIVE_PROBABILTY = 0.001; - private final GroupedRowDimensionValueFilter delegate; + private final PassthroughRowDimensionValueFilter delegate; private final TimeDimTupleFactory timeDimTupleFactory; private final BloomFilter timeDimTupleBloomFilter; - UngroupedRowDimensionValueFilter(Granularity queryGranularity) + DedupRowDimensionValueFilter(Granularity queryGranularity) { this(queryGranularity, BLOOM_FILTER_EXPECTED_INSERTIONS, BLOOM_FILTER_EXPECTED_FALSE_POSITIVE_PROBABILTY); } @VisibleForTesting // to allow controlling false positive rate of bloom filter - UngroupedRowDimensionValueFilter( + DedupRowDimensionValueFilter( Granularity queryGranularity, int bloomFilterExpectedInsertions, double bloomFilterFalsePositiveProbability ) { - delegate = new GroupedRowDimensionValueFilter(); + delegate = new PassthroughRowDimensionValueFilter(); timeDimTupleFactory = new TimeDimTupleFactory(queryGranularity); timeDimTupleBloomFilter = BloomFilter.create( TimeDimTupleFunnel.INSTANCE, @@ -404,12 +404,16 @@ public Map getIntervalToMaxDimensionValue() } } - private static class GroupedRowDimensionValueFilter implements DimensionValueFilter + /** + * Accepts all input rows, even if they are reoccurrences of timestamps with the same query granularity and dimension + * value. 
+ */ + private static class PassthroughRowDimensionValueFilter implements DimensionValueFilter { private final Map intervalToMinDimensionValue; private final Map intervalToMaxDimensionValue; - GroupedRowDimensionValueFilter() + PassthroughRowDimensionValueFilter() { this.intervalToMinDimensionValue = new HashMap<>(); this.intervalToMaxDimensionValue = new HashMap<>(); diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeTask.java index 0c369bf0f106..56865750fa6d 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeTask.java @@ -95,7 +95,7 @@ private static Table createIntervalAndIntegerToSha ShardSpec currShardSpec = intervalAndIntegerToShardSpec.get(p.getInterval(), p.getPartitionId()); Preconditions.checkArgument( currShardSpec == null || p.getShardSpec().equals(currShardSpec), - "interval %s, partitionId %d mismatched shard specs: %s", + "interval %s, partitionId %s mismatched shard specs: %s", p.getInterval(), p.getPartitionId(), partitionLocations @@ -125,7 +125,7 @@ ShardSpec createShardSpec(TaskToolbox toolbox, Interval interval, int partitionI { return Preconditions.checkNotNull( intervalAndIntegerToShardSpec.get(interval, partitionId), - "no shard spec exists for interval %s, partitionId %d: %s", + "no shard spec exists for interval %s, partitionId %s: %s", interval, partitionId, intervalAndIntegerToShardSpec.rowMap() diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java 
b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java index e627068e0e92..e7334ba0cf45 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java @@ -346,8 +346,8 @@ public void preservesMinAndMaxWhenAssumeGroupedFalse() .tuningConfig(tuningConfig) .dataSchema(dataSchema) .inputSource(inlineInputSource) - .ungroupedRowDimValueFilterSupplier( - () -> new PartialDimensionDistributionTask.UngroupedRowDimensionValueFilter( + .dedupRowDimValueFilterSupplier( + () -> new PartialDimensionDistributionTask.DedupRowDimensionValueFilter( dataSchema.getGranularitySpec().getQueryGranularity(), smallBloomFilter, manyFalsePositiveBloomFilter @@ -412,8 +412,8 @@ private static class PartialDimensionDistributionTaskBuilder ParallelIndexTestingFactory.createDataSchema(ParallelIndexTestingFactory.INPUT_INTERVALS); private IndexTaskClientFactory taskClientFactory = ParallelIndexTestingFactory.TASK_CLIENT_FACTORY; - private Supplier - ungroupedRowDimValueFilterSupplier = null; + private Supplier dedupRowDimValueFilterSupplier = + null; @SuppressWarnings("SameParameterValue") PartialDimensionDistributionTaskBuilder id(String id) @@ -448,11 +448,11 @@ PartialDimensionDistributionTaskBuilder taskClientFactory( return this; } - PartialDimensionDistributionTaskBuilder ungroupedRowDimValueFilterSupplier( - Supplier ungroupedRowDimValueFilterSupplier + PartialDimensionDistributionTaskBuilder dedupRowDimValueFilterSupplier( + Supplier dedupRowDimValueFilterSupplier ) { - this.ungroupedRowDimValueFilterSupplier = ungroupedRowDimValueFilterSupplier; + this.dedupRowDimValueFilterSupplier = dedupRowDimValueFilterSupplier; return this; } @@ -461,12 +461,12 @@ PartialDimensionDistributionTask build() 
ParallelIndexIngestionSpec ingestionSpec = ParallelIndexTestingFactory.createIngestionSpec(inputSource, INPUT_FORMAT, tuningConfig, dataSchema); - Supplier supplier = - ungroupedRowDimValueFilterSupplier == null - ? () -> new PartialDimensionDistributionTask.UngroupedRowDimensionValueFilter( + Supplier supplier = + dedupRowDimValueFilterSupplier == null + ? () -> new PartialDimensionDistributionTask.DedupRowDimensionValueFilter( dataSchema.getGranularitySpec().getQueryGranularity() ) - : ungroupedRowDimValueFilterSupplier; + : dedupRowDimValueFilterSupplier; return new PartialDimensionDistributionTask( id, From 5f10caeebf96e3de62272e86a4f86864e872db9c Mon Sep 17 00:00:00 2001 From: Chi Cao Minh Date: Wed, 4 Dec 2019 18:23:32 -0800 Subject: [PATCH 06/17] Fix first shard spec, partition serde, single subtask --- ...PartitionCachingLocalSegmentAllocator.java | 4 ++++ .../parallel/ParallelIndexSupervisorTask.java | 6 ++++- .../parallel/distribution/Partitions.java | 8 +++++++ ...itionCachingLocalSegmentAllocatorTest.java | 23 ++++++++++++++++--- .../PartialRangeSegmentGenerateTaskTest.java | 5 +++- .../parallel/distribution/PartitionsTest.java | 7 ++++++ 6 files changed, 48 insertions(+), 5 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java index 3ef5bd28328c..59cf8513fc3a 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java @@ -103,6 +103,9 @@ private List translatePartitions( String[] uniquePartitions = partitions.stream().distinct().toArray(String[]::new); int numUniquePartition = uniquePartitions.length; + // First partition starts with null (see 
StringPartitionChunk.isStart()) + uniquePartitions[0] = null; + List segmentIds = IntStream.range(0, numUniquePartition - 1) .mapToObj(i -> createSegmentIdWithShardSpec( @@ -132,6 +135,7 @@ private SegmentIdWithShardSpec createLastSegmentIdWithShardSpec( int partitionNum ) { + // Last partition ends with null (see StringPartitionChunk.isEnd()) return createSegmentIdWithShardSpec(interval, version, partitionStart, null, partitionNum); } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java index dd13cbf2b21e..130a05cc975f 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java @@ -512,7 +512,11 @@ private void initializeSubTaskCleaner() private boolean isParallelMode() { - return baseInputSource.isSplittable() && ingestionSchema.getTuningConfig().getMaxNumConcurrentSubTasks() > 1; + // Range partitioning is not implemented for runSequential() (but hash partitioning is) + int minRequiredNumConcurrentSubTasks = useRangePartitions() ? 
1 : 2; + + return baseInputSource.isSplittable() + && ingestionSchema.getTuningConfig().getMaxNumConcurrentSubTasks() >= minRequiredNumConcurrentSubTasks; } /** diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/Partitions.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/Partitions.java index d3f967923471..f19bb4fea406 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/Partitions.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/Partitions.java @@ -22,6 +22,7 @@ import com.google.common.collect.ForwardingList; import com.google.common.collect.ImmutableList; +import java.util.ArrayList; import java.util.List; /** @@ -31,6 +32,13 @@ public class Partitions extends ForwardingList implements List { private final List delegate; + // For jackson + @SuppressWarnings("unused") + private Partitions() + { + delegate = new ArrayList<>(); + } + public Partitions(String... 
partitions) { delegate = ImmutableList.copyOf(partitions); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java index b4ab9f77a30d..590069bf946f 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java @@ -174,19 +174,36 @@ public void allocatesCorrectShardSpecsForLastPartitionWithFrequentMax() @SuppressWarnings("SameParameterValue") private void testAllocate(InputRow row, Interval interval, int partitionNum) { - testAllocate(row, interval, partitionNum, INTERVAL_TO_PARTITONS.get(interval).get(partitionNum + 1)); + String partitionEnd = getPartitionEnd(interval, partitionNum); + testAllocate(row, interval, partitionNum, partitionEnd); + } + + @Nullable + private static String getPartitionEnd(Interval interval, int partitionNum) + { + Partitions partitions = INTERVAL_TO_PARTITONS.get(interval); + boolean isLastPartition = (partitionNum + 1) == partitions.size(); + return isLastPartition ? null : partitions.get(partitionNum + 1); } private void testAllocate(InputRow row, Interval interval, int partitionNum, @Nullable String partitionEnd) { - testAllocate(row, interval, partitionNum, INTERVAL_TO_PARTITONS.get(interval).get(partitionNum), partitionEnd); + String partitionStart = getPartitionStart(interval, partitionNum); + testAllocate(row, interval, partitionNum, partitionStart, partitionEnd); + } + + @Nullable + private static String getPartitionStart(Interval interval, int partitionNum) + { + boolean isFirstPartition = partitionNum == 0; + return isFirstPartition ? 
null : INTERVAL_TO_PARTITONS.get(interval).get(partitionNum); } private void testAllocate( InputRow row, Interval interval, int partitionNum, - String partitionStart, + @Nullable String partitionStart, @Nullable String partitionEnd ) { diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTaskTest.java index 67a4919cd9df..9bfc1f53fbf1 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTaskTest.java @@ -20,6 +20,7 @@ package org.apache.druid.indexing.common.task.batch.parallel; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.ImmutableMap; import org.apache.druid.data.input.InputFormat; import org.apache.druid.data.input.InputSource; import org.apache.druid.data.input.impl.InlineInputSource; @@ -27,6 +28,8 @@ import org.apache.druid.indexer.partitions.HashedPartitionsSpec; import org.apache.druid.indexer.partitions.PartitionsSpec; import org.apache.druid.indexing.common.task.IndexTaskClientFactory; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.Partitions; +import org.apache.druid.java.util.common.Intervals; import org.apache.druid.segment.TestHelper; import org.apache.druid.segment.indexing.DataSchema; import org.hamcrest.Matchers; @@ -141,7 +144,7 @@ PartialRangeSegmentGenerateTask build() ParallelIndexTestingFactory.NUM_ATTEMPTS, ingestionSpec, ParallelIndexTestingFactory.CONTEXT, - Collections.emptyMap(), + ImmutableMap.of(Intervals.ETERNITY, new Partitions("a")), ParallelIndexTestingFactory.INDEXING_SERVICE_CLIENT, taskClientFactory, ParallelIndexTestingFactory.APPENDERATORS_MANAGER diff --git 
a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionsTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionsTest.java index 6564ec8db368..861ec2323b57 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionsTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionsTest.java @@ -19,6 +19,7 @@ package org.apache.druid.indexing.common.task.batch.parallel.distribution; +import org.apache.druid.segment.TestHelper; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -57,4 +58,10 @@ public void cannotBeIndirectlyModified() Assert.assertEquals(Arrays.asList(originalValues), target); Assert.assertNotEquals(Arrays.asList(values), target); } + + @Test + public void serializesDeserializes() + { + TestHelper.testSerializesDeserializes(TestHelper.JSON_MAPPER, target); + } } From c338bc0ac48647ba2b9e0ac912f48195bed5df20 Mon Sep 17 00:00:00 2001 From: Chi Cao Minh Date: Wed, 4 Dec 2019 19:27:26 -0800 Subject: [PATCH 07/17] Fix first partition check in test --- .../RangePartitionMultiPhaseParallelIndexingTest.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java index 0ab776864251..39e8fabc4a77 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java @@ -236,11 +236,13 @@ private static void 
assertValuesInRange(List values, DataSegment segment { SingleDimensionShardSpec shardSpec = (SingleDimensionShardSpec) segment.getShardSpec(); String start = shardSpec.getStart(); - Assert.assertNotNull(start); String end = shardSpec.getEnd(); + Assert.assertTrue(shardSpec.toString(), start != null || end != null); for (String value : values) { - Assert.assertThat(value.compareTo(start), Matchers.greaterThanOrEqualTo(0)); + if (start != null) { + Assert.assertThat(value.compareTo(start), Matchers.greaterThanOrEqualTo(0)); + } if (end != null) { Assert.assertThat(value.compareTo(end), Matchers.lessThan(0)); From 83ab7a86a24c1eb739c3f6e44974d68b188ab898 Mon Sep 17 00:00:00 2001 From: Chi Cao Minh Date: Thu, 5 Dec 2019 21:33:29 -0800 Subject: [PATCH 08/17] Misc rewording/refactoring to address code review --- docs/ingestion/native-batch.md | 14 ++-- ...PartitionCachingLocalSegmentAllocator.java | 67 ++++++------------- .../GeneratedPartitionsMetadataReport.java | 8 +-- ...etadata.java => GenericPartitionStat.java} | 8 +-- .../parallel/ParallelIndexSupervisorTask.java | 24 ++++--- .../PartialDimensionDistributionTask.java | 6 +- ...egmentGenerateParallelIndexTaskRunner.java | 10 +-- .../PartialRangeSegmentGenerateTask.java | 20 +++--- ...rtitions.java => PartitionBoundaries.java} | 32 +++++++-- .../distribution/StringDistribution.java | 8 +-- .../parallel/distribution/StringSketch.java | 8 +-- ...itionCachingLocalSegmentAllocatorTest.java | 52 +++----------- ...est.java => GenericPartitionStatTest.java} | 6 +- .../PartialDimensionDistributionTaskTest.java | 31 +++++---- .../PartialRangeSegmentGenerateTaskTest.java | 4 +- ...rtitionMultiPhaseParallelIndexingTest.java | 6 +- ...Test.java => PartitionBoundariesTest.java} | 32 ++++++--- .../distribution/StringSketchMergerTest.java | 10 +-- .../distribution/StringSketchTest.java | 67 +++++++++---------- 19 files changed, 200 insertions(+), 213 deletions(-) rename 
indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/{PartitionMetadata.java => GenericPartitionStat.java} (90%) rename indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/{Partitions.java => PartitionBoundaries.java} (54%) rename indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/{PartitionMetadataTest.java => GenericPartitionStatTest.java} (94%) rename indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/{PartitionsTest.java => PartitionBoundariesTest.java} (67%) diff --git a/docs/ingestion/native-batch.md b/docs/ingestion/native-batch.md index 0202e937bfea..68ba38674cea 100644 --- a/docs/ingestion/native-batch.md +++ b/docs/ingestion/native-batch.md @@ -244,10 +244,13 @@ Currently only one splitHintSpec, i.e., `segments`, is available. PartitionsSpec is used to describe the secondary partitioning method. You should use different partitionsSpec depending on the [rollup mode](../ingestion/index.md#rollup) you want. For perfect rollup, you should use either `hashed` (partitioning based on the hash of dimensions in each row) or -`single_dim` (based on ranges of a single dimension. For best-effort rollup, you should use `dynamic`. +`single_dim` (based on ranges of a single dimension). For best-effort rollup, you should use `dynamic`. -For perfect rollup, `hashed` partitioning is recommended in most cases, as it will improve indexing -performance and create more uniformly sized data segments relative to single-dimension partitioning. +The three `partitionsSpec` types have different pros and cons: +- `dynamic`: Fastest ingestion speed. Guarantees a well-balanced distribution in segment size. Only best-effort rollup. +- `hashed`: Moderate ingestion speed. Creates a well-balanced distribution in segment size. Allows perfect rollup. +- `single_dim`: Slowest ingestion speed. 
Segment sizes may be skewed depending on the partition key, but the broker can + use the partition information to efficiently prune segments early to speed up queries. Allows perfect rollup. #### Hash-based partitioning @@ -261,7 +264,10 @@ performance and create more uniformly sized data segments relative to single-dim > Single-dimension range partitioning currently requires the > [druid-datasketches](../development/extensions-core/datasketches-extension.md) -> extension to be added to the classpath. +> extension to be [loaded from the classpath](../development/extensions.md#loading-extensions-from-the-classpath). + +> Because single-dimension range partitioning makes two passes over the input, the index task may fail if the input changes +> in between the two passes. |property|description|default|required?| |--------|-----------|-------|---------| diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java index 59cf8513fc3a..977a9bf2fc49 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocator.java @@ -22,14 +22,13 @@ import com.google.common.collect.Maps; import org.apache.druid.data.input.InputRow; import org.apache.druid.indexing.common.TaskToolbox; -import org.apache.druid.indexing.common.task.batch.parallel.distribution.Partitions; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.PartitionBoundaries; import org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec; import org.apache.druid.timeline.partition.SingleDimensionShardSpec; import org.joda.time.Interval; import javax.annotation.Nullable; import java.io.IOException; -import java.util.ArrayList; import
java.util.Collections; import java.util.List; import java.util.Map; @@ -46,7 +45,7 @@ public class RangePartitionCachingLocalSegmentAllocator implements IndexTaskSegm { private final String dataSource; private final String partitionDimension; - private final Map intervalsToPartitions; + private final Map intervalsToPartitions; private final IndexTaskSegmentAllocator delegate; public RangePartitionCachingLocalSegmentAllocator( @@ -55,7 +54,7 @@ public RangePartitionCachingLocalSegmentAllocator( String supervisorTaskId, String dataSource, String partitionDimension, - Map intervalsToPartitions + Map intervalsToPartitions ) throws IOException { this.dataSource = dataSource; @@ -76,10 +75,10 @@ private Map> getIntervalToSegmentIds(Func Maps.newHashMapWithExpectedSize(intervalsToPartitions.size()); intervalsToPartitions.forEach( - (interval, partitions) -> + (interval, partitionBoundaries) -> intervalToSegmentIds.put( interval, - translatePartitions(interval, partitions, versionFinder) + translatePartitionBoundaries(interval, partitionBoundaries, versionFinder) ) ); @@ -87,56 +86,28 @@ private Map> getIntervalToSegmentIds(Func } /** - * Translate {@link org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution} partititions - * into the corresponding {@link org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec} with segment id. + * Translate {@link PartitionBoundaries} into the corresponding + * {@link org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec} with segment id. 
*/ - private List translatePartitions( + private List translatePartitionBoundaries( Interval interval, - Partitions partitions, + PartitionBoundaries partitionBoundaries, Function versionFinder ) { - if (partitions.isEmpty()) { + if (partitionBoundaries.isEmpty()) { return Collections.emptyList(); } - String[] uniquePartitions = partitions.stream().distinct().toArray(String[]::new); - int numUniquePartition = uniquePartitions.length; - - // First partition starts with null (see StringPartitionChunk.isStart()) - uniquePartitions[0] = null; - - List segmentIds = - IntStream.range(0, numUniquePartition - 1) - .mapToObj(i -> createSegmentIdWithShardSpec( - interval, - versionFinder.apply(interval), - uniquePartitions[i], - uniquePartitions[i + 1], - i - )) - .collect(Collectors.toCollection(ArrayList::new)); - segmentIds.add( - createLastSegmentIdWithShardSpec( - interval, - versionFinder.apply(interval), - uniquePartitions[numUniquePartition - 1], - segmentIds.size() - ) - ); - - return segmentIds; - } - - private SegmentIdWithShardSpec createLastSegmentIdWithShardSpec( - Interval interval, - String version, - String partitionStart, - int partitionNum - ) - { - // Last partition ends with null (see StringPartitionChunk.isEnd()) - return createSegmentIdWithShardSpec(interval, version, partitionStart, null, partitionNum); + return IntStream.range(0, partitionBoundaries.size() - 1) + .mapToObj(i -> createSegmentIdWithShardSpec( + interval, + versionFinder.apply(interval), + partitionBoundaries.get(i), + partitionBoundaries.get(i + 1), + i + )) + .collect(Collectors.toList()); } private SegmentIdWithShardSpec createSegmentIdWithShardSpec( diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedPartitionsMetadataReport.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedPartitionsMetadataReport.java index 9b50f9f7f37d..021422bd3dd1 100644 --- 
a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedPartitionsMetadataReport.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GeneratedPartitionsMetadataReport.java @@ -25,19 +25,19 @@ import java.util.List; /** - * Report containing the {@link PartitionMetadata}s created by a {@link PartialSegmentGenerateTask}. This report is + * Report containing the {@link GenericPartitionStat}s created by a {@link PartialSegmentGenerateTask}. This report is * collected by {@link ParallelIndexSupervisorTask} and used to generate {@link PartialGenericSegmentMergeIOConfig}. */ -class GeneratedPartitionsMetadataReport extends GeneratedPartitionsReport implements SubTaskReport +class GeneratedPartitionsMetadataReport extends GeneratedPartitionsReport implements SubTaskReport { public static final String TYPE = "generated_partitions_metadata"; @JsonCreator GeneratedPartitionsMetadataReport( @JsonProperty("taskId") String taskId, - @JsonProperty("partitionStats") List partitionMetadata + @JsonProperty("partitionStats") List partitionStats ) { - super(taskId, partitionMetadata); + super(taskId, partitionStats); } } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartitionMetadata.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStat.java similarity index 90% rename from indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartitionMetadata.java rename to indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStat.java index e8f5c4a5503a..5f4d16db2b19 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartitionMetadata.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStat.java @@ -28,12 +28,12 @@ import 
java.util.Objects; /** - * Partition description ({@link ShardSpec}) and statistics created by {@link PartialSegmentGenerateTask}. Each + * Generic partition description ({@link ShardSpec}) and statistics created by {@link PartialSegmentGenerateTask}. Each * partition is a set of data of the same time chunk (primary partition key) and the same {@link ShardSpec} (secondary * partition key). The {@link ShardSpec} is later used by {@link PartialGenericSegmentMergeTask} to merge the partial * segments. */ -public class PartitionMetadata extends PartitionStat +public class GenericPartitionStat extends PartitionStat { private static final String PROP_SHARD_SPEC = "shardSpec"; @@ -41,7 +41,7 @@ public class PartitionMetadata extends PartitionStat private final ShardSpec shardSpec; @JsonCreator - public PartitionMetadata( + public GenericPartitionStat( @JsonProperty("taskExecutorHost") String taskExecutorHost, @JsonProperty("taskExecutorPort") int taskExecutorPort, @JsonProperty("useHttps") boolean useHttps, @@ -80,7 +80,7 @@ public boolean equals(Object o) if (!super.equals(o)) { return false; } - PartitionMetadata that = (PartitionMetadata) o; + GenericPartitionStat that = (GenericPartitionStat) o; return Objects.equals(shardSpec, that.shardSpec); } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java index 130a05cc975f..444ad6e3cdd6 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java @@ -58,7 +58,7 @@ import org.apache.druid.indexing.common.task.TaskResource; import org.apache.druid.indexing.common.task.Tasks; import 
org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexTaskRunner.SubTaskSpecStatus; -import org.apache.druid.indexing.common.task.batch.parallel.distribution.Partitions; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.PartitionBoundaries; import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution; import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistributionMerger; import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch; @@ -323,7 +323,7 @@ PartialDimensionDistributionParallelIndexTaskRunner createPartialDimensionDistri @VisibleForTesting PartialRangeSegmentGenerateParallelIndexTaskRunner createPartialRangeSegmentGenerateRunner( TaskToolbox toolbox, - Map intervalToPartitions + Map intervalToPartitions ) { return new PartialRangeSegmentGenerateParallelIndexTaskRunner( @@ -394,7 +394,11 @@ private static void assertDataSketchesAvailable() new StringSketch(); } catch (NoClassDefFoundError e) { - throw new ISE(e, "DataSketches is unvailable. Try adding the druid-datasketches extension to the classpath."); + throw new ISE( + e, + "DataSketches is unavailable." + + " Try loading the druid-datasketches extension from the classpath for the overlord and middleManagers/indexers."
+ ); } } @@ -605,7 +609,7 @@ private TaskStatus runRangePartitionMultiPhaseParallel(TaskToolbox toolbox) thro return TaskStatus.failure(getId()); } - Map intervalToPartitions = + Map intervalToPartitions = determineAllRangePartitions(distributionRunner.getReports().values()); if (intervalToPartitions.isEmpty()) { @@ -615,7 +619,7 @@ private TaskStatus runRangePartitionMultiPhaseParallel(TaskToolbox toolbox) thro return TaskStatus.success(getId(), msg); } - ParallelIndexTaskRunner> indexingRunner = + ParallelIndexTaskRunner> indexingRunner = createRunner(toolbox, tb -> createPartialRangeSegmentGenerateRunner(tb, intervalToPartitions)); TaskState indexingState = runNextPhase(indexingRunner); @@ -643,7 +647,7 @@ private TaskStatus runRangePartitionMultiPhaseParallel(TaskToolbox toolbox) thro return TaskStatus.fromCode(getId(), mergeState); } - private Map determineAllRangePartitions(Collection reports) + private Map determineAllRangePartitions(Collection reports) { Multimap intervalToDistributions = ArrayListMultimap.create(); reports.forEach(report -> { @@ -654,7 +658,7 @@ private Map determineAllRangePartitions(Collection distributions) + private PartitionBoundaries determineRangePartition(Collection distributions) { StringDistributionMerger distributionMerger = new StringSketchMerger(); distributions.forEach(distributionMerger::merge); @@ -663,7 +667,7 @@ private Partitions determineRangePartition(Collection distri SingleDimensionPartitionsSpec partitionsSpec = (SingleDimensionPartitionsSpec) ingestionSchema.getTuningConfig().getGivenOrDefaultPartitionsSpec(); - final Partitions partitions; + final PartitionBoundaries partitions; Integer targetRowsPerSegment = partitionsSpec.getTargetRowsPerSegment(); if (targetRowsPerSegment == null) { partitions = mergedDistribution.getEvenPartitionsByMaxSize(partitionsSpec.getMaxRowsPerSegment()); @@ -693,10 +697,10 @@ private static Map, List> groupHa } private static Map, List> groupGenericPartitionLocationsPerPartition( - 
Map> subTaskIdToReport + Map> subTaskIdToReport ) { - BiFunction createPartitionLocationFunction = + BiFunction createPartitionLocationFunction = (subtaskId, partitionStat) -> new GenericPartitionLocation( partitionStat.getTaskExecutorHost(), diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java index 7b013b8cb926..ca4c1838b939 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java @@ -192,9 +192,9 @@ public TaskStatus runTask(TaskToolbox toolbox) throws Exception ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig(); SingleDimensionPartitionsSpec partitionsSpec = (SingleDimensionPartitionsSpec) tuningConfig.getPartitionsSpec(); - Preconditions.checkNotNull(partitionsSpec); + Preconditions.checkNotNull(partitionsSpec, "partitionsSpec required in tuningConfig"); String partitionDimension = partitionsSpec.getPartitionDimension(); - Preconditions.checkNotNull(partitionDimension, "partitionDimension required"); + Preconditions.checkNotNull(partitionDimension, "partitionDimension required in partitionsSpec"); boolean isAssumeGrouped = partitionsSpec.isAssumeGrouped(); InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource( @@ -294,7 +294,7 @@ private Map determineDistribution( } } - // UngroupedDimValueFilter may not accept the min/max dimensionValue. If needed, add the min/max + // DedupRowDimensionValueFilter may not accept the min/max dimensionValue. If needed, add the min/max // values to the distributions so they have an accurate min/max. 
dimValueFilter.getIntervalToMinDimensionValue() .forEach((interval, min) -> intervalToDistribution.get(interval).putIfNewMin(min)); diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateParallelIndexTaskRunner.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateParallelIndexTaskRunner.java index 06f6ddb1d2b9..71f084dab86e 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateParallelIndexTaskRunner.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateParallelIndexTaskRunner.java @@ -24,7 +24,7 @@ import org.apache.druid.data.input.InputSplit; import org.apache.druid.indexing.common.TaskToolbox; import org.apache.druid.indexing.common.task.IndexTaskClientFactory; -import org.apache.druid.indexing.common.task.batch.parallel.distribution.Partitions; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.PartitionBoundaries; import org.apache.druid.segment.realtime.appenderator.AppenderatorsManager; import org.joda.time.Interval; @@ -36,11 +36,11 @@ * @see PartialHashSegmentMergeParallelIndexTaskRunner */ class PartialRangeSegmentGenerateParallelIndexTaskRunner - extends InputSourceSplitParallelIndexTaskRunner> + extends InputSourceSplitParallelIndexTaskRunner> { private final IndexTaskClientFactory taskClientFactory; private final AppenderatorsManager appenderatorsManager; - private final Map intervalToPartitions; + private final Map intervalToPartitions; PartialRangeSegmentGenerateParallelIndexTaskRunner( TaskToolbox toolbox, @@ -49,7 +49,7 @@ class PartialRangeSegmentGenerateParallelIndexTaskRunner ParallelIndexIngestionSpec ingestionSchema, Map context, IndexingServiceClient indexingServiceClient, - Map intervalToPartitions + Map intervalToPartitions ) { this( @@ -73,7 +73,7 
@@ class PartialRangeSegmentGenerateParallelIndexTaskRunner ParallelIndexIngestionSpec ingestionSchema, Map context, IndexingServiceClient indexingServiceClient, - Map intervalToPartitions, + Map intervalToPartitions, IndexTaskClientFactory taskClientFactory, AppenderatorsManager appenderatorsManager ) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java index 00ec70f22ef9..3cecf67c8690 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java @@ -32,7 +32,7 @@ import org.apache.druid.indexing.common.task.IndexTaskSegmentAllocator; import org.apache.druid.indexing.common.task.RangePartitionCachingLocalSegmentAllocator; import org.apache.druid.indexing.common.task.TaskResource; -import org.apache.druid.indexing.common.task.batch.parallel.distribution.Partitions; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.PartitionBoundaries; import org.apache.druid.indexing.common.task.batch.parallel.iterator.RangePartitionIndexTaskInputRowIteratorBuilder; import org.apache.druid.indexing.worker.ShuffleDataSegmentPusher; import org.apache.druid.segment.realtime.appenderator.AppenderatorsManager; @@ -58,7 +58,7 @@ public class PartialRangeSegmentGenerateTask extends PartialSegmentGenerateTask< private final String supervisorTaskId; private final int numAttempts; private final ParallelIndexIngestionSpec ingestionSchema; - private final Map intervalToPartitions; + private final Map intervalToPartitions; @JsonCreator public PartialRangeSegmentGenerateTask( @@ -70,7 +70,7 @@ public PartialRangeSegmentGenerateTask( @JsonProperty("numAttempts") int 
numAttempts, // zero-based counting @JsonProperty(PROP_SPEC) ParallelIndexIngestionSpec ingestionSchema, @JsonProperty("context") Map context, - @JsonProperty("intervalToPartitions") Map intervalToPartitions, + @JsonProperty("intervalToPartitions") Map intervalToPartitions, @JacksonInject IndexingServiceClient indexingServiceClient, @JacksonInject IndexTaskClientFactory taskClientFactory, @JacksonInject AppenderatorsManager appenderatorsManager @@ -130,7 +130,7 @@ public String getSupervisorTaskId() } @JsonProperty - public Map getIntervalToPartitions() + public Map getIntervalToPartitions() { return intervalToPartitions; } @@ -163,15 +163,15 @@ IndexTaskSegmentAllocator createSegmentAllocator(TaskToolbox toolbox) throws IOE @Override GeneratedPartitionsMetadataReport createGeneratedPartitionsReport(TaskToolbox toolbox, List segments) { - List partitionsMetadata = segments.stream() - .map(segment -> createPartitionStat(toolbox, segment)) - .collect(Collectors.toList()); - return new GeneratedPartitionsMetadataReport(getId(), partitionsMetadata); + List partitionStats = segments.stream() + .map(segment -> createPartitionStat(toolbox, segment)) + .collect(Collectors.toList()); + return new GeneratedPartitionsMetadataReport(getId(), partitionStats); } - private PartitionMetadata createPartitionStat(TaskToolbox toolbox, DataSegment segment) + private GenericPartitionStat createPartitionStat(TaskToolbox toolbox, DataSegment segment) { - return new PartitionMetadata( + return new GenericPartitionStat( toolbox.getTaskExecutorNode().getHost(), toolbox.getTaskExecutorNode().getPortToUse(), toolbox.getTaskExecutorNode().isEnableTlsPort(), diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/Partitions.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionBoundaries.java similarity index 54% rename from 
indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/Partitions.java rename to indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionBoundaries.java index f19bb4fea406..dc14ace91c6e 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/Partitions.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionBoundaries.java @@ -20,28 +20,48 @@ package org.apache.druid.indexing.common.task.batch.parallel.distribution; import com.google.common.collect.ForwardingList; -import com.google.common.collect.ImmutableList; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import java.util.List; +import java.util.stream.Collectors; /** - * Convenience wrapper to make code more readable. + * List of range partition boundaries. */ -public class Partitions extends ForwardingList implements List +public class PartitionBoundaries extends ForwardingList implements List { private final List delegate; // For jackson @SuppressWarnings("unused") - private Partitions() + private PartitionBoundaries() { delegate = new ArrayList<>(); } - public Partitions(String... partitions) + /** + * @param partitions Elements corresponding to evenly-spaced fractional ranks of the distribution + */ + public PartitionBoundaries(String... 
partitions) { - delegate = ImmutableList.copyOf(partitions); + if (partitions.length == 0) { + delegate = Collections.emptyList(); + return; + } + + List partitionBoundaries = Arrays.stream(partitions) + .distinct() + .collect(Collectors.toCollection(ArrayList::new)); + + // First partition starts with null (see StringPartitionChunk.isStart()) + partitionBoundaries.set(0, null); + + // Last partition ends with null (see StringPartitionChunk.isEnd()) + partitionBoundaries.add(null); + + delegate = Collections.unmodifiableList(partitionBoundaries); } @Override diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringDistribution.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringDistribution.java index 116dea63f2c2..5fbd8d61abb4 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringDistribution.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringDistribution.java @@ -50,17 +50,17 @@ public interface StringDistribution * Split the distribution in the fewest number of evenly-sized partitions while honoring a max * partition size. * - * @return Array of elements that correspond to the endpoints of evenly-sized partitions of the + * @return List of elements that correspond to the endpoints of evenly-sized partitions of the * sorted elements. */ - Partitions getEvenPartitionsByMaxSize(int maxSize); + PartitionBoundaries getEvenPartitionsByMaxSize(int maxSize); /** * Split the distribution in the fewest number of evenly-sized partitions while honoring a target * partition size (actual partition sizes may be slightly lower or higher). 
* - * @return Array of elements that correspond to the endpoints of evenly-sized partitions of the + * @return List of elements that correspond to the endpoints of evenly-sized partitions of the * sorted elements. */ - Partitions getEvenPartitionsByTargetSize(int targetSize); + PartitionBoundaries getEvenPartitionsByTargetSize(int targetSize); } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketch.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketch.java index 8fff0f742ee4..8203ffad04cc 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketch.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketch.java @@ -86,7 +86,7 @@ public void putIfNewMax(String string) } @Override - public Partitions getEvenPartitionsByMaxSize(int maxSize) + public PartitionBoundaries getEvenPartitionsByMaxSize(int maxSize) { Preconditions.checkArgument(maxSize > 0, "maxSize must be positive but is %s", maxSize); long n = delegate.getN(); @@ -97,7 +97,7 @@ public Partitions getEvenPartitionsByMaxSize(int maxSize) } @Override - public Partitions getEvenPartitionsByTargetSize(int targetSize) + public PartitionBoundaries getEvenPartitionsByTargetSize(int targetSize) { Preconditions.checkArgument(targetSize > 0, "targetSize must be positive but is %s", targetSize); long n = delegate.getN(); @@ -105,7 +105,7 @@ public Partitions getEvenPartitionsByTargetSize(int targetSize) return getEvenPartitionsByCount(evenPartitionCount); } - private Partitions getEvenPartitionsByCount(int evenPartitionCount) + private PartitionBoundaries getEvenPartitionsByCount(int evenPartitionCount) { Preconditions.checkArgument( evenPartitionCount > 0, @@ -113,7 +113,7 @@ private Partitions getEvenPartitionsByCount(int evenPartitionCount) evenPartitionCount ); 
String[] partitions = delegate.getQuantiles(evenPartitionCount + 1); // add 1 since this returns endpoints - return new Partitions((partitions == null) ? new String[0] : partitions); + return new PartitionBoundaries((partitions == null) ? new String[0] : partitions); } @Override diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java index 590069bf946f..c2de2200a465 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java @@ -25,7 +25,7 @@ import org.apache.druid.indexing.common.TaskToolbox; import org.apache.druid.indexing.common.actions.LockListAction; import org.apache.druid.indexing.common.actions.TaskActionClient; -import org.apache.druid.indexing.common.task.batch.parallel.distribution.Partitions; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.PartitionBoundaries; import org.apache.druid.java.util.common.DateTimes; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec; @@ -56,40 +56,26 @@ public class RangePartitionCachingLocalSegmentAllocatorTest private static final Interval INTERVAL_EMPTY = Intervals.utc(0, 1000); private static final Interval INTERVAL_SINGLETON = Intervals.utc(1000, 2000); private static final Interval INTERVAL_NORMAL = Intervals.utc(2000, 3000); - private static final Interval INTERVAL_FREQUENT_MID = Intervals.utc(3000, 4000); - private static final Interval INTERVAL_FREQUENT_MAX = Intervals.utc(5000, 6000); private static final Map INTERVAL_TO_VERSION = ImmutableMap.of( INTERVAL_EMPTY, "version-empty", INTERVAL_SINGLETON, 
"version-singleton", - INTERVAL_NORMAL, "version-normal", - INTERVAL_FREQUENT_MID, "version-frequent-mid", - INTERVAL_FREQUENT_MAX, "version-frequent-max" + INTERVAL_NORMAL, "version-normal" ); private static final String PARTITION0 = "0"; private static final String PARTITION5 = "5"; private static final String PARTITION9 = "9"; - private static final Partitions EMPTY_PARTITIONS = new Partitions(); - private static final Partitions SINGLETON_PARTITIONS = new Partitions(PARTITION0, PARTITION0); - private static final Partitions NORMAL_PARTITIONS = new Partitions(PARTITION0, PARTITION5, PARTITION9); - private static final Partitions FREQUENT_MID_PARTITIONS = new Partitions( + private static final PartitionBoundaries EMPTY_PARTITIONS = new PartitionBoundaries(); + private static final PartitionBoundaries SINGLETON_PARTITIONS = new PartitionBoundaries(PARTITION0, PARTITION0); + private static final PartitionBoundaries NORMAL_PARTITIONS = new PartitionBoundaries( PARTITION0, PARTITION5, - PARTITION5, - PARTITION9 - ); - private static final Partitions FREQUENT_MAX_PARTITIONS = new Partitions( - PARTITION0, - PARTITION5, - PARTITION9, PARTITION9 ); - private static final Map INTERVAL_TO_PARTITONS = ImmutableMap.of( + private static final Map INTERVAL_TO_PARTITONS = ImmutableMap.of( INTERVAL_EMPTY, EMPTY_PARTITIONS, INTERVAL_SINGLETON, SINGLETON_PARTITIONS, - INTERVAL_NORMAL, NORMAL_PARTITIONS, - INTERVAL_FREQUENT_MID, FREQUENT_MID_PARTITIONS, - INTERVAL_FREQUENT_MAX, FREQUENT_MAX_PARTITIONS + INTERVAL_NORMAL, NORMAL_PARTITIONS ); private RangePartitionCachingLocalSegmentAllocator target; @@ -147,28 +133,12 @@ public void allocatesCorrectShardSpecsForFirstPartition() } @Test - public void allocatesCorrectShardSpecsForLastPartitionWithoutFrequentValue() + public void allocatesCorrectShardSpecsForLastPartition() { Interval interval = INTERVAL_NORMAL; InputRow row = createInputRow(interval, PARTITION9); - testAllocate(row, interval, 
INTERVAL_TO_PARTITONS.get(interval).size() - 1, null); - } - - @Test - public void allocatesCorrectShardSpecsForLastPartitionWithFrequentMid() - { - Interval interval = INTERVAL_FREQUENT_MID; - InputRow row = createInputRow(interval, PARTITION9); - Partitions partitions = INTERVAL_TO_PARTITONS.get(interval); - testAllocate(row, interval, partitions.size() - 2, partitions.get(partitions.size() - 1), null); - } - - @Test - public void allocatesCorrectShardSpecsForLastPartitionWithFrequentMax() - { - Interval interval = INTERVAL_FREQUENT_MAX; - InputRow row = createInputRow(interval, PARTITION9); - testAllocate(row, interval, INTERVAL_TO_PARTITONS.get(interval).size() - 2, null); + int partitionNum = INTERVAL_TO_PARTITONS.get(interval).size() - 2; // -2 = -1 0-based + -1 skip null tail + testAllocate(row, interval, partitionNum, null); } @SuppressWarnings("SameParameterValue") @@ -181,7 +151,7 @@ private void testAllocate(InputRow row, Interval interval, int partitionNum) @Nullable private static String getPartitionEnd(Interval interval, int partitionNum) { - Partitions partitions = INTERVAL_TO_PARTITONS.get(interval); + PartitionBoundaries partitions = INTERVAL_TO_PARTITONS.get(interval); boolean isLastPartition = (partitionNum + 1) == partitions.size(); return isLastPartition ? 
null : partitions.get(partitionNum + 1); } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartitionMetadataTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStatTest.java similarity index 94% rename from indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartitionMetadataTest.java rename to indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStatTest.java index 3deb64d391e7..2bcac8edfd47 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartitionMetadataTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/GenericPartitionStatTest.java @@ -25,16 +25,16 @@ import org.junit.Before; import org.junit.Test; -public class PartitionMetadataTest +public class GenericPartitionStatTest { private static final ObjectMapper OBJECT_MAPPER = ParallelIndexTestingFactory.createObjectMapper(); - private PartitionMetadata target; + private GenericPartitionStat target; @Before public void setup() { - target = new PartitionMetadata( + target = new GenericPartitionStat( ParallelIndexTestingFactory.TASK_EXECUTOR_HOST, ParallelIndexTestingFactory.TASK_EXECUTOR_PORT, ParallelIndexTestingFactory.USE_HTTPS, diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java index e7334ba0cf45..9e622b83e869 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java @@ -33,7 +33,7 @@ import 
org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec; import org.apache.druid.indexing.common.TaskToolbox; import org.apache.druid.indexing.common.task.IndexTaskClientFactory; -import org.apache.druid.indexing.common.task.batch.parallel.distribution.Partitions; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.PartitionBoundaries; import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.segment.TestHelper; @@ -274,11 +274,10 @@ public void sendsCorrectReportWhenAssumeGroupedTrue() Map intervalToDistribution = report.getIntervalToDistribution(); StringDistribution distribution = Iterables.getOnlyElement(intervalToDistribution.values()); Assert.assertNotNull(distribution); - Partitions partitions = distribution.getEvenPartitionsByMaxSize(1); - Assert.assertEquals(3, partitions.size()); - Assert.assertEquals(dimensionValue, partitions.get(0)); - Assert.assertEquals(dimensionValue, partitions.get(1)); - Assert.assertEquals(dimensionValue, partitions.get(2)); + PartitionBoundaries partitions = distribution.getEvenPartitionsByMaxSize(1); + Assert.assertEquals(2, partitions.size()); + Assert.assertNull(partitions.get(0)); + Assert.assertNull(partitions.get(1)); } @Test @@ -305,10 +304,10 @@ public void groupsRowsWhenAssumeGroupedFalse() Map intervalToDistribution = report.getIntervalToDistribution(); StringDistribution distribution = Iterables.getOnlyElement(intervalToDistribution.values()); Assert.assertNotNull(distribution); - Partitions partitions = distribution.getEvenPartitionsByMaxSize(1); + PartitionBoundaries partitions = distribution.getEvenPartitionsByMaxSize(1); Assert.assertEquals(2, partitions.size()); - Assert.assertEquals(dimensionValue, partitions.get(0)); - Assert.assertEquals(dimensionValue, partitions.get(1)); + Assert.assertNull(partitions.get(0)); + Assert.assertNull(partitions.get(1)); } @Test @@ 
-323,8 +322,6 @@ public void preservesMinAndMaxWhenAssumeGroupedFalse() List dimensionValues = IntStream.range(0, minBloomFilterBits * 10) .mapToObj(i -> StringUtils.format("%010d", i)) .collect(Collectors.toCollection(ArrayList::new)); - String minDimensionValue = dimensionValues.get(0); - String maxDimensionValue = dimensionValues.get(dimensionValues.size() - 1); List rows = dimensionValues.stream() .map(d -> ParallelIndexTestingFactory.createRow(timestamp, d)) .collect(Collectors.toList()); @@ -360,10 +357,16 @@ public void preservesMinAndMaxWhenAssumeGroupedFalse() Map intervalToDistribution = report.getIntervalToDistribution(); StringDistribution distribution = Iterables.getOnlyElement(intervalToDistribution.values()); Assert.assertNotNull(distribution); - Partitions partitions = distribution.getEvenPartitionsByMaxSize(1); + PartitionBoundaries partitions = distribution.getEvenPartitionsByMaxSize(1); Assert.assertEquals(minBloomFilterBits + 3, partitions.size()); // 3 = min + max + exclusive endpoint - Assert.assertEquals(minDimensionValue, partitions.get(0)); - Assert.assertEquals(maxDimensionValue, partitions.get(partitions.size() - 1)); + + // Min + Assert.assertNull(partitions.get(0)); + Assert.assertEquals(dimensionValues.get(1), partitions.get(1)); + + // Max + Assert.assertNull(partitions.get(partitions.size() - 1)); + Assert.assertEquals(dimensionValues.get(dimensionValues.size() - 1), partitions.get(partitions.size() - 2)); } @Test diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTaskTest.java index 9bfc1f53fbf1..0e12010185e4 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTaskTest.java +++ 
b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTaskTest.java @@ -28,7 +28,7 @@ import org.apache.druid.indexer.partitions.HashedPartitionsSpec; import org.apache.druid.indexer.partitions.PartitionsSpec; import org.apache.druid.indexing.common.task.IndexTaskClientFactory; -import org.apache.druid.indexing.common.task.batch.parallel.distribution.Partitions; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.PartitionBoundaries; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.segment.TestHelper; import org.apache.druid.segment.indexing.DataSchema; @@ -144,7 +144,7 @@ PartialRangeSegmentGenerateTask build() ParallelIndexTestingFactory.NUM_ATTEMPTS, ingestionSpec, ParallelIndexTestingFactory.CONTEXT, - ImmutableMap.of(Intervals.ETERNITY, new Partitions("a")), + ImmutableMap.of(Intervals.ETERNITY, new PartitionBoundaries("a")), ParallelIndexTestingFactory.INDEXING_SERVICE_CLIENT, taskClientFactory, ParallelIndexTestingFactory.APPENDERATORS_MANAGER diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java index 39e8fabc4a77..1f75da389552 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java @@ -36,7 +36,7 @@ import org.apache.druid.indexing.common.task.IndexTaskClientFactory; import org.apache.druid.indexing.common.task.TaskResource; import org.apache.druid.indexing.common.task.TestAppenderatorsManager; -import org.apache.druid.indexing.common.task.batch.parallel.distribution.Partitions; +import 
org.apache.druid.indexing.common.task.batch.parallel.distribution.PartitionBoundaries; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.StringUtils; @@ -293,7 +293,7 @@ PartialDimensionDistributionParallelIndexTaskRunner createPartialDimensionDistri @Override PartialRangeSegmentGenerateParallelIndexTaskRunner createPartialRangeSegmentGenerateRunner( TaskToolbox toolbox, - Map intervalToPartitions + Map intervalToPartitions ) { return new TestPartialRangeSegmentGenerateRunner( @@ -346,7 +346,7 @@ private TestPartialRangeSegmentGenerateRunner( TaskToolbox toolbox, ParallelIndexSupervisorTask supervisorTask, IndexingServiceClient indexingServiceClient, - Map intervalToPartitions + Map intervalToPartitions ) { super( diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionsTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionBoundariesTest.java similarity index 67% rename from indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionsTest.java rename to indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionBoundariesTest.java index 861ec2323b57..d1b20fde71cf 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionsTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionBoundariesTest.java @@ -25,23 +25,27 @@ import org.junit.Test; import java.util.Arrays; +import java.util.Collections; +import java.util.List; -public class PartitionsTest +public class PartitionBoundariesTest { - private Partitions target; + private PartitionBoundaries target; private String[] values; + private List expected; @Before public void setup() { - values 
= new String[]{"a", "b"}; - target = new Partitions(values); + values = new String[]{"a", "dup", "dup", "z"}; + expected = Arrays.asList(null, "dup", "z", null); + target = new PartitionBoundaries(values); } @Test public void hasCorrectValues() { - Assert.assertEquals(Arrays.asList(values), target); + Assert.assertEquals(expected, target); } @Test(expected = UnsupportedOperationException.class) @@ -53,10 +57,20 @@ public void isImmutable() @Test public void cannotBeIndirectlyModified() { - String[] originalValues = Arrays.copyOf(values, values.length); - values[0] = "changed"; - Assert.assertEquals(Arrays.asList(originalValues), target); - Assert.assertNotEquals(Arrays.asList(values), target); + values[1] = "changed"; + Assert.assertEquals(expected, target); + } + + @Test + public void handlesNoValues() + { + Assert.assertEquals(Collections.emptyList(), new PartitionBoundaries()); + } + + @Test + public void handlesRepeatedValue() + { + Assert.assertEquals(Arrays.asList(null, null), new PartitionBoundaries("a", "a", "a")); } @Test diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMergerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMergerTest.java index 9ca6c07b8835..8d0c987d0630 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMergerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMergerTest.java @@ -70,11 +70,11 @@ public void mergesCorrectly() target.merge(sketch3); StringDistribution merged = target.getResult(); - Partitions partitions = merged.getEvenPartitionsByMaxSize(1); + PartitionBoundaries partitions = merged.getEvenPartitionsByMaxSize(1); Assert.assertEquals(4, partitions.size()); - Assert.assertEquals(string1, partitions.get(0)); // min - 
Assert.assertEquals(string2, partitions.get(1)); // median - Assert.assertEquals(string3, partitions.get(2)); // max - Assert.assertEquals(string3, partitions.get(3)); // max + Assert.assertNull(partitions.get(0)); + Assert.assertEquals(string2, partitions.get(1)); + Assert.assertEquals(string3, partitions.get(2)); + Assert.assertNull(partitions.get(3)); } } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java index 9381a7b33a5c..0e21c137fb75 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java @@ -155,26 +155,25 @@ public void requiresPositiveSize() public void handlesEmptySketch() { StringSketch sketch = new StringSketch(); - Partitions partitions = sketch.getEvenPartitionsByTargetSize(1); + PartitionBoundaries partitions = sketch.getEvenPartitionsByTargetSize(1); Assert.assertEquals(0, partitions.size()); } @Test public void handlesSingletonSketch() { - String value = MIN_STRING; StringSketch sketch = new StringSketch(); - sketch.put(value); - Partitions partitions = sketch.getEvenPartitionsByTargetSize(1); + sketch.put(MIN_STRING); + PartitionBoundaries partitions = sketch.getEvenPartitionsByTargetSize(1); Assert.assertEquals(2, partitions.size()); - Assert.assertEquals(value, partitions.get(0)); - Assert.assertEquals(value, partitions.get(1)); + Assert.assertNull(partitions.get(0)); + Assert.assertNull(partitions.get(1)); } @Test public void handlesMinimimumSize() { - Partitions partitions = SKETCH.getEvenPartitionsByTargetSize(1); + PartitionBoundaries partitions = SKETCH.getEvenPartitionsByTargetSize(1); assertMaxNumberOfPartitions(partitions); } @@ -187,7 
+186,7 @@ public void handlesUnevenPartitions() private static void testHandlesUnevenPartitions(int targetSize) { - Partitions partitions = SKETCH.getEvenPartitionsByTargetSize(targetSize); + PartitionBoundaries partitions = SKETCH.getEvenPartitionsByTargetSize(targetSize); assertFirstAndLastPartitionsCorrect(partitions); @@ -197,16 +196,16 @@ private static void testHandlesUnevenPartitions(int targetSize) Assert.assertThat( "targetSize=" + targetSize + " " + partitionsString, partitions.size(), - Matchers.lessThanOrEqualTo(expectedHighPartitionCount + 1) + Matchers.lessThanOrEqualTo(expectedHighPartitionCount + 2) // +2 = endpoint + null ); Assert.assertThat( "targetSize=" + targetSize + " " + partitionsString, partitions.size(), - Matchers.greaterThanOrEqualTo(expectedLowPartitionCount + 1) + Matchers.greaterThanOrEqualTo(expectedLowPartitionCount + 2) // +2 = endpoint + null ); int previous = 0; - for (int i = 1; i < partitions.size(); i++) { + for (int i = 1; i < partitions.size() - 1; i++) { int current = Integer.parseInt(partitions.get(i)); int size = current - previous; Assert.assertThat( @@ -221,14 +220,14 @@ private static void testHandlesUnevenPartitions(int targetSize) @Test public void handlesSinglePartition() { - Partitions partitions = SKETCH.getEvenPartitionsByTargetSize(NUM_STRING); + PartitionBoundaries partitions = SKETCH.getEvenPartitionsByTargetSize(NUM_STRING); assertSinglePartition(partitions); } @Test public void handlesOversizedPartition() { - Partitions partitions = SKETCH.getEvenPartitionsByTargetSize(Integer.MAX_VALUE); + PartitionBoundaries partitions = SKETCH.getEvenPartitionsByTargetSize(Integer.MAX_VALUE); assertSinglePartition(partitions); } } @@ -251,26 +250,25 @@ public void requiresPositiveSize() public void handlesEmptySketch() { StringSketch sketch = new StringSketch(); - Partitions partitions = sketch.getEvenPartitionsByMaxSize(1); + PartitionBoundaries partitions = sketch.getEvenPartitionsByMaxSize(1); Assert.assertEquals(0, 
partitions.size()); } @Test public void handlesSingletonSketch() { - String value = MIN_STRING; StringSketch sketch = new StringSketch(); - sketch.put(value); - Partitions partitions = sketch.getEvenPartitionsByMaxSize(1); + sketch.put(MIN_STRING); + PartitionBoundaries partitions = sketch.getEvenPartitionsByMaxSize(1); Assert.assertEquals(2, partitions.size()); - Assert.assertEquals(value, partitions.get(0)); - Assert.assertEquals(value, partitions.get(1)); + Assert.assertNull(partitions.get(0)); + Assert.assertNull(partitions.get(1)); } @Test public void handlesMinimimumSize() { - Partitions partitions = SKETCH.getEvenPartitionsByMaxSize(1); + PartitionBoundaries partitions = SKETCH.getEvenPartitionsByMaxSize(1); assertMaxNumberOfPartitions(partitions); } @@ -283,7 +281,7 @@ public void handlesUnevenPartitions() private static void testHandlesUnevenPartitions(int maxSize) { - Partitions partitions = SKETCH.getEvenPartitionsByMaxSize(maxSize); + PartitionBoundaries partitions = SKETCH.getEvenPartitionsByMaxSize(maxSize); assertFirstAndLastPartitionsCorrect(partitions); @@ -291,14 +289,14 @@ private static void testHandlesUnevenPartitions(int maxSize) long expectedPartitionCount = (long) Math.ceil((double) NUM_STRING / maxSize); Assert.assertEquals( "maxSize=" + maxSize + " " + partitionsString, - expectedPartitionCount + 1, + expectedPartitionCount + 2, // +2 = endpoint + null partitions.size() ); double minSize = (double) NUM_STRING / expectedPartitionCount - DELTA; int previous = 0; - for (int i = 1; i < partitions.size(); i++) { + for (int i = 1; i < partitions.size() - 1; i++) { int current = Integer.parseInt(partitions.get(i)); int size = current - previous; Assert.assertThat( @@ -318,27 +316,27 @@ private static void testHandlesUnevenPartitions(int maxSize) @Test public void handlesSinglePartition() { - Partitions partitions = SKETCH.getEvenPartitionsByMaxSize((int) Math.ceil(NUM_STRING + DELTA)); + PartitionBoundaries partitions = 
SKETCH.getEvenPartitionsByMaxSize((int) Math.ceil(NUM_STRING + DELTA)); assertSinglePartition(partitions); } @Test public void handlesOversizedPartition() { - Partitions partitions = SKETCH.getEvenPartitionsByMaxSize(Integer.MAX_VALUE); + PartitionBoundaries partitions = SKETCH.getEvenPartitionsByMaxSize(Integer.MAX_VALUE); assertSinglePartition(partitions); } } - private static void assertMaxNumberOfPartitions(Partitions partitions) + private static void assertMaxNumberOfPartitions(PartitionBoundaries partitions) { String partitionsString = toString(partitions); - Assert.assertEquals(partitionsString, NUM_STRING + 1, partitions.size()); + Assert.assertEquals(partitionsString, StringSketch.SKETCH_K + 2, partitions.size()); // +2 = endpoint + null assertFirstAndLastPartitionsCorrect(partitions); int previous = 0; - for (int i = 1; i < partitions.size(); i++) { + for (int i = 1; i < partitions.size() - 1; i++) { int current = Integer.parseInt(partitions.get(i)); Assert.assertEquals( getErrMsgPrefix(1, i) + partitionsString, @@ -350,16 +348,17 @@ private static void assertMaxNumberOfPartitions(Partitions partitions) } } - private static void assertSinglePartition(Partitions partitions) + private static void assertSinglePartition(PartitionBoundaries partitions) { - Assert.assertEquals(2, partitions.size()); + Assert.assertEquals(3, partitions.size()); // +2 = endpoint + null assertFirstAndLastPartitionsCorrect(partitions); } - private static void assertFirstAndLastPartitionsCorrect(Partitions partitions) + private static void assertFirstAndLastPartitionsCorrect(PartitionBoundaries partitions) { - Assert.assertEquals(MIN_STRING, partitions.get(0)); - Assert.assertEquals(MAX_STRING, partitions.get(partitions.size() - 1)); + Assert.assertNull(partitions.get(0)); + Assert.assertEquals(MAX_STRING, partitions.get(partitions.size() - 2)); + Assert.assertNull(partitions.get(partitions.size() - 1)); } private static String getErrMsgPrefix(int size, int i) @@ -367,7 +366,7 @@ 
private static String getErrMsgPrefix(int size, int i) return "size=" + size + " i=" + i + " of "; } - private static String toString(Partitions partitions) + private static String toString(PartitionBoundaries partitions) { String prefix = "partitions[" + partitions.size() + "]="; StringJoiner sj = new StringJoiner(" ", prefix, "]"); From ded55f994a567eb801d8a0a6f16e77981627d9d2 Mon Sep 17 00:00:00 2001 From: Chi Cao Minh Date: Fri, 6 Dec 2019 19:56:12 -0800 Subject: [PATCH 09/17] Fix doc link --- docs/ingestion/native-batch.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ingestion/native-batch.md b/docs/ingestion/native-batch.md index e55408bd72a9..f1af93590bb4 100644 --- a/docs/ingestion/native-batch.md +++ b/docs/ingestion/native-batch.md @@ -264,7 +264,7 @@ The three `partitionsSpec` types have different pros and cons: > Single-dimension range partitioning currently requires the > [druid-datasketches](../development/extensions-core/datasketches-extension.md) -> extension to be [loaded from the classpath](..development/extension.md#loading-extensions-from-the-classpath). +> extension to be [loaded from the classpath](../development/extensions.md#loading-extensions-from-the-classpath). > Because single-range partitioning makes two passes over the input, the index task may fail if the input changes > in between the two passes. @@ -968,4 +968,4 @@ A spec that applies a filter and reads a subset of the original datasource's col } ``` -This spec above will only return the `page`, `user` dimensions and `added` metric. Only rows where `page` = `Druid` will be returned. \ No newline at end of file +This spec above will only return the `page`, `user` dimensions and `added` metric. Only rows where `page` = `Druid` will be returned. 
From 15235ea36ca5738cea732e875fdfa45fefb36587 Mon Sep 17 00:00:00 2001 From: Chi Cao Minh Date: Fri, 6 Dec 2019 20:57:59 -0800 Subject: [PATCH 10/17] Split batch index integration test --- .travis.yml | 7 +++++++ .../src/test/java/org/apache/druid/tests/TestNGGroup.java | 1 + .../apache/druid/tests/indexer/ITParallelIndexTest.java | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a0ad65bdb1fe..8c77835f57fc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -290,6 +290,13 @@ jobs: docker exec -it druid-$v sh -c 'dmesg | tail -3' ; done + - &integration_parallel_batch_index + name: "parallel batch index integration test" + services: *integration_test_services + env: TESTNG_GROUPS='-Dgroups=parallel-batch-index' + script: *run_integration_test + after_failure: *integration_test_diags + - &integration_kafka_index name: "kafka index integration test" services: *integration_test_services diff --git a/integration-tests/src/test/java/org/apache/druid/tests/TestNGGroup.java b/integration-tests/src/test/java/org/apache/druid/tests/TestNGGroup.java index dc37952ff436..10497cf0d0df 100644 --- a/integration-tests/src/test/java/org/apache/druid/tests/TestNGGroup.java +++ b/integration-tests/src/test/java/org/apache/druid/tests/TestNGGroup.java @@ -29,6 +29,7 @@ public class TestNGGroup public static final String HADOOP_INDEX = "hadoop-index"; public static final String KAFKA_INDEX = "kafka-index"; public static final String OTHER_INDEX = "other-index"; + public static final String PARALLEL_BATCH_INDEX = "parallel-batch-index"; public static final String QUERY = "query"; public static final String REALTIME_INDEX = "realtime-index"; public static final String SECURITY = "security"; diff --git a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITParallelIndexTest.java b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITParallelIndexTest.java index 77ccb411e1c4..1e2dff2c908d 100644 --- 
a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITParallelIndexTest.java +++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITParallelIndexTest.java @@ -34,7 +34,7 @@ import java.io.Closeable; import java.util.function.Function; -@Test(groups = TestNGGroup.BATCH_INDEX) +@Test(groups = TestNGGroup.PARALLEL_BATCH_INDEX) @Guice(moduleFactory = DruidTestModuleFactory.class) public class ITParallelIndexTest extends AbstractITBatchIndexTest { From 275ad8eed6284411d0f7440800b2fab26eab6518 Mon Sep 17 00:00:00 2001 From: Chi Cao Minh Date: Fri, 6 Dec 2019 22:33:12 -0800 Subject: [PATCH 11/17] Do not run parallel-batch-index twice --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 8c77835f57fc..0047c1667b1a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -321,6 +321,6 @@ jobs: - &integration_tests name: "other integration test" services: *integration_test_services - env: TESTNG_GROUPS='-DexcludedGroups=batch-index,kafka-index,query,realtime-index' + env: TESTNG_GROUPS='-DexcludedGroups=batch-index,parallel-batch-index,kafka-index,query,realtime-index' script: *run_integration_test after_failure: *integration_test_diags From f40ed699c0a4b684b490ccbc7d77bfd8111548dc Mon Sep 17 00:00:00 2001 From: Chi Cao Minh Date: Sat, 7 Dec 2019 16:22:52 -0800 Subject: [PATCH 12/17] Adjust last partition --- .../distribution/PartitionBoundaries.java | 7 +- .../parallel/distribution/StringSketch.java | 13 ++ ...itionCachingLocalSegmentAllocatorTest.java | 2 +- .../PartialDimensionDistributionTaskTest.java | 13 +- ...rtitionMultiPhaseParallelIndexingTest.java | 1 - .../distribution/PartitionBoundariesTest.java | 2 +- .../distribution/StringSketchMergerTest.java | 5 +- .../distribution/StringSketchTest.java | 135 +++++++++--------- 8 files changed, 97 insertions(+), 81 deletions(-) diff --git 
a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionBoundaries.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionBoundaries.java index dc14ace91c6e..32a0a0ffaf0f 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionBoundaries.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionBoundaries.java @@ -51,6 +51,7 @@ public PartitionBoundaries(String... partitions) return; } + // Future improvement: Handle skewed partitions better (e.g., many values are repeated). List partitionBoundaries = Arrays.stream(partitions) .distinct() .collect(Collectors.toCollection(ArrayList::new)); @@ -59,7 +60,11 @@ public PartitionBoundaries(String... partitions) partitionBoundaries.set(0, null); // Last partition ends with null (see StringPartitionChunk.isEnd()) - partitionBoundaries.add(null); + if (partitionBoundaries.size() == 1) { + partitionBoundaries.add(null); + } else { + partitionBoundaries.set(partitionBoundaries.size() - 1, null); + } delegate = Collections.unmodifiableList(partitionBoundaries); } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketch.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketch.java index 8203ffad04cc..bba16cc46628 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketch.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketch.java @@ -29,6 +29,7 @@ import com.fasterxml.jackson.databind.deser.std.StdDeserializer; import com.fasterxml.jackson.databind.jsontype.TypeSerializer; import com.fasterxml.jackson.databind.ser.std.StdSerializer; +import 
com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import org.apache.datasketches.ArrayOfStringsSerDe; import org.apache.datasketches.memory.Memory; @@ -105,6 +106,18 @@ public PartitionBoundaries getEvenPartitionsByTargetSize(int targetSize) return getEvenPartitionsByCount(evenPartitionCount); } + @VisibleForTesting + public String getMin() + { + return delegate.getMinValue(); + } + + @VisibleForTesting + public String getMax() + { + return delegate.getMaxValue(); + } + private PartitionBoundaries getEvenPartitionsByCount(int evenPartitionCount) { Preconditions.checkArgument( diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java index c2de2200a465..6e91d10066af 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RangePartitionCachingLocalSegmentAllocatorTest.java @@ -137,7 +137,7 @@ public void allocatesCorrectShardSpecsForLastPartition() { Interval interval = INTERVAL_NORMAL; InputRow row = createInputRow(interval, PARTITION9); - int partitionNum = INTERVAL_TO_PARTITONS.get(interval).size() - 2; // -2 = -1 0-based + -1 skip null tail + int partitionNum = INTERVAL_TO_PARTITONS.get(interval).size() - 2; testAllocate(row, interval, partitionNum, null); } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java index 9e622b83e869..5d905f064d71 100644 --- 
a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTaskTest.java @@ -35,6 +35,7 @@ import org.apache.druid.indexing.common.task.IndexTaskClientFactory; import org.apache.druid.indexing.common.task.batch.parallel.distribution.PartitionBoundaries; import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution; +import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.segment.TestHelper; import org.apache.druid.segment.indexing.DataSchema; @@ -358,15 +359,13 @@ public void preservesMinAndMaxWhenAssumeGroupedFalse() StringDistribution distribution = Iterables.getOnlyElement(intervalToDistribution.values()); Assert.assertNotNull(distribution); PartitionBoundaries partitions = distribution.getEvenPartitionsByMaxSize(1); - Assert.assertEquals(minBloomFilterBits + 3, partitions.size()); // 3 = min + max + exclusive endpoint + Assert.assertEquals(minBloomFilterBits + 2, partitions.size()); // 2 = min + max - // Min - Assert.assertNull(partitions.get(0)); - Assert.assertEquals(dimensionValues.get(1), partitions.get(1)); + String minDimensionValue = dimensionValues.get(0); + Assert.assertEquals(minDimensionValue, ((StringSketch) distribution).getMin()); - // Max - Assert.assertNull(partitions.get(partitions.size() - 1)); - Assert.assertEquals(dimensionValues.get(dimensionValues.size() - 1), partitions.get(partitions.size() - 2)); + String maxDimensionValue = dimensionValues.get(dimensionValues.size() - 1); + Assert.assertEquals(maxDimensionValue, ((StringSketch) distribution).getMax()); } @Test diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java 
b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java index 1f75da389552..94ccf5cb03a1 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionMultiPhaseParallelIndexingTest.java @@ -218,7 +218,6 @@ private static void assertNumPartition( expectedNumPartition -= 1; } expectedNumPartition *= NUM_DAY; - expectedNumPartition += 1; // max dimension value has its own partition Assert.assertEquals(expectedNumPartition, segments.size()); } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionBoundariesTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionBoundariesTest.java index d1b20fde71cf..8f98bb1d59b1 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionBoundariesTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/PartitionBoundariesTest.java @@ -38,7 +38,7 @@ public class PartitionBoundariesTest public void setup() { values = new String[]{"a", "dup", "dup", "z"}; - expected = Arrays.asList(null, "dup", "z", null); + expected = Arrays.asList(null, "dup", null); target = new PartitionBoundaries(values); } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMergerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMergerTest.java index 8d0c987d0630..fb363536f6a9 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMergerTest.java +++ 
b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchMergerTest.java @@ -71,10 +71,9 @@ public void mergesCorrectly() StringDistribution merged = target.getResult(); PartitionBoundaries partitions = merged.getEvenPartitionsByMaxSize(1); - Assert.assertEquals(4, partitions.size()); + Assert.assertEquals(3, partitions.size()); Assert.assertNull(partitions.get(0)); Assert.assertEquals(string2, partitions.get(1)); - Assert.assertEquals(string3, partitions.get(2)); - Assert.assertNull(partitions.get(3)); + Assert.assertNull(partitions.get(2)); } } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java index 0e21c137fb75..b09634df3f89 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/distribution/StringSketchTest.java @@ -54,7 +54,7 @@ public class StringSketchTest private static final String MAX_STRING = STRINGS.get(NUM_STRING - 1); static { - ItemsSketch.rand.setSeed(0); // make sketches deterministic for testing + ItemsSketch.rand.setSeed(0); // make sketches deterministic for testing } public static class SerializationDeserializationTest @@ -155,8 +155,8 @@ public void requiresPositiveSize() public void handlesEmptySketch() { StringSketch sketch = new StringSketch(); - PartitionBoundaries partitions = sketch.getEvenPartitionsByTargetSize(1); - Assert.assertEquals(0, partitions.size()); + PartitionBoundaries partitionBoundaries = sketch.getEvenPartitionsByTargetSize(1); + Assert.assertEquals(0, partitionBoundaries.size()); } @Test @@ -164,17 +164,17 @@ public void handlesSingletonSketch() { StringSketch sketch = new StringSketch(); 
sketch.put(MIN_STRING); - PartitionBoundaries partitions = sketch.getEvenPartitionsByTargetSize(1); - Assert.assertEquals(2, partitions.size()); - Assert.assertNull(partitions.get(0)); - Assert.assertNull(partitions.get(1)); + PartitionBoundaries partitionBoundaries = sketch.getEvenPartitionsByTargetSize(1); + Assert.assertEquals(2, partitionBoundaries.size()); + Assert.assertNull(partitionBoundaries.get(0)); + Assert.assertNull(partitionBoundaries.get(1)); } @Test public void handlesMinimimumSize() { - PartitionBoundaries partitions = SKETCH.getEvenPartitionsByTargetSize(1); - assertMaxNumberOfPartitions(partitions); + PartitionBoundaries partitionBoundaries = SKETCH.getEvenPartitionsByTargetSize(1); + assertMaxNumberOfPartitions(partitionBoundaries); } @Test @@ -186,30 +186,30 @@ public void handlesUnevenPartitions() private static void testHandlesUnevenPartitions(int targetSize) { - PartitionBoundaries partitions = SKETCH.getEvenPartitionsByTargetSize(targetSize); + PartitionBoundaries partitionBoundaries = SKETCH.getEvenPartitionsByTargetSize(targetSize); - assertFirstAndLastPartitionsCorrect(partitions); + assertFirstAndLastPartitionsCorrect(partitionBoundaries); - String partitionsString = PartitionTest.toString(partitions); - int expectedHighPartitionCount = (int) Math.ceil((double) NUM_STRING / targetSize); - int expectedLowPartitionCount = expectedHighPartitionCount - 1; + String partitionBoundariesString = PartitionTest.toString(partitionBoundaries); + int expectedHighPartitionBoundaryCount = (int) Math.ceil((double) NUM_STRING / targetSize); + int expectedLowPartitionBoundaryCount = expectedHighPartitionBoundaryCount - 1; Assert.assertThat( - "targetSize=" + targetSize + " " + partitionsString, - partitions.size(), - Matchers.lessThanOrEqualTo(expectedHighPartitionCount + 2) // +2 = endpoint + null + "targetSize=" + targetSize + " " + partitionBoundariesString, + partitionBoundaries.size(), + Matchers.lessThanOrEqualTo(expectedHighPartitionBoundaryCount 
+ 1) ); Assert.assertThat( - "targetSize=" + targetSize + " " + partitionsString, - partitions.size(), - Matchers.greaterThanOrEqualTo(expectedLowPartitionCount + 2) // +2 = endpoint + null + "targetSize=" + targetSize + " " + partitionBoundariesString, + partitionBoundaries.size(), + Matchers.greaterThanOrEqualTo(expectedLowPartitionBoundaryCount + 1) ); int previous = 0; - for (int i = 1; i < partitions.size() - 1; i++) { - int current = Integer.parseInt(partitions.get(i)); + for (int i = 1; i < partitionBoundaries.size() - 1; i++) { + int current = Integer.parseInt(partitionBoundaries.get(i)); int size = current - previous; Assert.assertThat( - getErrMsgPrefix(targetSize, i) + partitionsString, + getErrMsgPrefix(targetSize, i) + partitionBoundariesString, (double) size, IsCloseTo.closeTo(targetSize, Math.ceil(DELTA) * 2) ); @@ -220,15 +220,15 @@ private static void testHandlesUnevenPartitions(int targetSize) @Test public void handlesSinglePartition() { - PartitionBoundaries partitions = SKETCH.getEvenPartitionsByTargetSize(NUM_STRING); - assertSinglePartition(partitions); + PartitionBoundaries partitionBoundaries = SKETCH.getEvenPartitionsByTargetSize(NUM_STRING); + assertSinglePartition(partitionBoundaries); } @Test public void handlesOversizedPartition() { - PartitionBoundaries partitions = SKETCH.getEvenPartitionsByTargetSize(Integer.MAX_VALUE); - assertSinglePartition(partitions); + PartitionBoundaries partitionBoundaries = SKETCH.getEvenPartitionsByTargetSize(Integer.MAX_VALUE); + assertSinglePartition(partitionBoundaries); } } @@ -250,8 +250,8 @@ public void requiresPositiveSize() public void handlesEmptySketch() { StringSketch sketch = new StringSketch(); - PartitionBoundaries partitions = sketch.getEvenPartitionsByMaxSize(1); - Assert.assertEquals(0, partitions.size()); + PartitionBoundaries partitionBoundaries = sketch.getEvenPartitionsByMaxSize(1); + Assert.assertEquals(0, partitionBoundaries.size()); } @Test @@ -259,17 +259,17 @@ public void 
handlesSingletonSketch() { StringSketch sketch = new StringSketch(); sketch.put(MIN_STRING); - PartitionBoundaries partitions = sketch.getEvenPartitionsByMaxSize(1); - Assert.assertEquals(2, partitions.size()); - Assert.assertNull(partitions.get(0)); - Assert.assertNull(partitions.get(1)); + PartitionBoundaries partitionBoundaries = sketch.getEvenPartitionsByMaxSize(1); + Assert.assertEquals(2, partitionBoundaries.size()); + Assert.assertNull(partitionBoundaries.get(0)); + Assert.assertNull(partitionBoundaries.get(1)); } @Test public void handlesMinimimumSize() { - PartitionBoundaries partitions = SKETCH.getEvenPartitionsByMaxSize(1); - assertMaxNumberOfPartitions(partitions); + PartitionBoundaries partitionBoundaries = SKETCH.getEvenPartitionsByMaxSize(1); + assertMaxNumberOfPartitions(partitionBoundaries); } @Test @@ -281,31 +281,31 @@ public void handlesUnevenPartitions() private static void testHandlesUnevenPartitions(int maxSize) { - PartitionBoundaries partitions = SKETCH.getEvenPartitionsByMaxSize(maxSize); + PartitionBoundaries partitionBoundaries = SKETCH.getEvenPartitionsByMaxSize(maxSize); - assertFirstAndLastPartitionsCorrect(partitions); + assertFirstAndLastPartitionsCorrect(partitionBoundaries); - String partitionsString = PartitionTest.toString(partitions); + String partitionBoundariesString = PartitionTest.toString(partitionBoundaries); long expectedPartitionCount = (long) Math.ceil((double) NUM_STRING / maxSize); Assert.assertEquals( - "maxSize=" + maxSize + " " + partitionsString, - expectedPartitionCount + 2, // +2 = endpoint + null - partitions.size() + "maxSize=" + maxSize + " " + partitionBoundariesString, + expectedPartitionCount + 1, + partitionBoundaries.size() ); double minSize = (double) NUM_STRING / expectedPartitionCount - DELTA; int previous = 0; - for (int i = 1; i < partitions.size() - 1; i++) { - int current = Integer.parseInt(partitions.get(i)); + for (int i = 1; i < partitionBoundaries.size() - 1; i++) { + int current = 
Integer.parseInt(partitionBoundaries.get(i)); int size = current - previous; Assert.assertThat( - getErrMsgPrefix(maxSize, i) + partitionsString, + getErrMsgPrefix(maxSize, i) + partitionBoundariesString, size, Matchers.lessThanOrEqualTo(maxSize) ); Assert.assertThat( - getErrMsgPrefix(maxSize, i) + partitionsString, + getErrMsgPrefix(maxSize, i) + partitionBoundariesString, (double) size, Matchers.greaterThanOrEqualTo(minSize) ); @@ -316,30 +316,32 @@ private static void testHandlesUnevenPartitions(int maxSize) @Test public void handlesSinglePartition() { - PartitionBoundaries partitions = SKETCH.getEvenPartitionsByMaxSize((int) Math.ceil(NUM_STRING + DELTA)); - assertSinglePartition(partitions); + PartitionBoundaries partitionBoundaries = SKETCH.getEvenPartitionsByMaxSize( + (int) Math.ceil(NUM_STRING + DELTA) + ); + assertSinglePartition(partitionBoundaries); } @Test public void handlesOversizedPartition() { - PartitionBoundaries partitions = SKETCH.getEvenPartitionsByMaxSize(Integer.MAX_VALUE); - assertSinglePartition(partitions); + PartitionBoundaries partitionBoundaries = SKETCH.getEvenPartitionsByMaxSize(Integer.MAX_VALUE); + assertSinglePartition(partitionBoundaries); } } - private static void assertMaxNumberOfPartitions(PartitionBoundaries partitions) + private static void assertMaxNumberOfPartitions(PartitionBoundaries partitionBoundaries) { - String partitionsString = toString(partitions); + String partitionBoundariesString = toString(partitionBoundaries); - Assert.assertEquals(partitionsString, StringSketch.SKETCH_K + 2, partitions.size()); // +2 = endpoint + null - assertFirstAndLastPartitionsCorrect(partitions); + Assert.assertEquals(partitionBoundariesString, StringSketch.SKETCH_K + 1, partitionBoundaries.size()); + assertFirstAndLastPartitionsCorrect(partitionBoundaries); int previous = 0; - for (int i = 1; i < partitions.size() - 1; i++) { - int current = Integer.parseInt(partitions.get(i)); + for (int i = 1; i < partitionBoundaries.size() - 1; 
i++) { + int current = Integer.parseInt(partitionBoundaries.get(i)); Assert.assertEquals( - getErrMsgPrefix(1, i) + partitionsString, + getErrMsgPrefix(1, i) + partitionBoundariesString, 1, current - previous, FACTOR @@ -348,17 +350,16 @@ private static void assertMaxNumberOfPartitions(PartitionBoundaries partitions) } } - private static void assertSinglePartition(PartitionBoundaries partitions) + private static void assertSinglePartition(PartitionBoundaries partitionBoundaries) { - Assert.assertEquals(3, partitions.size()); // +2 = endpoint + null - assertFirstAndLastPartitionsCorrect(partitions); + Assert.assertEquals(2, partitionBoundaries.size()); + assertFirstAndLastPartitionsCorrect(partitionBoundaries); } - private static void assertFirstAndLastPartitionsCorrect(PartitionBoundaries partitions) + private static void assertFirstAndLastPartitionsCorrect(PartitionBoundaries partitionBoundaries) { - Assert.assertNull(partitions.get(0)); - Assert.assertEquals(MAX_STRING, partitions.get(partitions.size() - 2)); - Assert.assertNull(partitions.get(partitions.size() - 1)); + Assert.assertNull(partitionBoundaries.get(0)); + Assert.assertNull(partitionBoundaries.get(partitionBoundaries.size() - 1)); } private static String getErrMsgPrefix(int size, int i) @@ -366,12 +367,12 @@ private static String getErrMsgPrefix(int size, int i) return "size=" + size + " i=" + i + " of "; } - private static String toString(PartitionBoundaries partitions) + private static String toString(PartitionBoundaries partitionBoundaries) { - String prefix = "partitions[" + partitions.size() + "]="; + String prefix = "partitionBoundaries[" + partitionBoundaries.size() + "]="; StringJoiner sj = new StringJoiner(" ", prefix, "]"); - for (int i = 0; i < partitions.size(); i++) { - sj.add("[" + i + "]=" + partitions.get(i)); + for (int i = 0; i < partitionBoundaries.size(); i++) { + sj.add("[" + i + "]=" + partitionBoundaries.get(i)); } return sj.toString(); } From 
1753d64f8c790aaece73d32ad539748724acbc20 Mon Sep 17 00:00:00 2001 From: Chi Cao Minh Date: Sat, 7 Dec 2019 16:55:03 -0800 Subject: [PATCH 13/17] Split ITParallelIndexTest to reduce runtime --- .travis.yml | 8 +- .../org/apache/druid/tests/TestNGGroup.java | 2 +- .../indexer/AbstractITBatchIndexTest.java | 2 +- ...> ITImperfectRollupParallelIndexTest.java} | 89 ++++++-------- .../ITPerfectRollupParallelIndexTest.java | 113 ++++++++++++++++++ 5 files changed, 153 insertions(+), 61 deletions(-) rename integration-tests/src/test/java/org/apache/druid/tests/indexer/{ITParallelIndexTest.java => ITImperfectRollupParallelIndexTest.java} (59%) create mode 100644 integration-tests/src/test/java/org/apache/druid/tests/indexer/ITPerfectRollupParallelIndexTest.java diff --git a/.travis.yml b/.travis.yml index 0047c1667b1a..0dc60f2af3e1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -290,10 +290,10 @@ jobs: docker exec -it druid-$v sh -c 'dmesg | tail -3' ; done - - &integration_parallel_batch_index - name: "parallel batch index integration test" + - &integration_perfect_rollup_parallel_batch_index + name: "perfect rollup parallel batch index integration test" services: *integration_test_services - env: TESTNG_GROUPS='-Dgroups=parallel-batch-index' + env: TESTNG_GROUPS='-Dgroups=perfect-rollup-parallel-batch-index' script: *run_integration_test after_failure: *integration_test_diags @@ -321,6 +321,6 @@ jobs: - &integration_tests name: "other integration test" services: *integration_test_services - env: TESTNG_GROUPS='-DexcludedGroups=batch-index,parallel-batch-index,kafka-index,query,realtime-index' + env: TESTNG_GROUPS='-DexcludedGroups=batch-index,perfect-rollup-parallel-batch-index,kafka-index,query,realtime-index' script: *run_integration_test after_failure: *integration_test_diags diff --git a/integration-tests/src/test/java/org/apache/druid/tests/TestNGGroup.java b/integration-tests/src/test/java/org/apache/druid/tests/TestNGGroup.java index 10497cf0d0df..ad8a1454a6d0 
100644 --- a/integration-tests/src/test/java/org/apache/druid/tests/TestNGGroup.java +++ b/integration-tests/src/test/java/org/apache/druid/tests/TestNGGroup.java @@ -29,7 +29,7 @@ public class TestNGGroup public static final String HADOOP_INDEX = "hadoop-index"; public static final String KAFKA_INDEX = "kafka-index"; public static final String OTHER_INDEX = "other-index"; - public static final String PARALLEL_BATCH_INDEX = "parallel-batch-index"; + public static final String PERFECT_ROLLUP_PARALLEL_BATCH_INDEX = "perfect-rollup-parallel-batch-index"; public static final String QUERY = "query"; public static final String REALTIME_INDEX = "realtime-index"; public static final String SECURITY = "security"; diff --git a/integration-tests/src/test/java/org/apache/druid/tests/indexer/AbstractITBatchIndexTest.java b/integration-tests/src/test/java/org/apache/druid/tests/indexer/AbstractITBatchIndexTest.java index 430e3bfccb45..9fc01a7451d4 100644 --- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/AbstractITBatchIndexTest.java +++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/AbstractITBatchIndexTest.java @@ -226,7 +226,7 @@ private void submitTaskAndWait(String taskSpec, String dataSourceName, boolean w ); } - // ITParallelIndexTest does a second round of ingestion to replace segements in an existing + // IT*ParallelIndexTest do a second round of ingestion to replace segments in an existing // data source.
For that second round we need to make sure the coordinator actually learned // about the new segments befor waiting for it to report that all segments are loaded; otherwise // this method could return too early because the coordinator is merely reporting that all the diff --git a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITParallelIndexTest.java b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITImperfectRollupParallelIndexTest.java similarity index 59% rename from integration-tests/src/test/java/org/apache/druid/tests/indexer/ITParallelIndexTest.java rename to integration-tests/src/test/java/org/apache/druid/tests/indexer/ITImperfectRollupParallelIndexTest.java index 1e2dff2c908d..853eba11ed29 100644 --- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITParallelIndexTest.java +++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITImperfectRollupParallelIndexTest.java @@ -21,12 +21,11 @@ import com.fasterxml.jackson.core.JsonProcessingException; import org.apache.druid.indexer.partitions.DynamicPartitionsSpec; -import org.apache.druid.indexer.partitions.HashedPartitionsSpec; import org.apache.druid.indexer.partitions.PartitionsSpec; -import org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.testing.guice.DruidTestModuleFactory; import org.apache.druid.tests.TestNGGroup; +import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Guice; import org.testng.annotations.Test; @@ -34,9 +33,9 @@ import java.io.Closeable; import java.util.function.Function; -@Test(groups = TestNGGroup.PARALLEL_BATCH_INDEX) +@Test(groups = TestNGGroup.BATCH_INDEX) @Guice(moduleFactory = DruidTestModuleFactory.class) -public class ITParallelIndexTest extends AbstractITBatchIndexTest +public class ITImperfectRollupParallelIndexTest extends AbstractITBatchIndexTest { private static final 
String INDEX_TASK = "/indexer/wikipedia_parallel_index_task.json"; private static final String INDEX_QUERIES_RESOURCE = "/indexer/wikipedia_parallel_index_queries.json"; @@ -52,27 +51,27 @@ public class ITParallelIndexTest extends AbstractITBatchIndexTest public static Object[][] resources() { return new Object[][]{ - {new DynamicPartitionsSpec(null, null)}, - {new HashedPartitionsSpec(null, 2, null)}, - {new SingleDimensionPartitionsSpec(2, null, "namespace", false)}, + {new DynamicPartitionsSpec(null, null)} }; } @Test(dataProvider = "resources") public void testIndexData(PartitionsSpec partitionsSpec) throws Exception { - try (final Closeable ignored1 = unloader(INDEX_DATASOURCE + config.getExtraDatasourceNameSuffix()); - final Closeable ignored2 = unloader(INDEX_INGEST_SEGMENT_DATASOURCE + config.getExtraDatasourceNameSuffix()); - final Closeable ignored3 = unloader(INDEX_DRUID_INPUT_SOURCE_DATASOURCE + config.getExtraDatasourceNameSuffix()) + try ( + final Closeable ignored1 = unloader(INDEX_DATASOURCE + config.getExtraDatasourceNameSuffix()); + final Closeable ignored2 = unloader(INDEX_INGEST_SEGMENT_DATASOURCE + config.getExtraDatasourceNameSuffix()); + final Closeable ignored3 = unloader(INDEX_DRUID_INPUT_SOURCE_DATASOURCE + config.getExtraDatasourceNameSuffix()) ) { boolean forceGuaranteedRollup = partitionsSpec.isForceGuaranteedRollupCompatible(); + Assert.assertFalse(forceGuaranteedRollup, "partitionsSpec does not support best-effort rollup"); final Function rollupTransform = spec -> { try { spec = StringUtils.replace( spec, "%%FORCE_GUARANTEED_ROLLUP%%", - Boolean.toString(forceGuaranteedRollup) + Boolean.toString(false) ); return StringUtils.replace( spec, @@ -93,52 +92,32 @@ public void testIndexData(PartitionsSpec partitionsSpec) throws Exception false ); - // Missing intervals is not supported yet if forceGuaranteedRollup = true - if (!forceGuaranteedRollup) { - // Index again, this time only choosing the second data file, and without explicit
intervals chosen. - // The second datafile covers both day segments, so this should replace them, as reflected in the queries. - doIndexTest( - INDEX_DATASOURCE, - REINDEX_TASK, - rollupTransform, - REINDEX_QUERIES_RESOURCE, - true - ); - - doReindexTest( - INDEX_DATASOURCE, - INDEX_INGEST_SEGMENT_DATASOURCE, - rollupTransform, - INDEX_INGEST_SEGMENT_TASK, - REINDEX_QUERIES_RESOURCE - ); + // Index again, this time only choosing the second data file, and without explicit intervals chosen. + // The second datafile covers both day segments, so this should replace them, as reflected in the queries. + doIndexTest( + INDEX_DATASOURCE, + REINDEX_TASK, + rollupTransform, + REINDEX_QUERIES_RESOURCE, + true + ); - // with DruidInputSource instead of IngestSegmentFirehose - doReindexTest( - INDEX_DATASOURCE, - INDEX_DRUID_INPUT_SOURCE_DATASOURCE, - rollupTransform, - INDEX_DRUID_INPUT_SOURCE_TASK, - REINDEX_QUERIES_RESOURCE - ); - } else { - doReindexTest( - INDEX_DATASOURCE, - INDEX_INGEST_SEGMENT_DATASOURCE, - rollupTransform, - INDEX_INGEST_SEGMENT_TASK, - INDEX_QUERIES_RESOURCE - ); + doReindexTest( + INDEX_DATASOURCE, + INDEX_INGEST_SEGMENT_DATASOURCE, + rollupTransform, + INDEX_INGEST_SEGMENT_TASK, + REINDEX_QUERIES_RESOURCE + ); - // with DruidInputSource instead of IngestSegmentFirehose - doReindexTest( - INDEX_DATASOURCE, - INDEX_DRUID_INPUT_SOURCE_DATASOURCE, - rollupTransform, - INDEX_DRUID_INPUT_SOURCE_TASK, - INDEX_QUERIES_RESOURCE - ); - } + // with DruidInputSource instead of IngestSegmentFirehose + doReindexTest( + INDEX_DATASOURCE, + INDEX_DRUID_INPUT_SOURCE_DATASOURCE, + rollupTransform, + INDEX_DRUID_INPUT_SOURCE_TASK, + REINDEX_QUERIES_RESOURCE + ); } } } diff --git a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITPerfectRollupParallelIndexTest.java b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITPerfectRollupParallelIndexTest.java new file mode 100644 index 000000000000..03442032de03 --- /dev/null +++ 
b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITPerfectRollupParallelIndexTest.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.tests.indexer; + +import com.fasterxml.jackson.core.JsonProcessingException; +import org.apache.druid.indexer.partitions.HashedPartitionsSpec; +import org.apache.druid.indexer.partitions.PartitionsSpec; +import org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec; +import org.apache.druid.java.util.common.StringUtils; +import org.apache.druid.testing.guice.DruidTestModuleFactory; +import org.apache.druid.tests.TestNGGroup; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Guice; +import org.testng.annotations.Test; + +import java.io.Closeable; +import java.util.function.Function; + +@Test(groups = TestNGGroup.PERFECT_ROLLUP_PARALLEL_BATCH_INDEX) +@Guice(moduleFactory = DruidTestModuleFactory.class) +public class ITPerfectRollupParallelIndexTest extends AbstractITBatchIndexTest +{ + private static final String INDEX_TASK = "/indexer/wikipedia_parallel_index_task.json"; + private static final String INDEX_QUERIES_RESOURCE = 
"/indexer/wikipedia_parallel_index_queries.json"; + private static final String INDEX_DATASOURCE = "wikipedia_parallel_index_test"; + private static final String INDEX_INGEST_SEGMENT_DATASOURCE = "wikipedia_parallel_ingest_segment_index_test"; + private static final String INDEX_INGEST_SEGMENT_TASK = "/indexer/wikipedia_parallel_ingest_segment_index_task.json"; + private static final String INDEX_DRUID_INPUT_SOURCE_DATASOURCE = "wikipedia_parallel_druid_input_source_index_test"; + private static final String INDEX_DRUID_INPUT_SOURCE_TASK = "/indexer/wikipedia_parallel_druid_input_source_index_task.json"; + + @DataProvider + public static Object[][] resources() + { + return new Object[][]{ + {new HashedPartitionsSpec(null, 2, null)}, + {new SingleDimensionPartitionsSpec(2, null, "namespace", false)} + }; + } + + @Test(dataProvider = "resources") + public void testIndexData(PartitionsSpec partitionsSpec) throws Exception + { + try ( + final Closeable ignored1 = unloader(INDEX_DATASOURCE + config.getExtraDatasourceNameSuffix()); + final Closeable ignored2 = unloader(INDEX_INGEST_SEGMENT_DATASOURCE + config.getExtraDatasourceNameSuffix()); + final Closeable ignored3 = unloader(INDEX_DRUID_INPUT_SOURCE_DATASOURCE + config.getExtraDatasourceNameSuffix()) + ) { + boolean forceGuaranteedRollup = partitionsSpec.isForceGuaranteedRollupCompatible(); + Assert.assertTrue(forceGuaranteedRollup, "partitionsSpec does not support perfect rollup"); + + final Function rollupTransform = spec -> { + try { + spec = StringUtils.replace( + spec, + "%%FORCE_GUARANTEED_ROLLUP%%", + Boolean.toString(true) + ); + return StringUtils.replace( + spec, + "%%PARTITIONS_SPEC%%", + jsonMapper.writeValueAsString(partitionsSpec) + ); + } + catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + }; + + doIndexTest( + INDEX_DATASOURCE, + INDEX_TASK, + rollupTransform, + INDEX_QUERIES_RESOURCE, + false + ); + + doReindexTest( + INDEX_DATASOURCE, + INDEX_INGEST_SEGMENT_DATASOURCE, +
rollupTransform, + INDEX_INGEST_SEGMENT_TASK, + INDEX_QUERIES_RESOURCE + ); + + // with DruidInputSource instead of IngestSegmentFirehose + doReindexTest( + INDEX_DATASOURCE, + INDEX_DRUID_INPUT_SOURCE_DATASOURCE, + rollupTransform, + INDEX_DRUID_INPUT_SOURCE_TASK, + INDEX_QUERIES_RESOURCE + ); + } + } +} From 96215101c09484b5e504fd0f7c50b9eb9799d69a Mon Sep 17 00:00:00 2001 From: Chi Cao Minh Date: Mon, 9 Dec 2019 10:08:39 -0800 Subject: [PATCH 14/17] Rename test class --- ...lIndexTest.java => ITBestEffortRollupParallelIndexTest.java} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename integration-tests/src/test/java/org/apache/druid/tests/indexer/{ITImperfectRollupParallelIndexTest.java => ITBestEffortRollupParallelIndexTest.java} (98%) diff --git a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITImperfectRollupParallelIndexTest.java b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITBestEffortRollupParallelIndexTest.java similarity index 98% rename from integration-tests/src/test/java/org/apache/druid/tests/indexer/ITImperfectRollupParallelIndexTest.java rename to integration-tests/src/test/java/org/apache/druid/tests/indexer/ITBestEffortRollupParallelIndexTest.java index 853eba11ed29..0c975b208e65 100644 --- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITImperfectRollupParallelIndexTest.java +++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITBestEffortRollupParallelIndexTest.java @@ -35,7 +35,7 @@ @Test(groups = TestNGGroup.BATCH_INDEX) @Guice(moduleFactory = DruidTestModuleFactory.class) -public class ITImperfectRollupParallelIndexTest extends AbstractITBatchIndexTest +public class ITBestEffortRollupParallelIndexTest extends AbstractITBatchIndexTest { private static final String INDEX_TASK = "/indexer/wikipedia_parallel_index_task.json"; private static final String INDEX_QUERIES_RESOURCE = "/indexer/wikipedia_parallel_index_queries.json"; From 
8d714bdf7d6a08b3d5a1a607b313184ae8f25e44 Mon Sep 17 00:00:00 2001 From: Chi Cao Minh Date: Mon, 9 Dec 2019 16:24:19 -0800 Subject: [PATCH 15/17] Allow null values in range partitions --- .../PartialDimensionDistributionTask.java | 5 +- .../PartialRangeSegmentGenerateTask.java | 3 +- ...itionIndexTaskInputRowIteratorBuilder.java | 24 +++- ...InputRowIteratorBuilderTestingFactory.java | 11 +- ...titionTaskInputRowIteratorBuilderTest.java | 110 +++++++++++++++--- 5 files changed, 127 insertions(+), 26 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java index ca4c1838b939..60c2d185162c 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java @@ -75,6 +75,9 @@ public class PartialDimensionDistributionTask extends PerfectRollupWorkerTask public static final String TYPE = "partial_dimension_distribution"; private static final Logger LOG = new Logger(PartialDimensionDistributionTask.class); + // Future work: StringDistribution does not handle inserting NULLs. This is the same behavior as hadoop indexing. 
+ private static final boolean SKIP_NULL = true; + private final int numAttempts; private final ParallelIndexIngestionSpec ingestionSchema; private final String supervisorTaskId; @@ -220,7 +223,7 @@ public TaskStatus runTask(TaskToolbox toolbox) throws Exception try ( CloseableIterator inputRowIterator = inputSourceReader.read(); - HandlingInputRowIterator iterator = new RangePartitionIndexTaskInputRowIteratorBuilder(partitionDimension) + HandlingInputRowIterator iterator = new RangePartitionIndexTaskInputRowIteratorBuilder(partitionDimension, SKIP_NULL) .delegate(inputRowIterator) .granularitySpec(granularitySpec) .nullRowRunnable(IndexTaskInputRowIteratorBuilder.NOOP_RUNNABLE) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java index 3cecf67c8690..b52b26b410fc 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java @@ -54,6 +54,7 @@ public class PartialRangeSegmentGenerateTask extends PartialSegmentGenerateTask< { public static final String TYPE = "partial_range_index_generate"; private static final String PROP_SPEC = "spec"; + private static final boolean SKIP_NULL = true; private final String supervisorTaskId; private final int numAttempts; @@ -86,7 +87,7 @@ public PartialRangeSegmentGenerateTask( indexingServiceClient, taskClientFactory, appenderatorsManager, - new RangePartitionIndexTaskInputRowIteratorBuilder(getPartitionDimension(ingestionSchema)) + new RangePartitionIndexTaskInputRowIteratorBuilder(getPartitionDimension(ingestionSchema), !SKIP_NULL) ); this.numAttempts = numAttempts; diff --git 
a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionIndexTaskInputRowIteratorBuilder.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionIndexTaskInputRowIteratorBuilder.java index b2884b99d439..4373af494e01 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionIndexTaskInputRowIteratorBuilder.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionIndexTaskInputRowIteratorBuilder.java @@ -54,10 +54,19 @@ public class RangePartitionIndexTaskInputRowIteratorBuilder implements IndexTask { private final DefaultIndexTaskInputRowIteratorBuilder delegate; - public RangePartitionIndexTaskInputRowIteratorBuilder(String partitionDimension) + /** + * @param partitionDimension Create range partitions for this dimension + * @param skipNull Whether to skip rows with a dimension value of null + */ + public RangePartitionIndexTaskInputRowIteratorBuilder(String partitionDimension, boolean skipNull) { delegate = new DefaultIndexTaskInputRowIteratorBuilder(); - delegate.appendInputRowHandler(createOnlySingleDimensionValueRowsHandler(partitionDimension)); + + if (skipNull) { + delegate.appendInputRowHandler(createOnlySingleDimensionValueRowsHandler(partitionDimension)); + } else { + delegate.appendInputRowHandler(createOnlySingleOrNullDimensionValueRowsHandler(partitionDimension)); + } } @Override @@ -99,4 +108,15 @@ private static HandlingInputRowIterator.InputRowHandler createOnlySingleDimensio return dimensionValues.size() != 1; }; } + + private static HandlingInputRowIterator.InputRowHandler createOnlySingleOrNullDimensionValueRowsHandler( + String partitionDimension + ) + { + return inputRow -> { + List dimensionValues = inputRow.getDimension(partitionDimension); + return dimensionValues.size() > 1; // Rows.objectToStrings() returns an 
empty list for a single null value + }; + } + } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/IndexTaskInputRowIteratorBuilderTestingFactory.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/IndexTaskInputRowIteratorBuilderTestingFactory.java index 628a5b008e3c..39300acd1a9f 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/IndexTaskInputRowIteratorBuilderTestingFactory.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/IndexTaskInputRowIteratorBuilderTestingFactory.java @@ -22,6 +22,7 @@ import com.google.common.base.Optional; import org.apache.druid.data.input.HandlingInputRowIterator; import org.apache.druid.data.input.InputRow; +import org.apache.druid.data.input.MapBasedInputRow; import org.apache.druid.java.util.common.DateTimes; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.parsers.CloseableIterator; @@ -52,11 +53,11 @@ static InputRow createInputRow(DateTime timestamp) static InputRow createInputRow(DateTime timestamp, List dimensionValues) { - InputRow inputRow = EasyMock.mock(InputRow.class); - EasyMock.expect(inputRow.getTimestamp()).andStubReturn(timestamp); - EasyMock.expect(inputRow.getDimension(DIMENSION)).andStubReturn(dimensionValues); - EasyMock.replay(inputRow); - return inputRow; + return new MapBasedInputRow( + timestamp, + dimensionValues, + Collections.singletonMap(DIMENSION, dimensionValues) + ); } static CloseableIterator createInputRowIterator(InputRow inputRow) diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionTaskInputRowIteratorBuilderTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionTaskInputRowIteratorBuilderTest.java index 
6093d0d0eb46..45a028a5e96d 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionTaskInputRowIteratorBuilderTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionTaskInputRowIteratorBuilderTest.java @@ -32,8 +32,14 @@ public class RangePartitionTaskInputRowIteratorBuilderTest { + private static final boolean SKIP_NULL = true; private static final IndexTaskInputRowIteratorBuilderTestingFactory.HandlerTester HANDLER_TESTER = - IndexTaskInputRowIteratorBuilderTestingFactory.createHandlerTester(() -> new RangePartitionIndexTaskInputRowIteratorBuilder(IndexTaskInputRowIteratorBuilderTestingFactory.DIMENSION)); + IndexTaskInputRowIteratorBuilderTestingFactory.createHandlerTester( + () -> new RangePartitionIndexTaskInputRowIteratorBuilder( + IndexTaskInputRowIteratorBuilderTestingFactory.DIMENSION, + SKIP_NULL + ) + ); private static final InputRow NO_NEXT_INPUT_ROW = null; @Test @@ -41,15 +47,24 @@ public void invokesDimensionValueCountFilterLast() { DateTime timestamp = IndexTaskInputRowIteratorBuilderTestingFactory.TIMESTAMP; List multipleDimensionValues = Arrays.asList("multiple", "dimension", "values"); - InputRow inputRow = IndexTaskInputRowIteratorBuilderTestingFactory.createInputRow(timestamp, multipleDimensionValues); - CloseableIterator inputRowIterator = IndexTaskInputRowIteratorBuilderTestingFactory.createInputRowIterator(inputRow); - GranularitySpec granularitySpec = IndexTaskInputRowIteratorBuilderTestingFactory.createGranularitySpec(timestamp, IndexTaskInputRowIteratorBuilderTestingFactory.PRESENT_BUCKET_INTERVAL_OPT); - - List handlerInvocationHistory = HANDLER_TESTER.invokeHandlers( - inputRowIterator, - granularitySpec, - NO_NEXT_INPUT_ROW + InputRow inputRow = IndexTaskInputRowIteratorBuilderTestingFactory.createInputRow( + timestamp, + multipleDimensionValues ); + CloseableIterator inputRowIterator = 
IndexTaskInputRowIteratorBuilderTestingFactory.createInputRowIterator( + inputRow + ); + GranularitySpec granularitySpec = IndexTaskInputRowIteratorBuilderTestingFactory.createGranularitySpec( + timestamp, + IndexTaskInputRowIteratorBuilderTestingFactory.PRESENT_BUCKET_INTERVAL_OPT + ); + + List handlerInvocationHistory = + HANDLER_TESTER.invokeHandlers( + inputRowIterator, + granularitySpec, + NO_NEXT_INPUT_ROW + ); Assert.assertEquals(Collections.emptyList(), handlerInvocationHistory); } @@ -58,16 +73,77 @@ public void invokesDimensionValueCountFilterLast() public void doesNotInvokeHandlersIfRowValid() { DateTime timestamp = IndexTaskInputRowIteratorBuilderTestingFactory.TIMESTAMP; - List singleDimensionValue = Collections.singletonList("single-dimension-value"); - InputRow inputRow = IndexTaskInputRowIteratorBuilderTestingFactory.createInputRow(timestamp, singleDimensionValue); - CloseableIterator inputRowIterator = IndexTaskInputRowIteratorBuilderTestingFactory.createInputRowIterator(inputRow); - GranularitySpec granularitySpec = IndexTaskInputRowIteratorBuilderTestingFactory.createGranularitySpec(timestamp, IndexTaskInputRowIteratorBuilderTestingFactory.PRESENT_BUCKET_INTERVAL_OPT); - - List handlerInvocationHistory = HANDLER_TESTER.invokeHandlers( - inputRowIterator, - granularitySpec, + List nullDimensionValue = Collections.singletonList(null); + InputRow inputRow = IndexTaskInputRowIteratorBuilderTestingFactory.createInputRow(timestamp, nullDimensionValue); + CloseableIterator inputRowIterator = IndexTaskInputRowIteratorBuilderTestingFactory.createInputRowIterator( + inputRow + ); + GranularitySpec granularitySpec = IndexTaskInputRowIteratorBuilderTestingFactory.createGranularitySpec( + timestamp, + IndexTaskInputRowIteratorBuilderTestingFactory.PRESENT_BUCKET_INTERVAL_OPT + ); + + List handlerInvocationHistory = + HANDLER_TESTER.invokeHandlers( + inputRowIterator, + granularitySpec, + inputRow + ); + + Assert.assertEquals(Collections.emptyList(), 
handlerInvocationHistory); + } + + @Test + public void invokesHandlerIfRowInvalidNull() + { + DateTime timestamp = IndexTaskInputRowIteratorBuilderTestingFactory.TIMESTAMP; + List nullDimensionValue = null; + InputRow inputRow = IndexTaskInputRowIteratorBuilderTestingFactory.createInputRow(timestamp, nullDimensionValue); + CloseableIterator inputRowIterator = IndexTaskInputRowIteratorBuilderTestingFactory.createInputRowIterator( + inputRow + ); + GranularitySpec granularitySpec = IndexTaskInputRowIteratorBuilderTestingFactory.createGranularitySpec( + timestamp, + IndexTaskInputRowIteratorBuilderTestingFactory.PRESENT_BUCKET_INTERVAL_OPT + ); + + List handlerInvocationHistory = + HANDLER_TESTER.invokeHandlers( + inputRowIterator, + granularitySpec, + NO_NEXT_INPUT_ROW + ); + + Assert.assertEquals(Collections.emptyList(), handlerInvocationHistory); + } + + @Test + public void doesNotInvokeHandlersIfRowValidNull() + { + DateTime timestamp = IndexTaskInputRowIteratorBuilderTestingFactory.TIMESTAMP; + List nullDimensionValue = null; + InputRow inputRow = IndexTaskInputRowIteratorBuilderTestingFactory.createInputRow(timestamp, nullDimensionValue); + CloseableIterator inputRowIterator = IndexTaskInputRowIteratorBuilderTestingFactory.createInputRowIterator( inputRow ); + GranularitySpec granularitySpec = IndexTaskInputRowIteratorBuilderTestingFactory.createGranularitySpec( + timestamp, + IndexTaskInputRowIteratorBuilderTestingFactory.PRESENT_BUCKET_INTERVAL_OPT + ); + + IndexTaskInputRowIteratorBuilderTestingFactory.HandlerTester handlerTester = + IndexTaskInputRowIteratorBuilderTestingFactory.createHandlerTester( + () -> new RangePartitionIndexTaskInputRowIteratorBuilder( + IndexTaskInputRowIteratorBuilderTestingFactory.DIMENSION, + !SKIP_NULL + ) + ); + List handlerInvocationHistory = + handlerTester.invokeHandlers( + inputRowIterator, + granularitySpec, + inputRow + ); Assert.assertEquals(Collections.emptyList(), handlerInvocationHistory); } From 
e423be5456a126a88556cf5188524ad96c5e02cf Mon Sep 17 00:00:00 2001 From: Chi Cao Minh Date: Mon, 9 Dec 2019 17:08:38 -0800 Subject: [PATCH 16/17] Indicate which phase failed --- .../task/batch/parallel/ParallelIndexSupervisorTask.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java index 444ad6e3cdd6..db31af67d91a 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java @@ -606,7 +606,7 @@ private TaskStatus runRangePartitionMultiPhaseParallel(TaskToolbox toolbox) thro TaskState distributionState = runNextPhase(distributionRunner); if (distributionState.isFailure()) { - return TaskStatus.failure(getId()); + return TaskStatus.failure(getId(), PartialDimensionDistributionTask.TYPE + " failed"); } Map intervalToPartitions = @@ -624,7 +624,7 @@ private TaskStatus runRangePartitionMultiPhaseParallel(TaskToolbox toolbox) thro TaskState indexingState = runNextPhase(indexingRunner); if (indexingState.isFailure()) { - return TaskStatus.failure(getId()); + return TaskStatus.failure(getId(), PartialRangeSegmentGenerateTask.TYPE + " failed"); } // partition (interval, partitionId) -> partition locations From 76221343a7ba032431ff333948d65388af6828b1 Mon Sep 17 00:00:00 2001 From: Chi Cao Minh Date: Mon, 9 Dec 2019 17:49:50 -0800 Subject: [PATCH 17/17] Improve asserts in tests --- ...titionTaskInputRowIteratorBuilderTest.java | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git 
a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionTaskInputRowIteratorBuilderTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionTaskInputRowIteratorBuilderTest.java index 45a028a5e96d..719535c42b43 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionTaskInputRowIteratorBuilderTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/iterator/RangePartitionTaskInputRowIteratorBuilderTest.java @@ -22,6 +22,7 @@ import org.apache.druid.data.input.InputRow; import org.apache.druid.java.util.common.parsers.CloseableIterator; import org.apache.druid.segment.indexing.granularity.GranularitySpec; +import org.hamcrest.Matchers; import org.joda.time.DateTime; import org.junit.Assert; import org.junit.Test; @@ -66,7 +67,14 @@ public void invokesDimensionValueCountFilterLast() NO_NEXT_INPUT_ROW ); - Assert.assertEquals(Collections.emptyList(), handlerInvocationHistory); + assertNotInHandlerInvocationHistory( + handlerInvocationHistory, + IndexTaskInputRowIteratorBuilderTestingFactory.HandlerTester.Handler.NULL_ROW + ); + assertNotInHandlerInvocationHistory( + handlerInvocationHistory, + IndexTaskInputRowIteratorBuilderTestingFactory.HandlerTester.Handler.ABSENT_BUCKET_INTERVAL + ); } @Test @@ -114,7 +122,14 @@ public void invokesHandlerIfRowInvalidNull() NO_NEXT_INPUT_ROW ); - Assert.assertEquals(Collections.emptyList(), handlerInvocationHistory); + assertNotInHandlerInvocationHistory( + handlerInvocationHistory, + IndexTaskInputRowIteratorBuilderTestingFactory.HandlerTester.Handler.NULL_ROW + ); + assertNotInHandlerInvocationHistory( + handlerInvocationHistory, + IndexTaskInputRowIteratorBuilderTestingFactory.HandlerTester.Handler.ABSENT_BUCKET_INTERVAL + ); } @Test @@ -147,4 +162,12 @@ public void doesNotInvokeHandlersIfRowValidNull() 
Assert.assertEquals(Collections.emptyList(), handlerInvocationHistory); } + + private static void assertNotInHandlerInvocationHistory( + List handlerInvocationHistory, + IndexTaskInputRowIteratorBuilderTestingFactory.HandlerTester.Handler handler + ) + { + Assert.assertThat(handlerInvocationHistory, Matchers.not(Matchers.contains(handler))); + } }