From 4b98828007ba1387b018bfa9919e228fbaacad24 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Tue, 11 Aug 2020 19:47:06 -0700 Subject: [PATCH 01/24] DruidInputSource: Fix issues in column projection, timestamp handling. DruidInputSource, DruidSegmentReader changes: 1) Remove "dimensions" and "metrics". They are not necessary, because we can compute which columns we need to read based on what is going to be used by the timestamp, transform, dimensions, and metrics. 2) Start using ColumnsFilter (see below) to decide which columns we need to read. 3) Actually respect the "timestampSpec". Previously, it was ignored, and the timestamp of the returned InputRows was set to the `__time` column of the input datasource. (1) and (2) together fix a bug in which the DruidInputSource would not properly read columns that are used as inputs to a transformSpec. (3) fixes a bug where the timestampSpec would be ignored if you attempted to set the column to something other than `__time`. (1) and (3) are breaking changes. Web console changes: 1) Remove "Dimensions" and "Metrics" from the Druid input source. 2) Set timestampSpec to `{"column": "__time", "format": "millis"}` for compatibility with the new behavior. Other changes: 1) Add ColumnsFilter, a new class that allows input readers to determine which columns they need to read. Currently, it's only used by the DruidInputSource, but it could be used by other columnar input sources in the future. 2) Add a ColumnsFilter to InputRowSchema. 3) Remove the metric names from InputRowSchema (they were unused). 4) Add InputRowSchemas.fromDataSchema method that computes the proper ColumnsFilter for given timestamp, dimensions, transform, and metrics. 5) Add "getRequiredColumns" method to TransformSpec to support the above. --- .../druid/data/input/ColumnsFilter.java | 152 +++++ .../druid/data/input/InputRowSchema.java | 25 +- .../apache/druid/data/input/InputSource.java | 6 +- .../druid/data/input/MapBasedInputRow.java | 24 + ...rehoseFactoryToInputSourceAdaptorTest.java | 3 +- .../data/input/impl/ColumnsFilterTest.java | 74 +++ .../druid/data/input/impl/CsvReaderTest.java | 6 +- .../data/input/impl/DelimitedReaderTest.java | 4 +- .../impl/InputEntityIteratingReaderTest.java | 4 +- .../druid/data/input/impl/JsonReaderTest.java | 11 +- docs/ingestion/native-batch.md | 96 +-- .../data/input/aliyun/OssInputSourceTest.java | 8 +- .../data/input/avro/AvroOCFReaderTest.java | 5 +- .../GoogleCloudStorageInputSourceTest.java | 5 +- .../inputsource/hdfs/HdfsInputSourceTest.java | 3 +- .../druid/data/input/orc/OrcReaderTest.java | 3 +- .../parquet/CompatParquetReaderTest.java | 14 +- .../parquet/DecimalParquetReaderTest.java | 7 +- .../parquet/FlattenSpecParquetReaderTest.java | 18 +- .../ParquetReaderResourceLeakTest.java | 4 +- .../parquet/TimestampsParquetReaderTest.java | 10 +- .../input/parquet/WikiParquetReaderTest.java | 4 +- .../data/input/s3/S3InputSourceTest.java | 5 +- .../common/ReingestionTimelineUtils.java | 3 + .../indexing/common/task/CompactionTask.java | 6 +- .../druid/indexing/common/task/IndexTask.java | 14 +- .../common/task/InputSourceProcessor.java | 19 +- .../PartialDimensionDistributionTask.java | 20 +- .../batch/parallel/SinglePhaseSubTask.java | 14 +- .../IngestSegmentFirehoseFactory.java | 3 + .../indexing/input/DruidInputSource.java | 47 +- .../input/DruidSegmentInputFormat.java | 16 +- .../indexing/input/DruidSegmentReader.java | 209 ++++--- .../druid/indexing/input/InputRowSchemas.java | 130 ++++ .../overlord/sampler/InputSourceSampler.java | 
10 +-
 .../SeekableStreamIndexTaskRunner.java        |  11 +-
 .../common/task/CompactionTaskTest.java       |  11 +-
 .../input/DruidSegmentReaderTest.java         | 567 +++++++++++++++++-
 .../indexing/input/InputRowSchemasTest.java   | 105 ++++
 .../RecordSupplierInputSourceTest.java        |   3 +-
 .../seekablestream/StreamChunkParserTest.java |   5 +-
 .../transform/ExpressionTransform.java        |  16 +-
 .../druid/segment/transform/Transform.java    |   7 +
 .../segment/transform/TransformSpec.java      |  15 +
 .../metadata/input/SqlInputSourceTest.java    |   4 +-
 web-console/src/utils/ingestion-spec.tsx      |  24 -
 .../views/load-data-view/load-data-view.tsx   |   2 +-
 47 files changed, 1422 insertions(+), 330 deletions(-)
 create mode 100644 core/src/main/java/org/apache/druid/data/input/ColumnsFilter.java
 create mode 100644 core/src/test/java/org/apache/druid/data/input/impl/ColumnsFilterTest.java
 create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/input/InputRowSchemas.java
 create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/input/InputRowSchemasTest.java

diff --git a/core/src/main/java/org/apache/druid/data/input/ColumnsFilter.java b/core/src/main/java/org/apache/druid/data/input/ColumnsFilter.java
new file mode 100644
index 000000000000..f391e7e41c40
--- /dev/null
+++ b/core/src/main/java/org/apache/druid/data/input/ColumnsFilter.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.data.input;
+
+import java.util.Collections;
+import java.util.Objects;
+import java.util.Set;
+
+/**
+ * Used by some {@link InputSourceReader} implementations in order to know what columns will need to be read out
+ * of the {@link InputRow} objects they create.
+ *
+ * This is meant to be useful as an optimization: if we're reading from a columnar data format, then when a column
+ * isn't going to be needed, we shouldn't read it.
+ *
+ * @see InputSource#reader accepts objects of this class
+ */
+public abstract class ColumnsFilter
+{
+  /**
+   * Accepts all columns.
+   */
+  public static ColumnsFilter all()
+  {
+    return new ExclusionBased(Collections.emptySet());
+  }
+
+  /**
+   * Accepts a specific list of columns.
+   */
+  public static ColumnsFilter inclusionBased(final Set<String> inclusions)
+  {
+    return new InclusionBased(inclusions);
+  }
+
+
+  /**
+   * Accepts all columns, except those on a specific list.
+   */
+  public static ColumnsFilter exclusionBased(final Set<String> exclusions)
+  {
+    return new ExclusionBased(exclusions);
+  }
+
+  /**
+   * Check if a column should be included or not.
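+   *
+   * For example, {@code ColumnsFilter.inclusionBased(ImmutableSet.of("x")).apply("x")} returns true, while
+   * {@code ColumnsFilter.exclusionBased(ImmutableSet.of("x")).apply("x")} returns false.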
+   */
+  public abstract boolean apply(final String column);
+
+  public static class InclusionBased extends ColumnsFilter
+  {
+    private final Set<String> inclusions;
+
+    private InclusionBased(Set<String> inclusions)
+    {
+      this.inclusions = inclusions;
+    }
+
+    @Override
+    public boolean apply(String column)
+    {
+      return inclusions.contains(column);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+      if (this == o) {
+        return true;
+      }
+      if (o == null || getClass() != o.getClass()) {
+        return false;
+      }
+      InclusionBased that = (InclusionBased) o;
+      return Objects.equals(inclusions, that.inclusions);
+    }
+
+    @Override
+    public int hashCode()
+    {
+      return Objects.hash(inclusions);
+    }
+
+    @Override
+    public String toString()
+    {
+      return "ColumnsFilter.InclusionBased{" +
+             "inclusions=" + inclusions +
+             '}';
+    }
+  }
+
+  public static class ExclusionBased extends ColumnsFilter
+  {
+    private final Set<String> exclusions;
+
+    public ExclusionBased(Set<String> exclusions)
+    {
+      this.exclusions = exclusions;
+    }
+
+    @Override
+    public boolean apply(String column)
+    {
+      return !exclusions.contains(column);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+      if (this == o) {
+        return true;
+      }
+      if (o == null || getClass() != o.getClass()) {
+        return false;
+      }
+      ExclusionBased that = (ExclusionBased) o;
+      return Objects.equals(exclusions, that.exclusions);
+    }
+
+    @Override
+    public int hashCode()
+    {
+      return Objects.hash(exclusions);
+    }
+
+    @Override
+    public String toString()
+    {
+      return "ColumnsFilter.ExclusionBased{" +
+             "exclusions=" + exclusions +
+             '}';
+    }
+  }
+}
diff --git a/core/src/main/java/org/apache/druid/data/input/InputRowSchema.java b/core/src/main/java/org/apache/druid/data/input/InputRowSchema.java
index c908187962e7..227bd3a6d198 100644
--- a/core/src/main/java/org/apache/druid/data/input/InputRowSchema.java
+++ b/core/src/main/java/org/apache/druid/data/input/InputRowSchema.java
@@ -22,8 +22,6 @@
 import org.apache.druid.data.input.impl.DimensionsSpec;
 import org.apache.druid.data.input.impl.TimestampSpec;
 
-import java.util.List;
-
 /**
  * Schema of {@link InputRow}.
  */
@@ -31,13 +29,17 @@ public class InputRowSchema
 {
   private final TimestampSpec timestampSpec;
   private final DimensionsSpec dimensionsSpec;
-  private final List<String> metricNames;
+  private final ColumnsFilter columnsFilter;
 
-  public InputRowSchema(TimestampSpec timestampSpec, DimensionsSpec dimensionsSpec, List<String> metricNames)
+  public InputRowSchema(
+      final TimestampSpec timestampSpec,
+      final DimensionsSpec dimensionsSpec,
+      final ColumnsFilter columnsFilter
+  )
   {
     this.timestampSpec = timestampSpec;
     this.dimensionsSpec = dimensionsSpec;
-    this.metricNames = metricNames;
+    this.columnsFilter = columnsFilter;
   }
 
   public TimestampSpec getTimestampSpec()
@@ -50,8 +52,17 @@ public DimensionsSpec getDimensionsSpec()
     return dimensionsSpec;
   }
 
-  public List<String> getMetricNames()
+  /**
+   * A {@link ColumnsFilter} that can filter down the list of columns that must be read after flattening.
+   *
+   * Logically, Druid applies ingestion spec components in a particular order: first flattenSpec (if any), then
+   * timestampSpec, then transformSpec, and finally dimensionsSpec and metricsSpec.
+   *
+   * If a flattenSpec is provided, this method returns a filter that should be applied after flattening. So, it will
+   * be based on what needs to pass between the flattenSpec and everything beyond it.
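+   *
+   * For example, a transformSpec expression like {@code concat(x, y)} requires input fields {@code x} and
+   * {@code y}, so they must pass this filter even if they appear in neither dimensionsSpec nor metricsSpec.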
+ */ + public ColumnsFilter getColumnsFilter() { - return metricNames; + return columnsFilter; } } diff --git a/core/src/main/java/org/apache/druid/data/input/InputSource.java b/core/src/main/java/org/apache/druid/data/input/InputSource.java index b0144c51eef5..1dce5f04deac 100644 --- a/core/src/main/java/org/apache/druid/data/input/InputSource.java +++ b/core/src/main/java/org/apache/druid/data/input/InputSource.java @@ -76,5 +76,9 @@ public interface InputSource * @param inputFormat to parse data. It can be null if {@link #needsFormat()} = true * @param temporaryDirectory to store temp data. It will be cleaned up automatically once the task is finished. */ - InputSourceReader reader(InputRowSchema inputRowSchema, @Nullable InputFormat inputFormat, File temporaryDirectory); + InputSourceReader reader( + InputRowSchema inputRowSchema, + @Nullable InputFormat inputFormat, + File temporaryDirectory + ); } diff --git a/core/src/main/java/org/apache/druid/data/input/MapBasedInputRow.java b/core/src/main/java/org/apache/druid/data/input/MapBasedInputRow.java index 59ab8a55710c..e9117f26911a 100644 --- a/core/src/main/java/org/apache/druid/data/input/MapBasedInputRow.java +++ b/core/src/main/java/org/apache/druid/data/input/MapBasedInputRow.java @@ -25,8 +25,10 @@ import java.util.List; import java.util.Map; +import java.util.Objects; /** + * */ @PublicApi public class MapBasedInputRow extends MapBasedRow implements InputRow @@ -59,6 +61,28 @@ public List getDimensions() return dimensions; } + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + if (!super.equals(o)) { + return false; + } + MapBasedInputRow that = (MapBasedInputRow) o; + return Objects.equals(dimensions, that.dimensions); + } + + @Override + public int hashCode() + { + return Objects.hash(super.hashCode(), dimensions); + } + @Override public String toString() { diff --git a/core/src/test/java/org/apache/druid/data/input/FirehoseFactoryToInputSourceAdaptorTest.java b/core/src/test/java/org/apache/druid/data/input/FirehoseFactoryToInputSourceAdaptorTest.java index 088bed58cfcb..7a1634721c52 100644 --- a/core/src/test/java/org/apache/druid/data/input/FirehoseFactoryToInputSourceAdaptorTest.java +++ b/core/src/test/java/org/apache/druid/data/input/FirehoseFactoryToInputSourceAdaptorTest.java @@ -36,7 +36,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.stream.Stream; @@ -70,7 +69,7 @@ public void testUnimplementedInputFormat() throws IOException new InputRowSchema( inputRowParser.getParseSpec().getTimestampSpec(), inputRowParser.getParseSpec().getDimensionsSpec(), - Collections.emptyList() + ColumnsFilter.all() ), null, null diff --git a/core/src/test/java/org/apache/druid/data/input/impl/ColumnsFilterTest.java b/core/src/test/java/org/apache/druid/data/input/impl/ColumnsFilterTest.java new file mode 100644 index 000000000000..d85e9278de66 --- /dev/null +++ b/core/src/test/java/org/apache/druid/data/input/impl/ColumnsFilterTest.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.data.input.impl;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
+import nl.jqno.equalsverifier.EqualsVerifier;
+import org.apache.druid.data.input.ColumnsFilter;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.List;
+import java.util.stream.Collectors;
+
+public class ColumnsFilterTest
+{
+  private static final List<String> COLUMNS = ImmutableList.of("a", "b", "c");
+
+  @Test
+  public void testAll()
+  {
+    Assert.assertEquals(
+        ImmutableList.of("a", "b", "c"),
+        apply(ColumnsFilter.all(), COLUMNS)
+    );
+  }
+
+  @Test
+  public void testInclusionBased()
+  {
+    Assert.assertEquals(
+        ImmutableList.of("b"),
+        apply(ColumnsFilter.inclusionBased(ImmutableSet.of("b")), COLUMNS)
+    );
+  }
+
+  @Test
+  public void testExclusionBased()
+  {
+    Assert.assertEquals(
+        ImmutableList.of("a", "c"),
+        apply(ColumnsFilter.exclusionBased(ImmutableSet.of("b")), COLUMNS)
+    );
+  }
+
+  @Test
+  public void testEquals()
+  {
+    EqualsVerifier.forClass(ColumnsFilter.InclusionBased.class).usingGetClass().verify();
+    EqualsVerifier.forClass(ColumnsFilter.ExclusionBased.class).usingGetClass().verify();
+  }
+
+  private List<String> apply(ColumnsFilter columnsFilter, List<String> columns)
+  {
+    return columns.stream().filter(columnsFilter::apply).collect(Collectors.toList());
+  }
+}
diff --git a/core/src/test/java/org/apache/druid/data/input/impl/CsvReaderTest.java b/core/src/test/java/org/apache/druid/data/input/impl/CsvReaderTest.java
index ec942379f3b2..c1faa274845c 100644
--- a/core/src/test/java/org/apache/druid/data/input/impl/CsvReaderTest.java
+++ b/core/src/test/java/org/apache/druid/data/input/impl/CsvReaderTest.java
@@ -23,6 +23,7 @@
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Iterables;
 import org.apache.druid.common.config.NullHandling;
+import org.apache.druid.data.input.ColumnsFilter;
 import org.apache.druid.data.input.InputEntityReader;
 import org.apache.druid.data.input.InputRow;
 import org.apache.druid.data.input.InputRowSchema;
@@ -37,7 +38,6 @@
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.util.Arrays;
-import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
 import java.util.stream.Collectors;
@@ -47,7 +47,7 @@ public class CsvReaderTest
   private static final InputRowSchema INPUT_ROW_SCHEMA = new InputRowSchema(
       new TimestampSpec("ts", "auto", null),
       new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList("ts", "name"))),
-      Collections.emptyList()
+      ColumnsFilter.all()
   );
 
   @BeforeClass
@@ -229,7 +229,7 @@ public void testQuotes() throws IOException
         new InputRowSchema(
             new TimestampSpec("Timestamp", "auto", null),
             new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("Timestamp"))),
-            Collections.emptyList()
+            ColumnsFilter.all()
         ),
         source,
         null
diff --git a/core/src/test/java/org/apache/druid/data/input/impl/DelimitedReaderTest.java
b/core/src/test/java/org/apache/druid/data/input/impl/DelimitedReaderTest.java index e590ed566a93..c98d8fff6a85 100644 --- a/core/src/test/java/org/apache/druid/data/input/impl/DelimitedReaderTest.java +++ b/core/src/test/java/org/apache/druid/data/input/impl/DelimitedReaderTest.java @@ -22,6 +22,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; import org.apache.druid.common.config.NullHandling; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputEntityReader; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowSchema; @@ -35,7 +36,6 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.Arrays; -import java.util.Collections; import java.util.List; import java.util.stream.Collectors; @@ -44,7 +44,7 @@ public class DelimitedReaderTest private static final InputRowSchema INPUT_ROW_SCHEMA = new InputRowSchema( new TimestampSpec("ts", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList("ts", "name"))), - Collections.emptyList() + ColumnsFilter.all() ); @BeforeClass diff --git a/core/src/test/java/org/apache/druid/data/input/impl/InputEntityIteratingReaderTest.java b/core/src/test/java/org/apache/druid/data/input/impl/InputEntityIteratingReaderTest.java index e202d152047a..37b35f149829 100644 --- a/core/src/test/java/org/apache/druid/data/input/impl/InputEntityIteratingReaderTest.java +++ b/core/src/test/java/org/apache/druid/data/input/impl/InputEntityIteratingReaderTest.java @@ -21,6 +21,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowSchema; import org.apache.druid.java.util.common.DateTimes; @@ -37,7 +38,6 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.util.ArrayList; -import java.util.Collections; import java.util.List; public class InputEntityIteratingReaderTest @@ -64,7 +64,7 @@ public void test() throws IOException new DimensionsSpec( DimensionsSpec.getDefaultSchemas(ImmutableList.of("time", "name", "score")) ), - Collections.emptyList() + ColumnsFilter.all() ), new CsvInputFormat( ImmutableList.of("time", "name", "score"), diff --git a/core/src/test/java/org/apache/druid/data/input/impl/JsonReaderTest.java b/core/src/test/java/org/apache/druid/data/input/impl/JsonReaderTest.java index a0c19557845f..ea40415678a8 100644 --- a/core/src/test/java/org/apache/druid/data/input/impl/JsonReaderTest.java +++ b/core/src/test/java/org/apache/druid/data/input/impl/JsonReaderTest.java @@ -21,6 +21,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputEntityReader; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowSchema; @@ -66,7 +67,7 @@ public void testParseRow() throws IOException new InputRowSchema( new TimestampSpec("timestamp", "iso", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("bar", "foo"))), - Collections.emptyList() + ColumnsFilter.all() ), source, null @@ -116,7 +117,7 @@ public void testParseRowWithConditional() throws IOException new InputRowSchema( new TimestampSpec("timestamp", "iso", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("foo"))), - 
Collections.emptyList() + ColumnsFilter.all() ), source, null @@ -158,7 +159,7 @@ public void testParseRowKeepNullColumns() throws IOException new InputRowSchema( new TimestampSpec("timestamp", "iso", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Collections.emptyList())), - Collections.emptyList() + ColumnsFilter.all() ), source, null @@ -200,7 +201,7 @@ public void testKeepNullColumnsWithNoNullValues() throws IOException new InputRowSchema( new TimestampSpec("timestamp", "iso", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Collections.emptyList())), - Collections.emptyList() + ColumnsFilter.all() ), source, null @@ -242,7 +243,7 @@ public void testFalseKeepNullColumns() throws IOException new InputRowSchema( new TimestampSpec("timestamp", "iso", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Collections.emptyList())), - Collections.emptyList() + ColumnsFilter.all() ), source, null diff --git a/docs/ingestion/native-batch.md b/docs/ingestion/native-batch.md index 8538145844ce..2dfe96428174 100644 --- a/docs/ingestion/native-batch.md +++ b/docs/ingestion/native-batch.md @@ -1256,61 +1256,77 @@ no `inputFormat` field needs to be specified in the ingestion spec when using th |type|This should be "druid".|yes| |dataSource|A String defining the Druid datasource to fetch rows from|yes| |interval|A String representing an ISO-8601 interval, which defines the time range to fetch the data over.|yes| -|dimensions|A list of Strings containing the names of dimension columns to select from the Druid datasource. If the list is empty, no dimensions are returned. If null, all dimensions are returned. |no| -|metrics|The list of Strings containing the names of metric columns to select. If the list is empty, no metrics are returned. If null, all metrics are returned.|no| |filter| See [Filters](../querying/filters.md). Only rows that match the filter, if specified, will be returned.|no| -A minimal example DruidInputSource spec is shown below: +The Druid input source can be used for a variety of purposes, including: -```json -... - "ioConfig": { - "type": "index_parallel", - "inputSource": { - "type": "druid", - "dataSource": "wikipedia", - "interval": "2013-01-01/2013-01-02" - } - ... - }, -... -``` +- Creating new datasources that are rolled-up copies of existing datasources. +- Changing the [partitioning or sorting](index.md#partitioning) of a datasource to improve performance. +- Updating or removing rows using a [`transformSpec`](index.md#transformspec). -The spec above will read all existing dimension and metric columns from -the `wikipedia` datasource, including all rows with a timestamp (the `__time` column) -within the interval `2013-01-01/2013-01-02`. +When using the Druid input source, the timestamp column shows up as a numeric field named `__time` set to the number +of milliseconds since the epoch (January 1, 1970 00:00:00 UTC). It is common to use this in the timestampSpec, if you +want the output timestamp to be equivalent to the input timestamp. In this case, set the timestamp column to `__time` +and the format to `auto` or `millis`. -A spec that applies a filter and reads a subset of the original datasource's columns is shown below. +It is OK for the input and output datasources to be the same. In this case, the reindexed data will overwrite the +previous data. Generally, if you are going to do this, it is a good idea to test out your reindexing by writing to +a separate datasource before overwriting your main one. + +An example task spec is shown below. 
It reads from a hypothetical raw datasource `wikipedia_raw` and creates a new +rolled-up datasource `wikipedia_rollup` by grouping on hour, "countryName", and "page". ```json -... +{ + "type": "index_parallel", + "spec": { + "dataSchema": { + "dataSource": "wikipedia_rollup", + "timestampSpec": { + "column": "__time", + "format": "millis" + }, + "dimensionsSpec": { + "dimensions": [ + "countryName", + "page" + ] + }, + "metricsSpec": [ + { + "type": "count", + "name": "cnt" + } + ], + "granularitySpec": { + "type": "uniform", + "queryGranularity": "HOUR", + "segmentGranularity": "DAY", + "intervals": ["2016-06-27/P1D"], + "rollup": true + } + }, "ioConfig": { "type": "index_parallel", "inputSource": { "type": "druid", - "dataSource": "wikipedia", - "interval": "2013-01-01/2013-01-02", - "dimensions": [ - "page", - "user" - ], - "metrics": [ - "added" - ], - "filter": { - "type": "selector", - "dimension": "page", - "value": "Druid" - } + "dataSource": "wikipedia_raw", + "interval": "2016-06-27/P1D" } - ... }, -... + "tuningConfig": { + "type": "index_parallel", + "partitionsSpec": { + "type": "hashed", + "numShards": 1 + }, + "forceGuaranteedRollup": true, + "maxNumConcurrentSubTasks": 1 + } + } +} ``` -This spec above will only return the `page`, `user` dimensions and `added` metric. -Only rows where `page` = `Druid` will be returned. - ### SQL Input Source The SQL input source is used to read data directly from RDBMS. diff --git a/extensions-contrib/aliyun-oss-extensions/src/test/java/org/apache/druid/data/input/aliyun/OssInputSourceTest.java b/extensions-contrib/aliyun-oss-extensions/src/test/java/org/apache/druid/data/input/aliyun/OssInputSourceTest.java index 2bd9d5816acc..5b44b9d826c9 100644 --- a/extensions-contrib/aliyun-oss-extensions/src/test/java/org/apache/druid/data/input/aliyun/OssInputSourceTest.java +++ b/extensions-contrib/aliyun-oss-extensions/src/test/java/org/apache/druid/data/input/aliyun/OssInputSourceTest.java @@ -39,6 +39,7 @@ import com.google.inject.Guice; import com.google.inject.Injector; import com.google.inject.Provides; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowSchema; import org.apache.druid.data.input.InputSourceReader; @@ -110,7 +111,8 @@ public class OssInputSourceTest extends InitializedNullHandlingTest private static final OssClientConfig CLOUD_CONFIG_PROPERTIES = new OssClientConfig( "test.oss-cn.aliyun.com", new DefaultPasswordProvider("myKey"), - new DefaultPasswordProvider("mySecret")); + new DefaultPasswordProvider("mySecret") + ); private static final List EXPECTED_LOCATION = ImmutableList.of(new CloudObjectLocation("foo", "bar/file.csv")); @@ -453,7 +455,7 @@ public void testReader() throws IOException InputRowSchema someSchema = new InputRowSchema( new TimestampSpec("time", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))), - ImmutableList.of("count") + ColumnsFilter.all() ); InputSourceReader reader = inputSource.reader( @@ -496,7 +498,7 @@ public void testCompressedReader() throws IOException InputRowSchema someSchema = new InputRowSchema( new TimestampSpec("time", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))), - ImmutableList.of("count") + ColumnsFilter.all() ); InputSourceReader reader = inputSource.reader( diff --git a/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/avro/AvroOCFReaderTest.java 
b/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/avro/AvroOCFReaderTest.java index fe6070be6681..238dfef0651e 100644 --- a/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/avro/AvroOCFReaderTest.java +++ b/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/avro/AvroOCFReaderTest.java @@ -27,6 +27,7 @@ import org.apache.avro.generic.GenericRecord; import org.apache.druid.data.input.AvroHadoopInputRowParserTest; import org.apache.druid.data.input.AvroStreamInputRowParserTest; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputEntityReader; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowSchema; @@ -43,7 +44,6 @@ import java.io.File; import java.io.IOException; -import java.util.List; import java.util.Map; public class AvroOCFReaderTest @@ -150,10 +150,9 @@ private InputEntityReader createReader( final TimestampSpec timestampSpec = new TimestampSpec("timestamp", "auto", null); final DimensionsSpec dimensionsSpec = new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of( "eventType"))); - final List metricNames = ImmutableList.of("someLong"); final AvroOCFInputFormat inputFormat = new AvroOCFInputFormat(mapper, null, readerSchema, null); - final InputRowSchema schema = new InputRowSchema(timestampSpec, dimensionsSpec, metricNames); + final InputRowSchema schema = new InputRowSchema(timestampSpec, dimensionsSpec, ColumnsFilter.all()); final FileEntity entity = new FileEntity(someAvroFile); return inputFormat.createReader(schema, entity, temporaryFolder.newFolder()); } diff --git a/extensions-core/google-extensions/src/test/java/org/apache/druid/data/input/google/GoogleCloudStorageInputSourceTest.java b/extensions-core/google-extensions/src/test/java/org/apache/druid/data/input/google/GoogleCloudStorageInputSourceTest.java index 3888398fa9ea..1f4bea42e40d 100644 --- a/extensions-core/google-extensions/src/test/java/org/apache/druid/data/input/google/GoogleCloudStorageInputSourceTest.java +++ b/extensions-core/google-extensions/src/test/java/org/apache/druid/data/input/google/GoogleCloudStorageInputSourceTest.java @@ -31,6 +31,7 @@ import com.google.inject.Guice; import com.google.inject.Injector; import com.google.inject.Provides; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowSchema; import org.apache.druid.data.input.InputSourceReader; @@ -225,7 +226,7 @@ public void testReader() throws IOException InputRowSchema someSchema = new InputRowSchema( new TimestampSpec("time", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))), - ImmutableList.of("count") + ColumnsFilter.all() ); InputSourceReader reader = inputSource.reader( @@ -268,7 +269,7 @@ public void testCompressedReader() throws IOException InputRowSchema someSchema = new InputRowSchema( new TimestampSpec("time", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))), - ImmutableList.of("count") + ColumnsFilter.all() ); InputSourceReader reader = inputSource.reader( diff --git a/extensions-core/hdfs-storage/src/test/java/org/apache/druid/inputsource/hdfs/HdfsInputSourceTest.java b/extensions-core/hdfs-storage/src/test/java/org/apache/druid/inputsource/hdfs/HdfsInputSourceTest.java index 044930f838bf..cf4ee594fd1b 100644 --- 
a/extensions-core/hdfs-storage/src/test/java/org/apache/druid/inputsource/hdfs/HdfsInputSourceTest.java +++ b/extensions-core/hdfs-storage/src/test/java/org/apache/druid/inputsource/hdfs/HdfsInputSourceTest.java @@ -23,6 +23,7 @@ import com.fasterxml.jackson.databind.InjectableValues; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Iterables; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputFormat; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowSchema; @@ -74,7 +75,7 @@ public class HdfsInputSourceTest extends InitializedNullHandlingTest private static final InputRowSchema INPUT_ROW_SCHEMA = new InputRowSchema( new TimestampSpec(null, null, null), DimensionsSpec.EMPTY, - Collections.emptyList() + ColumnsFilter.all() ); private static final InputFormat INPUT_FORMAT = new CsvInputFormat( Arrays.asList(TimestampSpec.DEFAULT_COLUMN, COLUMN), diff --git a/extensions-core/orc-extensions/src/test/java/org/apache/druid/data/input/orc/OrcReaderTest.java b/extensions-core/orc-extensions/src/test/java/org/apache/druid/data/input/orc/OrcReaderTest.java index bef9b64728f4..9726c0e14671 100644 --- a/extensions-core/orc-extensions/src/test/java/org/apache/druid/data/input/orc/OrcReaderTest.java +++ b/extensions-core/orc-extensions/src/test/java/org/apache/druid/data/input/orc/OrcReaderTest.java @@ -21,6 +21,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputEntityReader; import org.apache.druid.data.input.InputFormat; import org.apache.druid.data.input.InputRow; @@ -259,7 +260,7 @@ private InputEntityReader createReader( String dataFile ) throws IOException { - final InputRowSchema schema = new InputRowSchema(timestampSpec, dimensionsSpec, Collections.emptyList()); + final InputRowSchema schema = new InputRowSchema(timestampSpec, dimensionsSpec, ColumnsFilter.all()); final FileEntity entity = new FileEntity(new File(dataFile)); return inputFormat.createReader(schema, entity, temporaryFolder.newFolder()); } diff --git a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/CompatParquetReaderTest.java b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/CompatParquetReaderTest.java index f8b586bec67d..60173212b53c 100644 --- a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/CompatParquetReaderTest.java +++ b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/CompatParquetReaderTest.java @@ -20,6 +20,7 @@ package org.apache.druid.data.input.parquet; import com.google.common.collect.ImmutableList; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputEntityReader; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowListPlusRawValues; @@ -34,7 +35,6 @@ import org.junit.Test; import java.io.IOException; -import java.util.Collections; import java.util.List; /** @@ -49,7 +49,7 @@ public void testBinaryAsString() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("ts", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("field"))), - ImmutableList.of() + ColumnsFilter.all() ); InputEntityReader reader = createReader( file, @@ -114,7 +114,7 @@ public void testParquet1217() throws IOException 
InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "auto", DateTimes.of("2018-09-01T00:00:00.000Z")), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of())), - ImmutableList.of("metric1") + ColumnsFilter.all() ); List flattenExpr = ImmutableList.of( new JSONPathFieldSpec(JSONPathFieldType.ROOT, "col", "col"), @@ -200,7 +200,7 @@ required group nestedIntsColumn (LIST) { InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "auto", DateTimes.of("2018-09-01T00:00:00.000Z")), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of())), - Collections.emptyList() + ColumnsFilter.all() ); List flattenExpr = ImmutableList.of( new JSONPathFieldSpec(JSONPathFieldType.PATH, "extractByLogicalMap", "$.intToStringColumn.1"), @@ -315,7 +315,7 @@ public void testOldRepeatedInt() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "auto", DateTimes.of("2018-09-01T00:00:00.000Z")), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("repeatedInt"))), - Collections.emptyList() + ColumnsFilter.all() ); List flattenExpr = ImmutableList.of( new JSONPathFieldSpec(JSONPathFieldType.ROOT, "repeatedInt", "repeatedInt") @@ -353,7 +353,7 @@ public void testReadNestedArrayStruct() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "auto", DateTimes.of("2018-09-01T00:00:00.000Z")), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("i32_dec", "extracted1", "extracted2"))), - Collections.emptyList() + ColumnsFilter.all() ); List flattenExpr = ImmutableList.of( new JSONPathFieldSpec(JSONPathFieldType.PATH, "extracted1", "$.myComplex[0].id"), @@ -395,7 +395,7 @@ public void testProtoStructWithArray() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "auto", DateTimes.of("2018-09-01T00:00:00.000Z")), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of())), - Collections.emptyList() + ColumnsFilter.all() ); List flattenExpr = ImmutableList.of( new JSONPathFieldSpec(JSONPathFieldType.PATH, "extractedOptional", "$.optionalMessage.someId"), diff --git a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetReaderTest.java b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetReaderTest.java index 50b9fe2df27d..faa80e6d73f3 100644 --- a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetReaderTest.java +++ b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetReaderTest.java @@ -20,6 +20,7 @@ package org.apache.druid.data.input.parquet; import com.google.common.collect.ImmutableList; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputEntityReader; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowListPlusRawValues; @@ -49,7 +50,7 @@ public void testReadParquetDecimalFixedLen() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "auto", DateTimes.of("2018-09-01T00:00:00.000Z")), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("fixed_len_dec"))), - ImmutableList.of("metric1") + ColumnsFilter.all() ); List flattenExpr = ImmutableList.of( new JSONPathFieldSpec(JSONPathFieldType.ROOT, "fixed_len_dec", "fixed_len_dec"), @@ -86,7 +87,7 @@ public void 
testReadParquetDecimali32() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "auto", DateTimes.of("2018-09-01T00:00:00.000Z")), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("i32_dec"))), - ImmutableList.of("metric1") + ColumnsFilter.all() ); List flattenExpr = ImmutableList.of( new JSONPathFieldSpec(JSONPathFieldType.ROOT, "i32_dec", "i32_dec"), @@ -123,7 +124,7 @@ public void testReadParquetDecimali64() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "auto", DateTimes.of("2018-09-01T00:00:00.000Z")), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("i64_dec"))), - ImmutableList.of("metric1") + ColumnsFilter.all() ); List flattenExpr = ImmutableList.of( new JSONPathFieldSpec(JSONPathFieldType.ROOT, "i32_dec", "i64_dec"), diff --git a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/FlattenSpecParquetReaderTest.java b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/FlattenSpecParquetReaderTest.java index 5be38dda494d..7ff430667fa5 100644 --- a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/FlattenSpecParquetReaderTest.java +++ b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/FlattenSpecParquetReaderTest.java @@ -20,6 +20,7 @@ package org.apache.druid.data.input.parquet; import com.google.common.collect.ImmutableList; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputEntityReader; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowListPlusRawValues; @@ -33,7 +34,6 @@ import org.junit.Test; import java.io.IOException; -import java.util.Collections; import java.util.List; /** @@ -69,7 +69,7 @@ public void testFlat1NoFlattenSpec() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2", "dim3", "listDim"))), - ImmutableList.of("metric1", "metric2") + ColumnsFilter.all() ); JSONPathSpec flattenSpec = new JSONPathSpec(false, ImmutableList.of()); InputEntityReader reader = createReader( @@ -103,7 +103,7 @@ public void testFlat1Autodiscover() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of())), - ImmutableList.of("metric1", "metric2") + ColumnsFilter.all() ); InputEntityReader reader = createReader( file, @@ -136,7 +136,7 @@ public void testFlat1Flatten() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2", "dim3", "list"))), - ImmutableList.of("metric1", "metric2") + ColumnsFilter.all() ); List flattenExpr = ImmutableList.of( new JSONPathFieldSpec(JSONPathFieldType.ROOT, "timestamp", null), @@ -177,7 +177,7 @@ public void testFlat1FlattenSelectListItem() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2", "listExtracted"))), - ImmutableList.of("metric1", "metric2") + ColumnsFilter.all() ); List flattenExpr = ImmutableList.of( new JSONPathFieldSpec(JSONPathFieldType.ROOT, "timestamp", null), @@ -217,7 
+217,7 @@ public void testNested1NoFlattenSpec() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1"))), - ImmutableList.of("metric1") + ColumnsFilter.all() ); JSONPathSpec flattenSpec = new JSONPathSpec(false, ImmutableList.of()); InputEntityReader reader = createReader( @@ -253,7 +253,7 @@ public void testNested1Autodiscover() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of())), - ImmutableList.of("metric1", "metric2") + ColumnsFilter.all() ); InputEntityReader reader = createReader( file, @@ -286,7 +286,7 @@ public void testNested1Flatten() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of())), - ImmutableList.of("metric1", "metric2") + ColumnsFilter.all() ); List flattenExpr = ImmutableList.of( new JSONPathFieldSpec(JSONPathFieldType.ROOT, "timestamp", null), @@ -329,7 +329,7 @@ public void testNested1FlattenSelectListItem() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of())), - Collections.emptyList() + ColumnsFilter.all() ); List flattenExpr = ImmutableList.of( new JSONPathFieldSpec(JSONPathFieldType.ROOT, "timestamp", null), diff --git a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/ParquetReaderResourceLeakTest.java b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/ParquetReaderResourceLeakTest.java index 251fa344bb73..f8e56b3f2deb 100644 --- a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/ParquetReaderResourceLeakTest.java +++ b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/ParquetReaderResourceLeakTest.java @@ -20,6 +20,7 @@ package org.apache.druid.data.input.parquet; import com.google.common.collect.ImmutableList; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputEntityReader; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowSchema; @@ -39,7 +40,6 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.util.Collections; import java.util.Objects; public class ParquetReaderResourceLeakTest extends BaseParquetReaderTest @@ -55,7 +55,7 @@ public void testFetchOnReadCleanupAfterExhaustingIterator() throws IOException new DimensionsSpec( DimensionsSpec.getDefaultSchemas(ImmutableList.of("page", "language", "user", "unpatrolled")) ), - Collections.emptyList() + ColumnsFilter.all() ); FetchingFileEntity entity = new FetchingFileEntity(new File("example/wiki/wiki.parquet")); ParquetInputFormat parquet = new ParquetInputFormat(JSONPathSpec.DEFAULT, false, new Configuration()); diff --git a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/TimestampsParquetReaderTest.java b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/TimestampsParquetReaderTest.java index 19f1544dcff0..c0189fe8bc19 100644 --- a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/TimestampsParquetReaderTest.java +++ 
b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/TimestampsParquetReaderTest.java @@ -20,6 +20,7 @@ package org.apache.druid.data.input.parquet; import com.google.common.collect.ImmutableList; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputEntityReader; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowListPlusRawValues; @@ -31,7 +32,6 @@ import org.junit.Test; import java.io.IOException; -import java.util.Collections; import java.util.List; /** @@ -46,12 +46,12 @@ public void testDateHandling() throws IOException InputRowSchema schemaAsString = new InputRowSchema( new TimestampSpec("date_as_string", "Y-M-d", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of())), - Collections.emptyList() + ColumnsFilter.all() ); InputRowSchema schemaAsDate = new InputRowSchema( new TimestampSpec("date_as_date", null, null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of())), - Collections.emptyList() + ColumnsFilter.all() ); InputEntityReader readerAsString = createReader( file, @@ -104,7 +104,7 @@ public void testParseInt96Timestamp() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("ts", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of())), - Collections.emptyList() + ColumnsFilter.all() ); InputEntityReader reader = createReader(file, schema, JSONPathSpec.DEFAULT); @@ -130,7 +130,7 @@ public void testTimeMillisInInt64() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("time", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of())), - Collections.emptyList() + ColumnsFilter.all() ); InputEntityReader reader = createReader( file, diff --git a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/WikiParquetReaderTest.java b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/WikiParquetReaderTest.java index 75e5e916ec78..4bc7bac27b2e 100644 --- a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/WikiParquetReaderTest.java +++ b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/WikiParquetReaderTest.java @@ -20,6 +20,7 @@ package org.apache.druid.data.input.parquet; import com.google.common.collect.ImmutableList; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputEntityReader; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowListPlusRawValues; @@ -31,7 +32,6 @@ import org.junit.Test; import java.io.IOException; -import java.util.Collections; import java.util.List; /** @@ -45,7 +45,7 @@ public void testWiki() throws IOException InputRowSchema schema = new InputRowSchema( new TimestampSpec("timestamp", "iso", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("page", "language", "user", "unpatrolled"))), - Collections.emptyList() + ColumnsFilter.all() ); InputEntityReader reader = createReader("example/wiki/wiki.parquet", schema, JSONPathSpec.DEFAULT); diff --git a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java index 8dd82a97ec9d..0bc23f6d3a5c 100644 --- 
a/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java +++ b/extensions-core/s3-extensions/src/test/java/org/apache/druid/data/input/s3/S3InputSourceTest.java @@ -40,6 +40,7 @@ import com.google.inject.Guice; import com.google.inject.Injector; import com.google.inject.Provides; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowSchema; import org.apache.druid.data.input.InputSourceReader; @@ -508,7 +509,7 @@ public void testReader() throws IOException InputRowSchema someSchema = new InputRowSchema( new TimestampSpec("time", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))), - ImmutableList.of("count") + ColumnsFilter.all() ); InputSourceReader reader = inputSource.reader( @@ -552,7 +553,7 @@ public void testCompressedReader() throws IOException InputRowSchema someSchema = new InputRowSchema( new TimestampSpec("time", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))), - ImmutableList.of("count") + ColumnsFilter.all() ); InputSourceReader reader = inputSource.reader( diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/ReingestionTimelineUtils.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/ReingestionTimelineUtils.java index bd9d21457eb6..b1a2fb5ddd94 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/ReingestionTimelineUtils.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/ReingestionTimelineUtils.java @@ -34,6 +34,9 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; +/** + * @deprecated only used by {@link org.apache.druid.indexing.firehose.IngestSegmentFirehoseFactory} + */ public class ReingestionTimelineUtils { /** diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CompactionTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CompactionTask.java index 5cfaf32c92aa..10b7e1599ec5 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CompactionTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CompactionTask.java @@ -620,8 +620,8 @@ private static ParallelIndexIOConfig createIoConfig( interval, null, null, - dataSchema.getDimensionsSpec().getDimensionNames(), - Arrays.stream(dataSchema.getAggregators()).map(AggregatorFactory::getName).collect(Collectors.toList()), + null, + null, toolbox.getIndexIO(), coordinatorClient, segmentLoaderFactory, @@ -692,7 +692,7 @@ private static DataSchema createDataSchema( return new DataSchema( dataSource, - new TimestampSpec(null, null, null), + new TimestampSpec(ColumnHolder.TIME_COLUMN_NAME, "millis", null), finalDimensionsSpec, finalMetricsSpec, granularitySpec, diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/IndexTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/IndexTask.java index 8367ead3fa85..b636e83c8a5f 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/IndexTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/IndexTask.java @@ -39,7 +39,6 @@ import org.apache.druid.data.input.FirehoseFactoryToInputSourceAdaptor; import org.apache.druid.data.input.InputFormat; import org.apache.druid.data.input.InputRow; -import 
org.apache.druid.data.input.InputRowSchema; import org.apache.druid.data.input.InputSource; import org.apache.druid.data.input.InputSourceReader; import org.apache.druid.data.input.Rows; @@ -69,6 +68,7 @@ import org.apache.druid.indexing.common.task.batch.partition.HashPartitionAnalysis; import org.apache.druid.indexing.common.task.batch.partition.LinearPartitionAnalysis; import org.apache.druid.indexing.common.task.batch.partition.PartitionAnalysis; +import org.apache.druid.indexing.input.InputRowSchemas; import org.apache.druid.indexing.overlord.sampler.InputSourceSampler; import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.ISE; @@ -81,7 +81,6 @@ import org.apache.druid.java.util.common.logger.Logger; import org.apache.druid.java.util.common.parsers.CloseableIterator; import org.apache.druid.java.util.common.parsers.ParseException; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.IndexSpec; import org.apache.druid.segment.indexing.BatchIOConfig; import org.apache.druid.segment.indexing.DataSchema; @@ -124,7 +123,6 @@ import java.io.File; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -136,7 +134,6 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; -import java.util.stream.Collectors; public class IndexTask extends AbstractBatchIndexTask implements ChatHandler { @@ -740,16 +737,9 @@ private Map> collectIntervalsAndShardSp Comparators.intervalsByStartThenEnd() ); final Granularity queryGranularity = granularitySpec.getQueryGranularity(); - final List metricsNames = Arrays.stream(ingestionSchema.getDataSchema().getAggregators()) - .map(AggregatorFactory::getName) - .collect(Collectors.toList()); final InputSourceReader inputSourceReader = ingestionSchema.getDataSchema().getTransformSpec().decorate( inputSource.reader( - new InputRowSchema( - ingestionSchema.getDataSchema().getTimestampSpec(), - ingestionSchema.getDataSchema().getDimensionsSpec(), - metricsNames - ), + InputRowSchemas.fromDataSchema(ingestionSchema.getDataSchema()), inputSource.needsFormat() ? 
getInputFormat(ingestionSchema) : null,
             tmpDir
         )
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/InputSourceProcessor.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/InputSourceProcessor.java
index e88dab25f522..76ac510d6041 100644
--- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/InputSourceProcessor.java
+++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/InputSourceProcessor.java
@@ -23,18 +23,17 @@
 import org.apache.druid.data.input.HandlingInputRowIterator;
 import org.apache.druid.data.input.InputFormat;
 import org.apache.druid.data.input.InputRow;
-import org.apache.druid.data.input.InputRowSchema;
 import org.apache.druid.data.input.InputSource;
 import org.apache.druid.data.input.InputSourceReader;
 import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;
 import org.apache.druid.indexer.partitions.PartitionsSpec;
 import org.apache.druid.indexing.common.stats.RowIngestionMeters;
 import org.apache.druid.indexing.common.task.batch.parallel.iterator.IndexTaskInputRowIteratorBuilder;
+import org.apache.druid.indexing.input.InputRowSchemas;
 import org.apache.druid.java.util.common.ISE;
 import org.apache.druid.java.util.common.logger.Logger;
 import org.apache.druid.java.util.common.parsers.CloseableIterator;
 import org.apache.druid.java.util.common.parsers.ParseException;
-import org.apache.druid.query.aggregation.AggregatorFactory;
 import org.apache.druid.segment.indexing.DataSchema;
 import org.apache.druid.segment.indexing.granularity.GranularitySpec;
 import org.apache.druid.segment.realtime.appenderator.AppenderatorDriverAddResult;
@@ -46,11 +45,8 @@
 import javax.annotation.Nullable;
 import java.io.File;
 import java.io.IOException;
-import java.util.Arrays;
-import java.util.List;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.TimeoutException;
-import java.util.stream.Collectors;
 
 public class InputSourceProcessor
 {
@@ -104,16 +100,9 @@ public SegmentsAndCommitMetadata process(
         : null;
     final GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
-    final List<String> metricsNames = Arrays.stream(dataSchema.getAggregators())
-                                            .map(AggregatorFactory::getName)
-                                            .collect(Collectors.toList());
     final InputSourceReader inputSourceReader = dataSchema.getTransformSpec().decorate(
         inputSource.reader(
-            new InputRowSchema(
-                dataSchema.getTimestampSpec(),
-                dataSchema.getDimensionsSpec(),
-                metricsNames
-            ),
+            InputRowSchemas.fromDataSchema(dataSchema),
             inputFormat,
             tmpDir
         )
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java
index 55dbfb4bf689..46e875f0894f 100644
--- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java
+++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionTask.java
@@ -31,7 +31,6 @@
 import org.apache.druid.data.input.HandlingInputRowIterator;
 import
org.apache.druid.data.input.InputFormat; import org.apache.druid.data.input.InputRow; -import org.apache.druid.data.input.InputRowSchema; import org.apache.druid.data.input.InputSource; import org.apache.druid.data.input.InputSourceReader; import org.apache.druid.data.input.Rows; @@ -46,23 +45,21 @@ import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch; import org.apache.druid.indexing.common.task.batch.parallel.iterator.IndexTaskInputRowIteratorBuilder; import org.apache.druid.indexing.common.task.batch.parallel.iterator.RangePartitionIndexTaskInputRowIteratorBuilder; +import org.apache.druid.indexing.input.InputRowSchemas; import org.apache.druid.java.util.common.granularity.Granularity; import org.apache.druid.java.util.common.logger.Logger; import org.apache.druid.java.util.common.parsers.CloseableIterator; import org.apache.druid.java.util.common.parsers.ParseException; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.GranularitySpec; import org.joda.time.DateTime; import org.joda.time.Interval; import javax.annotation.Nullable; -import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.function.Supplier; -import java.util.stream.Collectors; /** * The worker task of {@link PartialDimensionDistributionParallelIndexTaskRunner}. This task @@ -116,7 +113,8 @@ public class PartialDimensionDistributionTask extends PerfectRollupWorkerTask ); } - @VisibleForTesting // Only for testing + @VisibleForTesting + // Only for testing PartialDimensionDistributionTask( @Nullable String id, final String groupId, @@ -202,19 +200,12 @@ public TaskStatus runTask(TaskToolbox toolbox) throws Exception InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource( ingestionSchema.getDataSchema().getParser() ); - List metricsNames = Arrays.stream(dataSchema.getAggregators()) - .map(AggregatorFactory::getName) - .collect(Collectors.toList()); InputFormat inputFormat = inputSource.needsFormat() ? 
ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null; InputSourceReader inputSourceReader = dataSchema.getTransformSpec().decorate( inputSource.reader( - new InputRowSchema( - dataSchema.getTimestampSpec(), - dataSchema.getDimensionsSpec(), - metricsNames - ), + InputRowSchemas.fromDataSchema(dataSchema), inputFormat, toolbox.getIndexingTmpDir() ) @@ -357,7 +348,8 @@ static class DedupInputRowFilter implements InputRowFilter this(queryGranularity, BLOOM_FILTER_EXPECTED_INSERTIONS, BLOOM_FILTER_EXPECTED_FALSE_POSITIVE_PROBABILTY); } - @VisibleForTesting // to allow controlling false positive rate of bloom filter + @VisibleForTesting + // to allow controlling false positive rate of bloom filter DedupInputRowFilter( Granularity queryGranularity, int bloomFilterExpectedInsertions, diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/SinglePhaseSubTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/SinglePhaseSubTask.java index 8f6131e92288..97a561f806d4 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/SinglePhaseSubTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/SinglePhaseSubTask.java @@ -26,7 +26,6 @@ import com.google.common.collect.FluentIterable; import org.apache.druid.client.indexing.IndexingServiceClient; import org.apache.druid.data.input.InputRow; -import org.apache.druid.data.input.InputRowSchema; import org.apache.druid.data.input.InputSource; import org.apache.druid.data.input.InputSourceReader; import org.apache.druid.indexer.TaskStatus; @@ -41,6 +40,7 @@ import org.apache.druid.indexing.common.task.SegmentAllocators; import org.apache.druid.indexing.common.task.TaskResource; import org.apache.druid.indexing.common.task.Tasks; +import org.apache.druid.indexing.input.InputRowSchemas; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.StringUtils; @@ -49,7 +49,6 @@ import org.apache.druid.java.util.common.parsers.CloseableIterator; import org.apache.druid.java.util.common.parsers.ParseException; import org.apache.druid.query.DruidMetrics; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.RealtimeIOConfig; import org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec; @@ -73,7 +72,6 @@ import javax.annotation.Nullable; import java.io.File; import java.io.IOException; -import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.List; @@ -81,7 +79,6 @@ import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeoutException; -import java.util.stream.Collectors; /** * The worker task of {@link SinglePhaseParallelIndexTaskRunner}. 
Similar to {@link IndexTask}, but this task @@ -326,16 +323,9 @@ private Set generateAndPushSegments( tuningConfig, getContextValue(Tasks.STORE_COMPACTION_STATE_KEY, Tasks.DEFAULT_STORE_COMPACTION_STATE) ); - final List metricsNames = Arrays.stream(ingestionSchema.getDataSchema().getAggregators()) - .map(AggregatorFactory::getName) - .collect(Collectors.toList()); final InputSourceReader inputSourceReader = dataSchema.getTransformSpec().decorate( inputSource.reader( - new InputRowSchema( - ingestionSchema.getDataSchema().getTimestampSpec(), - ingestionSchema.getDataSchema().getDimensionsSpec(), - metricsNames - ), + InputRowSchemas.fromDataSchema(ingestionSchema.getDataSchema()), inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null, tmpDir ) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/firehose/IngestSegmentFirehoseFactory.java b/indexing-service/src/main/java/org/apache/druid/indexing/firehose/IngestSegmentFirehoseFactory.java index 6248828d32b4..1defe67cf14c 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/firehose/IngestSegmentFirehoseFactory.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/firehose/IngestSegmentFirehoseFactory.java @@ -61,6 +61,9 @@ import java.util.Map; import java.util.stream.Stream; +/** + * @deprecated use {@link DruidInputSource} instead + */ public class IngestSegmentFirehoseFactory implements FiniteFirehoseFactory> { private static final EmittingLogger log = new EmittingLogger(IngestSegmentFirehoseFactory.class); diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java index b9cc5759cd3b..6ea25dca8633 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java @@ -39,7 +39,6 @@ import org.apache.druid.data.input.SplitHintSpec; import org.apache.druid.data.input.impl.InputEntityIteratingReader; import org.apache.druid.data.input.impl.SplittableInputSource; -import org.apache.druid.indexing.common.ReingestionTimelineUtils; import org.apache.druid.indexing.common.RetryPolicy; import org.apache.druid.indexing.common.RetryPolicyFactory; import org.apache.druid.indexing.common.SegmentLoaderFactory; @@ -74,6 +73,11 @@ import java.util.concurrent.ThreadLocalRandom; import java.util.stream.Stream; +/** + * An {@link org.apache.druid.data.input.InputSource} that allows reading from Druid segments. + * + * Used internally by {@link org.apache.druid.indexing.common.task.CompactionTask}, and can also be used directly. + */ public class DruidInputSource extends AbstractInputSource implements SplittableInputSource> { private static final Logger LOG = new Logger(DruidInputSource.class); @@ -87,13 +91,21 @@ public class DruidInputSource extends AbstractInputSource implements SplittableI @Nullable private final List segmentIds; private final DimFilter dimFilter; - private final List dimensions; - private final List metrics; private final IndexIO indexIO; private final CoordinatorClient coordinatorClient; private final SegmentLoaderFactory segmentLoaderFactory; private final RetryPolicyFactory retryPolicyFactory; + /** + * Included for serde backwards-compatibility only. Not used. + */ + private final List dimensions; + + /** + * Included for serde backwards-compatibility only. Not used. 
+ */ + private final List metrics; + @JsonCreator public DruidInputSource( @JsonProperty("dataSource") final String dataSource, @@ -134,6 +146,7 @@ public String getDataSource() @Nullable @JsonProperty + @JsonInclude(Include.NON_NULL) public Interval getInterval() { return interval; @@ -148,18 +161,27 @@ public List getSegmentIds() } @JsonProperty("filter") + @JsonInclude(Include.NON_NULL) public DimFilter getDimFilter() { return dimFilter; } + /** + * Included for serde backwards-compatibility only. Not used. + */ @JsonProperty + @JsonInclude(Include.NON_NULL) public List getDimensions() { return dimensions; } + /** + * Included for serde backwards-compatibility only. Not used. + */ @JsonProperty + @JsonInclude(Include.NON_NULL) public List getMetrics() { return metrics; @@ -181,25 +203,8 @@ protected InputSourceReader fixedFormatReader(InputRowSchema inputRowSchema, @Nu .from(partitionHolder) .transform(chunk -> new DruidSegmentInputEntity(segmentLoader, chunk.getObject(), holder.getInterval())); }).iterator(); - final List effectiveDimensions = ReingestionTimelineUtils.getDimensionsToReingest( - dimensions, - inputRowSchema.getDimensionsSpec(), - timeline - ); - - List effectiveMetrics; - if (metrics == null) { - effectiveMetrics = ReingestionTimelineUtils.getUniqueMetrics(timeline); - } else { - effectiveMetrics = metrics; - } - final DruidSegmentInputFormat inputFormat = new DruidSegmentInputFormat( - indexIO, - dimFilter, - effectiveDimensions, - effectiveMetrics - ); + final DruidSegmentInputFormat inputFormat = new DruidSegmentInputFormat(indexIO, dimFilter); return new InputEntityIteratingReader( inputRowSchema, diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidSegmentInputFormat.java b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidSegmentInputFormat.java index 80f87721357c..4d028596ff08 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidSegmentInputFormat.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidSegmentInputFormat.java @@ -27,26 +27,19 @@ import org.apache.druid.segment.IndexIO; import java.io.File; -import java.util.List; public class DruidSegmentInputFormat implements InputFormat { private final IndexIO indexIO; private final DimFilter dimFilter; - private List dimensions; - private List metrics; - DruidSegmentInputFormat( + public DruidSegmentInputFormat( IndexIO indexIO, - DimFilter dimFilter, - List dimensions, - List metrics + DimFilter dimFilter ) { this.indexIO = indexIO; this.dimFilter = dimFilter; - this.dimensions = dimensions; - this.metrics = metrics; } @Override @@ -65,8 +58,9 @@ public InputEntityReader createReader( return new DruidSegmentReader( source, indexIO, - dimensions, - metrics, + inputRowSchema.getTimestampSpec(), + inputRowSchema.getDimensionsSpec(), + inputRowSchema.getColumnsFilter(), dimFilter, temporaryDirectory ); diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidSegmentReader.java b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidSegmentReader.java index 6460ae43d55d..3eb57b30597d 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidSegmentReader.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidSegmentReader.java @@ -21,12 +21,17 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; +import com.google.common.base.Supplier; +import com.google.common.collect.Iterables; 
+import com.google.common.collect.Sets;
+import org.apache.druid.data.input.ColumnsFilter;
 import org.apache.druid.data.input.InputEntity;
 import org.apache.druid.data.input.InputEntity.CleanableFile;
 import org.apache.druid.data.input.InputRow;
 import org.apache.druid.data.input.IntermediateRowParsingReader;
-import org.apache.druid.data.input.MapBasedInputRow;
-import org.apache.druid.java.util.common.DateTimes;
+import org.apache.druid.data.input.impl.DimensionsSpec;
+import org.apache.druid.data.input.impl.MapInputRowParser;
+import org.apache.druid.data.input.impl.TimestampSpec;
 import org.apache.druid.java.util.common.granularity.Granularities;
 import org.apache.druid.java.util.common.guava.Sequence;
 import org.apache.druid.java.util.common.guava.Sequences;
@@ -35,56 +40,62 @@
 import org.apache.druid.java.util.common.io.Closer;
 import org.apache.druid.java.util.common.parsers.CloseableIterator;
 import org.apache.druid.java.util.common.parsers.ParseException;
-import org.apache.druid.query.dimension.DefaultDimensionSpec;
 import org.apache.druid.query.filter.DimFilter;
+import org.apache.druid.segment.BaseDoubleColumnValueSelector;
+import org.apache.druid.segment.BaseFloatColumnValueSelector;
 import org.apache.druid.segment.BaseLongColumnValueSelector;
 import org.apache.druid.segment.BaseObjectColumnValueSelector;
+import org.apache.druid.segment.ColumnProcessorFactory;
+import org.apache.druid.segment.ColumnProcessors;
 import org.apache.druid.segment.Cursor;
 import org.apache.druid.segment.DimensionSelector;
 import org.apache.druid.segment.IndexIO;
 import org.apache.druid.segment.QueryableIndexStorageAdapter;
 import org.apache.druid.segment.VirtualColumns;
 import org.apache.druid.segment.column.ColumnHolder;
+import org.apache.druid.segment.column.ValueType;
 import org.apache.druid.segment.data.IndexedInts;
 import org.apache.druid.segment.filter.Filters;
 import org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter;
 import org.apache.druid.utils.CollectionUtils;
-import org.joda.time.DateTime;
 
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
-import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.NoSuchElementException;
+import java.util.Set;
 
 public class DruidSegmentReader extends IntermediateRowParsingReader<Map<String, Object>>
 {
   private final DruidSegmentInputEntity source;
   private final IndexIO indexIO;
-  private final List<String> dimensions;
-  private final List<String> metrics;
+  private final TimestampSpec timestampSpec;
+  private final DimensionsSpec dimensionsSpec;
+  private final ColumnsFilter columnsFilter;
   private final DimFilter dimFilter;
   private final File temporaryDirectory;
 
   DruidSegmentReader(
-      InputEntity source,
-      IndexIO indexIO,
-      List<String> dimensions,
-      List<String> metrics,
-      DimFilter dimFilter,
-      File temporaryDirectory
+      final InputEntity source,
+      final IndexIO indexIO,
+      final TimestampSpec timestampSpec,
+      final DimensionsSpec dimensionsSpec,
+      final ColumnsFilter columnsFilter,
+      final DimFilter dimFilter,
+      final File temporaryDirectory
   )
   {
     Preconditions.checkArgument(source instanceof DruidSegmentInputEntity);
     this.source = (DruidSegmentInputEntity) source;
     this.indexIO = indexIO;
-    this.dimensions = dimensions;
-    this.metrics = metrics;
+    this.timestampSpec = timestampSpec;
+    this.dimensionsSpec = dimensionsSpec;
+    this.columnsFilter = columnsFilter;
     this.dimFilter = dimFilter;
     this.temporaryDirectory = temporaryDirectory;
   }
@@ -109,10 +120,23 @@ protected CloseableIterator<Map<String, Object>> intermediateRowIterator() throw
         null
     );
 
+    // Retain order of columns from the original segments. Useful for preserving dimension order if we're in
+    // schemaless mode.
+    final Set<String> columnsToRead = Sets.newLinkedHashSet(
+        Iterables.filter(
+            Iterables.concat(
+                Collections.singleton(ColumnHolder.TIME_COLUMN_NAME),
+                storageAdapter.getAdapter().getAvailableDimensions(),
+                storageAdapter.getAdapter().getAvailableMetrics()
+            ),
+            columnsFilter::apply
+        )
+    );
+
     final Sequence<Map<String, Object>> sequence = Sequences.concat(
         Sequences.map(
             cursors,
-            this::cursorToSequence
+            cursor -> cursorToSequence(cursor, columnsToRead)
         )
     );
 
@@ -122,8 +146,7 @@ protected CloseableIterator<Map<String, Object>> intermediateRowIterator() throw
   @Override
   protected List<InputRow> parseInputRows(Map<String, Object> intermediateRow) throws ParseException
   {
-    final DateTime timestamp = (DateTime) intermediateRow.get(ColumnHolder.TIME_COLUMN_NAME);
-    return Collections.singletonList(new MapBasedInputRow(timestamp.getMillis(), dimensions, intermediateRow));
+    return Collections.singletonList(MapInputRowParser.parse(timestampSpec, dimensionsSpec, intermediateRow));
   }
 
   @Override
@@ -137,14 +160,13 @@ protected Map<String, Object> toMap(Map<String, Object> intermediateRow)
-   * Map intermediate rows, selecting the dimensions and metrics of this segment reader.
+   * Map intermediate rows, selecting only the columns in {@code columnsToRead}.
    *
    * @param cursor A cursor
+   *
    * @return A sequence of intermediate rows
    */
-  private Sequence<Map<String, Object>> cursorToSequence(
-      final Cursor cursor
-  )
+  private Sequence<Map<String, Object>> cursorToSequence(final Cursor cursor, final Set<String> columnsToRead)
   {
     return Sequences.simple(
-        () -> new IntermediateRowFromCursorIterator(cursor, dimensions, metrics)
+        () -> new IntermediateRowFromCursorIterator(cursor, columnsToRead)
     );
   }
 
@@ -152,8 +174,9 @@ private Sequence<Map<String, Object>> cursorToSequence(
   * @param sequence    A sequence of intermediate rows generated from a sequence of
   *                    cursors in {@link #intermediateRowIterator()}
   * @param segmentFile The underlying segment file containing the row data
+   *
   * @return A CloseableIterator from a sequence of intermediate rows, closing the underlying segment file
-  *         when the iterator is closed.
+  * when the iterator is closed.
   */
  @VisibleForTesting
  static CloseableIterator<Map<String, Object>> makeCloseableIteratorFromSequenceAndSegmentFile(
@@ -190,6 +213,66 @@ public void close() throws IOException
     };
   }
 
+  /**
+   * Reads columns for {@link IntermediateRowFromCursorIterator}.
+   */
+  private static class IntermediateRowColumnProcessorFactory implements ColumnProcessorFactory<Supplier<Object>>
+  {
+    private static final IntermediateRowColumnProcessorFactory INSTANCE = new IntermediateRowColumnProcessorFactory();
+
+    @Override
+    public ValueType defaultType()
+    {
+      return ValueType.STRING;
+    }
+
+    @Override
+    public Supplier<Object> makeDimensionProcessor(DimensionSelector selector, boolean multiValue)
+    {
+      return () -> {
+        final IndexedInts vals = selector.getRow();
+
+        int valsSize = vals.size();
+        if (valsSize == 1) {
+          return selector.lookupName(vals.get(0));
+        } else if (valsSize > 1) {
+          List<Object> dimVals = new ArrayList<>(valsSize);
+          for (int i = 0; i < valsSize; ++i) {
+            dimVals.add(selector.lookupName(vals.get(i)));
+          }
+
+          return dimVals;
+        }
+
+        return null;
+      };
+    }
+
+    @Override
+    public Supplier<Object> makeFloatProcessor(BaseFloatColumnValueSelector selector)
+    {
+      return () -> selector.isNull() ? null : selector.getFloat();
+    }
+
+    @Override
+    public Supplier<Object> makeDoubleProcessor(BaseDoubleColumnValueSelector selector)
+    {
+      return () -> selector.isNull() ? null : selector.getDouble();
+    }
+
+    @Override
+    public Supplier<Object> makeLongProcessor(BaseLongColumnValueSelector selector)
+    {
+      return () -> selector.isNull() ? null : selector.getLong();
+    }
+
+    @Override
+    public Supplier<Object> makeComplexProcessor(BaseObjectColumnValueSelector<?> selector)
+    {
+      return selector::getObject;
+    }
+  }
+
   /**
-   * Given a {@link Cursor}, a list of dimension names, and a list of metric names, this iterator
-   * returns the rows of the cursor as Map<String, Object> intermediate rows.
+   * Given a {@link Cursor} and a set of column names to read, this iterator returns the rows of
+   * the cursor as Map<String, Object> intermediate rows.
    */
@@ -197,39 +280,25 @@ public void close() throws IOException
   private static class IntermediateRowFromCursorIterator implements Iterator<Map<String, Object>>
   {
     private final Cursor cursor;
-    private final BaseLongColumnValueSelector timestampColumnSelector;
-    private final Map<String, DimensionSelector> dimSelectors;
-    private final Map<String, BaseObjectColumnValueSelector> metSelectors;
+    private final Map<String, Supplier<Object>> columnReaders;
 
     public IntermediateRowFromCursorIterator(
-        Cursor cursor,
-        List<String> dimensionNames,
-        List<String> metricNames
+        final Cursor cursor,
+        final Set<String> columnsToRead
     )
     {
       this.cursor = cursor;
+      this.columnReaders = CollectionUtils.newLinkedHashMapWithExpectedSize(columnsToRead.size());
 
-      timestampColumnSelector = cursor
-          .getColumnSelectorFactory()
-          .makeColumnValueSelector(ColumnHolder.TIME_COLUMN_NAME);
-
-      dimSelectors = new HashMap<>();
-      for (String dim : dimensionNames) {
-        final DimensionSelector dimSelector = cursor
-            .getColumnSelectorFactory()
-            .makeDimensionSelector(new DefaultDimensionSpec(dim, dim));
-        // dimSelector is null if the dimension is not present
-        if (dimSelector != null) {
-          dimSelectors.put(dim, dimSelector);
-        }
-      }
-
-      metSelectors = new HashMap<>();
-      for (String metric : metricNames) {
-        final BaseObjectColumnValueSelector metricSelector = cursor
-            .getColumnSelectorFactory()
-            .makeColumnValueSelector(metric);
-        metSelectors.put(metric, metricSelector);
+      for (String column : columnsToRead) {
+        columnReaders.put(
+            column,
+            ColumnProcessors.makeProcessor(
+                column,
+                IntermediateRowColumnProcessorFactory.INSTANCE,
+                cursor.getColumnSelectorFactory()
+            )
+        );
       }
     }
@@ -245,46 +314,18 @@ public Map<String, Object> next()
       if (!hasNext()) {
         throw new NoSuchElementException();
       }
-      final Map<String, Object> theEvent =
-          CollectionUtils.newLinkedHashMapWithExpectedSize(dimSelectors.size() + metSelectors.size() + 1);
-
-      for (Entry<String, DimensionSelector> dimSelector : dimSelectors.entrySet()) {
-        final String dim = dimSelector.getKey();
-        final DimensionSelector selector = dimSelector.getValue();
-        final IndexedInts vals = selector.getRow();
+      final Map<String, Object> rowMap =
+          CollectionUtils.newLinkedHashMapWithExpectedSize(columnReaders.size());
 
-        int valsSize = vals.size();
-        if (valsSize == 1) {
-          final String dimVal = selector.lookupName(vals.get(0));
-          theEvent.put(dim, dimVal);
-        } else if (valsSize > 1) {
-          List<Object> dimVals = new ArrayList<>(valsSize);
-          for (int i = 0; i < valsSize; ++i) {
-            dimVals.add(selector.lookupName(vals.get(i)));
-          }
-          theEvent.put(dim, dimVals);
-        }
-      }
-
-      for (Entry<String, BaseObjectColumnValueSelector> metSelector : metSelectors.entrySet()) {
-        final String metric = metSelector.getKey();
-        final BaseObjectColumnValueSelector selector = metSelector.getValue();
-        Object value = selector.getObject();
+      for (Entry<String, Supplier<Object>> entry : columnReaders.entrySet()) {
+        final Object value = entry.getValue().get();
         if (value != null) {
-          theEvent.put(metric, value);
+          rowMap.put(entry.getKey(), value);
         }
       }
 
-      // Timestamp is added last because we expect that the time column will always be a date time object.
-      // If it is added earlier, it can be overwritten by metrics or dimenstions with the same name.
- // - // If a user names a metric or dimension `__time` it will be overwritten. This case should be rare since - // __time is reserved for the time column in druid segments. - final long timestamp = timestampColumnSelector.getLong(); - theEvent.put(ColumnHolder.TIME_COLUMN_NAME, DateTimes.utc(timestamp)); - cursor.advance(); - return theEvent; + return rowMap; } } } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/input/InputRowSchemas.java b/indexing-service/src/main/java/org/apache/druid/indexing/input/InputRowSchemas.java new file mode 100644 index 000000000000..f273be7922be --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/input/InputRowSchemas.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.input; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.druid.data.input.ColumnsFilter; +import org.apache.druid.data.input.InputRowSchema; +import org.apache.druid.data.input.impl.DimensionsSpec; +import org.apache.druid.data.input.impl.TimestampSpec; +import org.apache.druid.query.aggregation.AggregatorFactory; +import org.apache.druid.segment.indexing.DataSchema; +import org.apache.druid.segment.transform.Transform; +import org.apache.druid.segment.transform.TransformSpec; + +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Utilities that are helpful when implementing {@link org.apache.druid.data.input.InputEntityReader}. + */ +public class InputRowSchemas +{ + private InputRowSchemas() + { + // No instantiation. + } + + /** + * Creates an {@link InputRowSchema} from a given {@link DataSchema}. + */ + public static InputRowSchema fromDataSchema(final DataSchema dataSchema) + { + return new InputRowSchema( + dataSchema.getTimestampSpec(), + dataSchema.getDimensionsSpec(), + createColumnsFilter( + dataSchema.getTimestampSpec(), + dataSchema.getDimensionsSpec(), + dataSchema.getTransformSpec(), + dataSchema.getAggregators() + ) + ); + } + + /** + * Build a {@link ColumnsFilter} that can filter down the list of columns that must be read after flattening. + * + * @see InputRowSchema#getColumnsFilter() + */ + @VisibleForTesting + static ColumnsFilter createColumnsFilter( + final TimestampSpec timestampSpec, + final DimensionsSpec dimensionsSpec, + final TransformSpec transformSpec, + final AggregatorFactory[] aggregators + ) + { + // We'll need to know what fields are generated from transforms, vs. expected from the raw data. + final Set transformOutputNames = + transformSpec.getTransforms().stream().map(Transform::getName).collect(Collectors.toSet()); + + if (dimensionsSpec.hasCustomDimensions()) { + // We need an inclusion-based filter. 
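+      // Note: transform *outputs* are computed after reading, so only transform *inputs* belong in this list.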
+ final Set inclusions = new HashSet<>(); + + // Add timestamp column. + inclusions.add(timestampSpec.getTimestampColumn()); + + // Add all transform inputs. + inclusions.addAll(transformSpec.getRequiredColumns()); + + // Add all dimension inputs that are *not* transform outputs. + for (String column : dimensionsSpec.getDimensionNames()) { + if (!transformOutputNames.contains(column)) { + inclusions.add(column); + } + } + + // Add all aggregator inputs that are *not* transform outputs. + for (AggregatorFactory aggregator : aggregators) { + for (String column : aggregator.requiredFields()) { + if (!transformOutputNames.contains(column)) { + inclusions.add(column); + } + } + } + + return ColumnsFilter.inclusionBased(inclusions); + } else { + // Schemaless dimensions mode: we need an exclusion-based filter. + // Start from the list of dimension exclusions. + final Set exclusions = new HashSet<>(dimensionsSpec.getDimensionExclusions()); + + // Remove (un-exclude) timestamp column. + exclusions.remove(timestampSpec.getTimestampColumn()); + + // Remove (un-exclude) all transform inputs. + exclusions.removeAll(transformSpec.getRequiredColumns()); + + // Remove (un-exclude) all aggregator inputs that are *not* transform outputs. + for (AggregatorFactory aggregator : aggregators) { + for (String column : aggregator.requiredFields()) { + if (!transformOutputNames.contains(column)) { + exclusions.remove(column); + } + } + } + + return ColumnsFilter.exclusionBased(exclusions); + } + } +} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java index 130f041f7853..5fdffe9f32b8 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java @@ -31,6 +31,7 @@ import org.apache.druid.data.input.impl.DimensionsSpec; import org.apache.druid.data.input.impl.TimedShutoffInputSourceReader; import org.apache.druid.data.input.impl.TimestampSpec; +import org.apache.druid.indexing.input.InputRowSchemas; import org.apache.druid.indexing.overlord.sampler.SamplerResponse.SamplerResponseRow; import org.apache.druid.java.util.common.DateTimes; import org.apache.druid.java.util.common.FileUtils; @@ -191,14 +192,7 @@ private InputSourceReader buildReader( File tempDir ) { - final List metricsNames = Arrays.stream(dataSchema.getAggregators()) - .map(AggregatorFactory::getName) - .collect(Collectors.toList()); - final InputRowSchema inputRowSchema = new InputRowSchema( - dataSchema.getTimestampSpec(), - dataSchema.getDimensionsSpec(), - metricsNames - ); + final InputRowSchema inputRowSchema = InputRowSchemas.fromDataSchema(dataSchema); InputSourceReader reader = inputSource.reader(inputRowSchema, inputFormat, tempDir); diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/SeekableStreamIndexTaskRunner.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/SeekableStreamIndexTaskRunner.java index 8f538310dcd4..eea08672c172 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/SeekableStreamIndexTaskRunner.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/SeekableStreamIndexTaskRunner.java @@ -63,6 +63,7 @@ import org.apache.druid.indexing.common.stats.RowIngestionMetersFactory; import 
org.apache.druid.indexing.common.task.IndexTaskUtils; import org.apache.druid.indexing.common.task.RealtimeIndexTask; +import org.apache.druid.indexing.input.InputRowSchemas; import org.apache.druid.indexing.seekablestream.common.OrderedPartitionableRecord; import org.apache.druid.indexing.seekablestream.common.OrderedSequenceNumber; import org.apache.druid.indexing.seekablestream.common.RecordSupplier; @@ -74,7 +75,6 @@ import org.apache.druid.java.util.common.collect.Utils; import org.apache.druid.java.util.common.parsers.ParseException; import org.apache.druid.java.util.emitter.EmittingLogger; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.indexing.RealtimeIOConfig; import org.apache.druid.segment.realtime.FireDepartment; import org.apache.druid.segment.realtime.FireDepartmentMetrics; @@ -110,7 +110,6 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -248,13 +247,7 @@ public SeekableStreamIndexTaskRunner( this.task = task; this.ioConfig = task.getIOConfig(); this.tuningConfig = task.getTuningConfig(); - this.inputRowSchema = new InputRowSchema( - task.getDataSchema().getTimestampSpec(), - task.getDataSchema().getDimensionsSpec(), - Arrays.stream(task.getDataSchema().getAggregators()) - .map(AggregatorFactory::getName) - .collect(Collectors.toList()) - ); + this.inputRowSchema = InputRowSchemas.fromDataSchema(task.getDataSchema()); this.inputFormat = ioConfig.getInputFormat(); this.parser = parser; this.authorizerMapper = authorizerMapper; diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskTest.java index dbcfd4ad251a..88850031ca99 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskTest.java @@ -1212,7 +1212,11 @@ private void assertIngestionSchema( final DataSchema dataSchema = ingestionSchema.getDataSchema(); Assert.assertEquals(DATA_SOURCE, dataSchema.getDataSource()); - Assert.assertEquals(new TimestampSpec(null, null, null), dataSchema.getTimestampSpec()); + Assert.assertEquals( + new TimestampSpec(ColumnHolder.TIME_COLUMN_NAME, "millis", null), + dataSchema.getTimestampSpec() + ); + Assert.assertEquals( new HashSet<>(expectedDimensionsSpec.getDimensions()), new HashSet<>(dataSchema.getDimensionsSpec().getDimensions()) @@ -1244,11 +1248,6 @@ private void assertIngestionSchema( Assert.assertEquals(expectedSegmentIntervals.get(i), druidInputSource.getInterval()); Assert.assertNull(druidInputSource.getDimFilter()); - Assert.assertEquals( - new HashSet<>(expectedDimensionsSpec.getDimensionNames()), - new HashSet<>(druidInputSource.getDimensions()) - ); - // assert tuningConfig Assert.assertEquals(expectedTuningConfig, ingestionSchema.getTuningConfig()); } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/input/DruidSegmentReaderTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/input/DruidSegmentReaderTest.java index b3f514252256..3a01ec939fc3 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/input/DruidSegmentReaderTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/input/DruidSegmentReaderTest.java @@ -19,23 +19,527 @@ package 
org.apache.druid.indexing.input; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; import org.apache.commons.lang.mutable.MutableBoolean; +import org.apache.druid.common.config.NullHandlingTest; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputEntity.CleanableFile; +import org.apache.druid.data.input.InputRow; +import org.apache.druid.data.input.MapBasedInputRow; +import org.apache.druid.data.input.impl.DimensionsSpec; +import org.apache.druid.data.input.impl.DoubleDimensionSchema; +import org.apache.druid.data.input.impl.StringDimensionSchema; +import org.apache.druid.data.input.impl.TimestampSpec; +import org.apache.druid.hll.HyperLogLogCollector; +import org.apache.druid.hll.HyperLogLogHash; +import org.apache.druid.java.util.common.DateTimes; +import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.guava.BaseSequence; import org.apache.druid.java.util.common.guava.BaseSequence.IteratorMaker; import org.apache.druid.java.util.common.guava.Sequence; import org.apache.druid.java.util.common.parsers.CloseableIterator; +import org.apache.druid.query.aggregation.CountAggregatorFactory; +import org.apache.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory; +import org.apache.druid.query.filter.SelectorDimFilter; +import org.apache.druid.segment.IndexBuilder; +import org.apache.druid.segment.IndexIO; +import org.apache.druid.segment.IndexSpec; +import org.apache.druid.segment.Segment; +import org.apache.druid.segment.TestHelper; +import org.apache.druid.segment.incremental.IncrementalIndex; +import org.apache.druid.segment.incremental.IncrementalIndexSchema; +import org.apache.druid.segment.loading.SegmentLoader; +import org.apache.druid.segment.writeout.OnHeapMemorySegmentWriteOutMediumFactory; +import org.apache.druid.timeline.DataSegment; +import org.joda.time.Interval; import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.TemporaryFolder; import java.io.File; import java.io.IOException; +import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; +import java.util.List; import java.util.Map; -public class DruidSegmentReaderTest +public class DruidSegmentReaderTest extends NullHandlingTest { + @Rule + public TemporaryFolder temporaryFolder = new TemporaryFolder(); + + private File segmentDirectory; + + private final IndexIO indexIO = TestHelper.getTestIndexIO(); + + @Before + public void setUp() throws IOException + { + // Write a segment with two rows in it, with columns: s (string), d (double), cnt (long), met_s (complex). 
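+    // (The two rows below land at 2000-01-01T00Z and 2000-01-01T01Z; the reader tests assert against those instants.)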
+ final IncrementalIndex incrementalIndex = + IndexBuilder.create() + .schema( + new IncrementalIndexSchema.Builder() + .withDimensionsSpec( + new DimensionsSpec( + ImmutableList.of( + StringDimensionSchema.create("s"), + new DoubleDimensionSchema("d") + ) + ) + ) + .withMetrics( + new CountAggregatorFactory("cnt"), + new HyperUniquesAggregatorFactory("met_s", "s") + ) + .withRollup(false) + .build() + ) + .rows( + ImmutableList.of( + new MapBasedInputRow( + DateTimes.of("2000"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("s", "foo") + .put("d", 1.23) + .build() + ), + new MapBasedInputRow( + DateTimes.of("2000T01"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("s", "bar") + .put("d", 4.56) + .build() + ) + ) + ) + .buildIncrementalIndex(); + + segmentDirectory = temporaryFolder.newFolder(); + + try { + TestHelper.getTestIndexMergerV9( + OnHeapMemorySegmentWriteOutMediumFactory.instance() + ).persist( + incrementalIndex, + segmentDirectory, + new IndexSpec(), + null + ); + } + finally { + incrementalIndex.close(); + } + } + + @Test + public void testReader() throws IOException + { + final DruidSegmentReader reader = new DruidSegmentReader( + makeInputEntity(Intervals.of("2000/P1D")), + indexIO, + new TimestampSpec("__time", "millis", DateTimes.of("1971")), + new DimensionsSpec( + ImmutableList.of( + StringDimensionSchema.create("s"), + new DoubleDimensionSchema("d") + ) + ), + ColumnsFilter.all(), + null, + temporaryFolder.newFolder() + ); + + Assert.assertEquals( + ImmutableList.of( + new MapBasedInputRow( + DateTimes.of("2000"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("__time", DateTimes.of("2000T").getMillis()) + .put("s", "foo") + .put("d", 1.23d) + .put("cnt", 1L) + .put("met_s", makeHLLC("foo")) + .build() + ), + new MapBasedInputRow( + DateTimes.of("2000T01"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("__time", DateTimes.of("2000T01").getMillis()) + .put("s", "bar") + .put("d", 4.56d) + .put("cnt", 1L) + .put("met_s", makeHLLC("bar")) + .build() + ) + ), + readRows(reader) + ); + } + + @Test + public void testReaderAutoTimestampFormat() throws IOException + { + final DruidSegmentReader reader = new DruidSegmentReader( + makeInputEntity(Intervals.of("2000/P1D")), + indexIO, + new TimestampSpec("__time", "auto", DateTimes.of("1971")), + new DimensionsSpec( + ImmutableList.of( + StringDimensionSchema.create("s"), + new DoubleDimensionSchema("d") + ) + ), + ColumnsFilter.all(), + null, + temporaryFolder.newFolder() + ); + + Assert.assertEquals( + ImmutableList.of( + new MapBasedInputRow( + DateTimes.of("2000"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("__time", DateTimes.of("2000T").getMillis()) + .put("s", "foo") + .put("d", 1.23d) + .put("cnt", 1L) + .put("met_s", makeHLLC("foo")) + .build() + ), + new MapBasedInputRow( + DateTimes.of("2000T01"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("__time", DateTimes.of("2000T01").getMillis()) + .put("s", "bar") + .put("d", 4.56d) + .put("cnt", 1L) + .put("met_s", makeHLLC("bar")) + .build() + ) + ), + readRows(reader) + ); + } + + @Test + public void testReaderWithDimensionExclusions() throws IOException + { + final DruidSegmentReader reader = new DruidSegmentReader( + makeInputEntity(Intervals.of("2000/P1D")), + indexIO, + new TimestampSpec("__time", "millis", DateTimes.of("1971")), + new DimensionsSpec( + ImmutableList.of(), + ImmutableList.of("__time", "s", "cnt", "met_s"), + ImmutableList.of() + ), + ColumnsFilter.all(), + 
null, + temporaryFolder.newFolder() + ); + + Assert.assertEquals( + ImmutableList.of( + new MapBasedInputRow( + DateTimes.of("2000"), + ImmutableList.of("d"), + ImmutableMap.builder() + .put("__time", DateTimes.of("2000T").getMillis()) + .put("s", "foo") + .put("d", 1.23d) + .put("cnt", 1L) + .put("met_s", makeHLLC("foo")) + .build() + ), + new MapBasedInputRow( + DateTimes.of("2000T01"), + ImmutableList.of("d"), + ImmutableMap.builder() + .put("__time", DateTimes.of("2000T01").getMillis()) + .put("s", "bar") + .put("d", 4.56d) + .put("cnt", 1L) + .put("met_s", makeHLLC("bar")) + .build() + ) + ), + readRows(reader) + ); + } + + @Test + public void testReaderWithInclusiveColumnsFilter() throws IOException + { + final DruidSegmentReader reader = new DruidSegmentReader( + makeInputEntity(Intervals.of("2000/P1D")), + indexIO, + new TimestampSpec("__time", "millis", DateTimes.of("1971")), + new DimensionsSpec( + ImmutableList.of( + StringDimensionSchema.create("s"), + new DoubleDimensionSchema("d") + ) + ), + ColumnsFilter.inclusionBased(ImmutableSet.of("__time", "s", "d")), + null, + temporaryFolder.newFolder() + ); + + Assert.assertEquals( + ImmutableList.of( + new MapBasedInputRow( + DateTimes.of("2000"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("__time", DateTimes.of("2000T").getMillis()) + .put("s", "foo") + .put("d", 1.23d) + .build() + ), + new MapBasedInputRow( + DateTimes.of("2000T01"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("__time", DateTimes.of("2000T01").getMillis()) + .put("s", "bar") + .put("d", 4.56d) + .build() + ) + ), + readRows(reader) + ); + } + + @Test + public void testReaderWithInclusiveColumnsFilterNoTimestamp() throws IOException + { + final DruidSegmentReader reader = new DruidSegmentReader( + makeInputEntity(Intervals.of("2000/P1D")), + indexIO, + new TimestampSpec("__time", "millis", DateTimes.of("1971")), + new DimensionsSpec( + ImmutableList.of( + StringDimensionSchema.create("s"), + new DoubleDimensionSchema("d") + ) + ), + ColumnsFilter.inclusionBased(ImmutableSet.of("s", "d")), + null, + temporaryFolder.newFolder() + ); + + Assert.assertEquals( + ImmutableList.of( + new MapBasedInputRow( + DateTimes.of("1971"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("s", "foo") + .put("d", 1.23d) + .build() + ), + new MapBasedInputRow( + DateTimes.of("1971"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("s", "bar") + .put("d", 4.56d) + .build() + ) + ), + readRows(reader) + ); + } + + @Test + public void testReaderWithFilter() throws IOException + { + final DruidSegmentReader reader = new DruidSegmentReader( + makeInputEntity(Intervals.of("2000/P1D")), + indexIO, + new TimestampSpec("__time", "millis", DateTimes.of("1971")), + new DimensionsSpec( + ImmutableList.of( + StringDimensionSchema.create("s"), + new DoubleDimensionSchema("d") + ) + ), + ColumnsFilter.all(), + new SelectorDimFilter("d", "1.23", null), + temporaryFolder.newFolder() + ); + + Assert.assertEquals( + ImmutableList.of( + new MapBasedInputRow( + DateTimes.of("2000"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("__time", DateTimes.of("2000T").getMillis()) + .put("s", "foo") + .put("d", 1.23d) + .put("cnt", 1L) + .put("met_s", makeHLLC("foo")) + .build() + ) + ), + readRows(reader) + ); + } + + @Test + public void testReaderTimestampFromDouble() throws IOException + { + final DruidSegmentReader reader = new DruidSegmentReader( + makeInputEntity(Intervals.of("2000/P1D")), + indexIO, + new TimestampSpec("d", 
"posix", null), + new DimensionsSpec( + ImmutableList.of( + StringDimensionSchema.create("s"), + new DoubleDimensionSchema("d") + ) + ), + ColumnsFilter.all(), + null, + temporaryFolder.newFolder() + ); + + Assert.assertEquals( + ImmutableList.of( + new MapBasedInputRow( + DateTimes.of("1970-01-01T00:00:01.000Z"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("__time", DateTimes.of("2000T").getMillis()) + .put("s", "foo") + .put("d", 1.23d) + .put("cnt", 1L) + .put("met_s", makeHLLC("foo")) + .build() + ), + new MapBasedInputRow( + DateTimes.of("1970-01-01T00:00:04.000Z"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("__time", DateTimes.of("2000T01").getMillis()) + .put("s", "bar") + .put("d", 4.56d) + .put("cnt", 1L) + .put("met_s", makeHLLC("bar")) + .build() + ) + ), + readRows(reader) + ); + } + + @Test + public void testReaderTimestampAsPosixIncorrectly() throws IOException + { + final DruidSegmentReader reader = new DruidSegmentReader( + makeInputEntity(Intervals.of("2000/P1D")), + indexIO, + new TimestampSpec("__time", "posix", null), + new DimensionsSpec( + ImmutableList.of( + StringDimensionSchema.create("s"), + new DoubleDimensionSchema("d") + ) + ), + ColumnsFilter.all(), + null, + temporaryFolder.newFolder() + ); + + Assert.assertEquals( + ImmutableList.of( + new MapBasedInputRow( + DateTimes.of("31969-04-01T00:00:00.000Z"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("__time", DateTimes.of("2000T").getMillis()) + .put("s", "foo") + .put("d", 1.23d) + .put("cnt", 1L) + .put("met_s", makeHLLC("foo")) + .build() + ), + new MapBasedInputRow( + DateTimes.of("31969-05-12T16:00:00.000Z"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("__time", DateTimes.of("2000T01").getMillis()) + .put("s", "bar") + .put("d", 4.56d) + .put("cnt", 1L) + .put("met_s", makeHLLC("bar")) + .build() + ) + ), + readRows(reader) + ); + } + + @Test + public void testReaderTimestampSpecDefault() throws IOException + { + final DruidSegmentReader reader = new DruidSegmentReader( + makeInputEntity(Intervals.of("2000/P1D")), + indexIO, + new TimestampSpec(null, null, DateTimes.of("1971")), + new DimensionsSpec( + ImmutableList.of( + StringDimensionSchema.create("s"), + new DoubleDimensionSchema("d") + ) + ), + ColumnsFilter.all(), + null, + temporaryFolder.newFolder() + ); + + Assert.assertEquals( + ImmutableList.of( + new MapBasedInputRow( + DateTimes.of("1971"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("__time", DateTimes.of("2000T").getMillis()) + .put("s", "foo") + .put("d", 1.23d) + .put("cnt", 1L) + .put("met_s", makeHLLC("foo")) + .build() + ), + new MapBasedInputRow( + DateTimes.of("1971"), + ImmutableList.of("s", "d"), + ImmutableMap.builder() + .put("__time", DateTimes.of("2000T01").getMillis()) + .put("s", "bar") + .put("d", 4.56d) + .put("cnt", 1L) + .put("met_s", makeHLLC("bar")) + .build() + ) + ), + readRows(reader) + ); + } + @Test public void testMakeCloseableIteratorFromSequenceAndSegmentFileCloseYielderOnClose() throws IOException { @@ -80,4 +584,65 @@ public void close() Assert.assertTrue("File is not closed", isFileClosed.booleanValue()); Assert.assertTrue("Sequence is not closed", isSequenceClosed.booleanValue()); } + + private DruidSegmentInputEntity makeInputEntity(final Interval interval) + { + return new DruidSegmentInputEntity( + new SegmentLoader() + { + @Override + public boolean isSegmentLoaded(DataSegment segment) + { + throw new UnsupportedOperationException("unused"); + } + + @Override + public 
Segment getSegment(DataSegment segment, boolean lazy) + { + throw new UnsupportedOperationException("unused"); + } + + @Override + public File getSegmentFiles(DataSegment segment) + { + return segmentDirectory; + } + + @Override + public void cleanup(DataSegment segment) + { + throw new UnsupportedOperationException("unused"); + } + }, + DataSegment.builder() + .dataSource("ds") + .dimensions(ImmutableList.of("s", "d")) + .metrics(ImmutableList.of("cnt", "met_s")) + .interval(Intervals.of("2000/P1D")) + .version("1") + .size(0) + .build(), + interval + ); + } + + private List readRows(final DruidSegmentReader reader) throws IOException + { + final List rows = new ArrayList<>(); + try (final CloseableIterator> iterator = reader.intermediateRowIterator()) { + while (iterator.hasNext()) { + rows.addAll(reader.parseInputRows(iterator.next())); + } + } + return rows; + } + + private static HyperLogLogCollector makeHLLC(final String... values) + { + final HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); + for (String value : values) { + collector.add(HyperLogLogHash.getDefault().hash(value)); + } + return collector; + } } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/input/InputRowSchemasTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/input/InputRowSchemasTest.java new file mode 100644 index 000000000000..7241a27cc679 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/input/InputRowSchemasTest.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.input; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import org.apache.druid.common.config.NullHandlingTest; +import org.apache.druid.data.input.ColumnsFilter; +import org.apache.druid.data.input.impl.DimensionsSpec; +import org.apache.druid.data.input.impl.StringDimensionSchema; +import org.apache.druid.data.input.impl.TimestampSpec; +import org.apache.druid.math.expr.ExprMacroTable; +import org.apache.druid.query.aggregation.AggregatorFactory; +import org.apache.druid.query.aggregation.LongSumAggregatorFactory; +import org.apache.druid.query.filter.SelectorDimFilter; +import org.apache.druid.segment.transform.ExpressionTransform; +import org.apache.druid.segment.transform.TransformSpec; +import org.junit.Assert; +import org.junit.Test; + +public class InputRowSchemasTest extends NullHandlingTest +{ + @Test + public void test_createColumnsFilter_normal() + { + final ColumnsFilter columnsFilter = InputRowSchemas.createColumnsFilter( + new TimestampSpec("ts", "auto", null), + new DimensionsSpec( + ImmutableList.of(StringDimensionSchema.create("foo")), + ImmutableList.of(), + ImmutableList.of() + ), + new TransformSpec( + new SelectorDimFilter("bar", "x", null), + ImmutableList.of( + new ExpressionTransform("baz", "qux + 3", ExprMacroTable.nil()) + ) + ), + new AggregatorFactory[]{ + new LongSumAggregatorFactory("billy", "bob") + } + ); + + Assert.assertEquals( + ColumnsFilter.inclusionBased( + ImmutableSet.of( + "ts", + "foo", + "bar", + "qux", + "bob" + ) + ), + columnsFilter + ); + } + + @Test + public void test_createColumnsFilter_schemaless() + { + final ColumnsFilter columnsFilter = InputRowSchemas.createColumnsFilter( + new TimestampSpec("ts", "auto", null), + new DimensionsSpec( + ImmutableList.of(), + ImmutableList.of("ts", "foo", "bar", "qux", "bob"), + ImmutableList.of() + ), + new TransformSpec( + new SelectorDimFilter("bar", "x", null), + ImmutableList.of( + new ExpressionTransform("baz", "qux + 3", ExprMacroTable.nil()) + ) + ), + new AggregatorFactory[]{ + new LongSumAggregatorFactory("billy", "bob") + } + ); + + Assert.assertEquals( + ColumnsFilter.exclusionBased( + ImmutableSet.of( + "foo" + ) + ), + columnsFilter + ); + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/RecordSupplierInputSourceTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/RecordSupplierInputSourceTest.java index 10d6bf857a45..637504b9076e 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/RecordSupplierInputSourceTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/RecordSupplierInputSourceTest.java @@ -21,6 +21,7 @@ import com.google.common.collect.Maps; import org.apache.commons.lang3.RandomStringUtils; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputFormat; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowSchema; @@ -78,7 +79,7 @@ public void testRead() throws IOException new InputRowSchema( new TimestampSpec("col_0", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(colNames.subList(1, colNames.size()))), - Collections.emptyList() + ColumnsFilter.all() ), inputFormat, temporaryFolder.newFolder() diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/StreamChunkParserTest.java 
b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/StreamChunkParserTest.java index 10732921eb8d..04abf8b8e34a 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/StreamChunkParserTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/StreamChunkParserTest.java @@ -20,6 +20,7 @@ package org.apache.druid.indexing.seekablestream; import com.google.common.collect.Iterables; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputEntity; import org.apache.druid.data.input.InputEntityReader; import org.apache.druid.data.input.InputRow; @@ -89,7 +90,7 @@ public void testWithNullParserAndInputformatParseProperly() throws IOException final StreamChunkParser chunkParser = new StreamChunkParser( null, inputFormat, - new InputRowSchema(TIMESTAMP_SPEC, DimensionsSpec.EMPTY, Collections.emptyList()), + new InputRowSchema(TIMESTAMP_SPEC, DimensionsSpec.EMPTY, ColumnsFilter.all()), TransformSpec.NONE, temporaryFolder.newFolder() ); @@ -130,7 +131,7 @@ public void testBothParserAndInputFormatParseProperlyUsingInputFormat() throws I final StreamChunkParser chunkParser = new StreamChunkParser( parser, inputFormat, - new InputRowSchema(TIMESTAMP_SPEC, DimensionsSpec.EMPTY, Collections.emptyList()), + new InputRowSchema(TIMESTAMP_SPEC, DimensionsSpec.EMPTY, ColumnsFilter.all()), TransformSpec.NONE, temporaryFolder.newFolder() ); diff --git a/processing/src/main/java/org/apache/druid/segment/transform/ExpressionTransform.java b/processing/src/main/java/org/apache/druid/segment/transform/ExpressionTransform.java index 16bad318a7fc..2ace9b06bf14 100644 --- a/processing/src/main/java/org/apache/druid/segment/transform/ExpressionTransform.java +++ b/processing/src/main/java/org/apache/druid/segment/transform/ExpressionTransform.java @@ -23,6 +23,7 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; +import com.google.common.base.Suppliers; import org.apache.druid.data.input.Row; import org.apache.druid.math.expr.Expr; import org.apache.druid.math.expr.ExprMacroTable; @@ -32,12 +33,15 @@ import java.util.List; import java.util.Objects; +import java.util.Set; +import java.util.function.Supplier; public class ExpressionTransform implements Transform { private final String name; private final String expression; private final ExprMacroTable macroTable; + private final Supplier parsedExpression; @JsonCreator public ExpressionTransform( @@ -49,6 +53,9 @@ public ExpressionTransform( this.name = Preconditions.checkNotNull(name, "name"); this.expression = Preconditions.checkNotNull(expression, "expression"); this.macroTable = macroTable; + this.parsedExpression = Suppliers.memoize( + () -> Parser.parse(expression, Preconditions.checkNotNull(this.macroTable, "macroTable")) + )::get; } @JsonProperty @@ -67,8 +74,13 @@ public String getExpression() @Override public RowFunction getRowFunction() { - final Expr expr = Parser.parse(expression, Preconditions.checkNotNull(this.macroTable, "macroTable")); - return new ExpressionRowFunction(expr); + return new ExpressionRowFunction(parsedExpression.get()); + } + + @Override + public Set getRequiredColumns() + { + return parsedExpression.get().analyzeInputs().getRequiredBindings(); } static class ExpressionRowFunction implements RowFunction diff --git a/processing/src/main/java/org/apache/druid/segment/transform/Transform.java 
b/processing/src/main/java/org/apache/druid/segment/transform/Transform.java index a481a4c08a11..8b6f75fa2d81 100644 --- a/processing/src/main/java/org/apache/druid/segment/transform/Transform.java +++ b/processing/src/main/java/org/apache/druid/segment/transform/Transform.java @@ -23,6 +23,8 @@ import com.fasterxml.jackson.annotation.JsonTypeInfo; import org.apache.druid.guice.annotations.ExtensionPoint; +import java.util.Set; + /** * A row transform that is part of a {@link TransformSpec}. Transforms allow adding new fields to input rows. Each * one has a "name" (the name of the new field) which can be referred to by DimensionSpecs, AggregatorFactories, etc. @@ -52,4 +54,9 @@ public interface Transform * as output. */ RowFunction getRowFunction(); + + /** + * Returns the names of all columns that this transform is going to read. + */ + Set getRequiredColumns(); } diff --git a/processing/src/main/java/org/apache/druid/segment/transform/TransformSpec.java b/processing/src/main/java/org/apache/druid/segment/transform/TransformSpec.java index 6de7ac9363c8..1391da394b01 100644 --- a/processing/src/main/java/org/apache/druid/segment/transform/TransformSpec.java +++ b/processing/src/main/java/org/apache/druid/segment/transform/TransformSpec.java @@ -126,6 +126,21 @@ public Transformer toTransformer() return new Transformer(this); } + public Set getRequiredColumns() + { + final Set requiredColumns = new HashSet<>(); + + if (filter != null) { + requiredColumns.addAll(filter.getRequiredColumns()); + } + + for (Transform transform : transforms) { + requiredColumns.addAll(transform.getRequiredColumns()); + } + + return requiredColumns; + } + @Override public boolean equals(final Object o) { diff --git a/server/src/test/java/org/apache/druid/metadata/input/SqlInputSourceTest.java b/server/src/test/java/org/apache/druid/metadata/input/SqlInputSourceTest.java index 7afa88894f26..1418fc630a98 100644 --- a/server/src/test/java/org/apache/druid/metadata/input/SqlInputSourceTest.java +++ b/server/src/test/java/org/apache/druid/metadata/input/SqlInputSourceTest.java @@ -27,6 +27,7 @@ import nl.jqno.equalsverifier.EqualsVerifier; import org.apache.commons.dbcp2.BasicDataSource; import org.apache.commons.io.FileUtils; +import org.apache.druid.data.input.ColumnsFilter; import org.apache.druid.data.input.InputFormat; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.InputRowSchema; @@ -53,7 +54,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Objects; @@ -79,7 +79,7 @@ public class SqlInputSourceTest new ArrayList<>(), new ArrayList<>() ), - Collections.emptyList() + ColumnsFilter.all() ); @Rule public final TestDerbyConnector.DerbyConnectorRule derbyConnectorRule = new TestDerbyConnector.DerbyConnectorRule(); diff --git a/web-console/src/utils/ingestion-spec.tsx b/web-console/src/utils/ingestion-spec.tsx index d6ec9451f71e..b8cdfe1a143c 100644 --- a/web-console/src/utils/ingestion-spec.tsx +++ b/web-console/src/utils/ingestion-spec.tsx @@ -1129,30 +1129,6 @@ export function getIoConfigFormFields(ingestionComboType: IngestionComboType): F
</p>
), }, - { - name: 'inputSource.dimensions', - label: 'Dimensions', - type: 'string-array', - placeholder: '(optional)', - info: ( - <p>
- The list of dimensions to select. If left empty, no dimensions are returned. If left - null or not defined, all dimensions are returned. - </p>
- ), }, - { - name: 'inputSource.metrics', - label: 'Metrics', - type: 'string-array', - placeholder: '(optional)', - info: ( - <p>
- The list of metrics to select. If left empty, no metrics are returned. If left null or - not defined, all metrics are selected. - </p>
- ), - }, { name: 'inputSource.filter', label: 'Filter', diff --git a/web-console/src/views/load-data-view/load-data-view.tsx b/web-console/src/views/load-data-view/load-data-view.tsx index be13c22dfbfd..21ae13d43d07 100644 --- a/web-console/src/views/load-data-view/load-data-view.tsx +++ b/web-console/src/views/load-data-view/load-data-view.tsx @@ -1127,7 +1127,7 @@ export class LoadDataView extends React.PureComponent Date: Wed, 12 Aug 2020 00:30:56 -0700 Subject: [PATCH 02/24] Various fixups. --- .../druid/data/input/ColumnsFilter.java | 2 +- docs/ingestion/native-batch.md | 6 ++-- .../common/ReingestionTimelineUtils.java | 1 + .../IngestSegmentFirehoseFactory.java | 1 + ...arallel_druid_input_source_index_task.json | 3 +- ...pedia_reindex_druid_input_source_task.json | 2 +- ...uid_input_source_task_with_transforms.json | 2 +- .../segment/indexing/TransformSpecTest.java | 31 ++++++++++++++++++- website/.spelling | 1 + 9 files changed, 41 insertions(+), 8 deletions(-) diff --git a/core/src/main/java/org/apache/druid/data/input/ColumnsFilter.java b/core/src/main/java/org/apache/druid/data/input/ColumnsFilter.java index f391e7e41c40..8506ffb735f9 100644 --- a/core/src/main/java/org/apache/druid/data/input/ColumnsFilter.java +++ b/core/src/main/java/org/apache/druid/data/input/ColumnsFilter.java @@ -62,7 +62,7 @@ public static ColumnsFilter exclusionBased(final Set exclusions) /** * Check if a column should be included or not. */ - public abstract boolean apply(final String column); + public abstract boolean apply(String column); public static class InclusionBased extends ColumnsFilter { diff --git a/docs/ingestion/native-batch.md b/docs/ingestion/native-batch.md index 2dfe96428174..5badf841ab37 100644 --- a/docs/ingestion/native-batch.md +++ b/docs/ingestion/native-batch.md @@ -1269,9 +1269,9 @@ of milliseconds since the epoch (January 1, 1970 00:00:00 UTC). It is common to want the output timestamp to be equivalent to the input timestamp. In this case, set the timestamp column to `__time` and the format to `auto` or `millis`. -It is OK for the input and output datasources to be the same. In this case, the reindexed data will overwrite the -previous data. Generally, if you are going to do this, it is a good idea to test out your reindexing by writing to -a separate datasource before overwriting your main one. +It is OK for the input and output datasources to be the same. In this case, newly generated data will overwrite the +previous data for the intervals specified in the `granularitySpec`. Generally, if you are going to do this, it is a +good idea to test out your reindexing by writing to a separate datasource before overwriting your main one. An example task spec is shown below. It reads from a hypothetical raw datasource `wikipedia_raw` and creates a new rolled-up datasource `wikipedia_rollup` by grouping on hour, "countryName", and "page". 
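To make the `Transform#getRequiredColumns` contract from the first patch concrete, here is a minimal sketch of how a `TransformSpec` reports its inputs. The filter, transform, and field names are invented for illustration and are not from the patch:

```java
import com.google.common.collect.ImmutableList;
import org.apache.druid.math.expr.ExprMacroTable;
import org.apache.druid.query.filter.SelectorDimFilter;
import org.apache.druid.segment.transform.ExpressionTransform;
import org.apache.druid.segment.transform.TransformSpec;

import java.util.Set;

public class RequiredColumnsSketch
{
  public static void main(String[] args)
  {
    // Hypothetical spec: the filter reads "country"; the transform reads
    // "first" and "last".
    final TransformSpec transformSpec = new TransformSpec(
        new SelectorDimFilter("country", "US", null),
        ImmutableList.of(
            new ExpressionTransform("fullName", "concat(first, last)", ExprMacroTable.nil())
        )
    );

    // The union of the filter's and transforms' input columns:
    // [country, first, last]. This is what lets DruidInputSource read only
    // the columns a reindexing task actually uses.
    final Set<String> required = transformSpec.getRequiredColumns();
    System.out.println(required);
  }
}
```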
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/ReingestionTimelineUtils.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/ReingestionTimelineUtils.java index b1a2fb5ddd94..8714fa6933ac 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/ReingestionTimelineUtils.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/ReingestionTimelineUtils.java @@ -37,6 +37,7 @@ /** * @deprecated only used by {@link org.apache.druid.indexing.firehose.IngestSegmentFirehoseFactory} */ +@Deprecated public class ReingestionTimelineUtils { /** diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/firehose/IngestSegmentFirehoseFactory.java b/indexing-service/src/main/java/org/apache/druid/indexing/firehose/IngestSegmentFirehoseFactory.java index 1defe67cf14c..7039cb32645a 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/firehose/IngestSegmentFirehoseFactory.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/firehose/IngestSegmentFirehoseFactory.java @@ -64,6 +64,7 @@ /** * @deprecated use {@link DruidInputSource} instead */ +@Deprecated public class IngestSegmentFirehoseFactory implements FiniteFirehoseFactory> { private static final EmittingLogger log = new EmittingLogger(IngestSegmentFirehoseFactory.class); diff --git a/integration-tests/src/test/resources/indexer/wikipedia_parallel_druid_input_source_index_task.json b/integration-tests/src/test/resources/indexer/wikipedia_parallel_druid_input_source_index_task.json index 91702a413574..5a6402e15967 100644 --- a/integration-tests/src/test/resources/indexer/wikipedia_parallel_druid_input_source_index_task.json +++ b/integration-tests/src/test/resources/indexer/wikipedia_parallel_druid_input_source_index_task.json @@ -10,7 +10,8 @@ ] }, "timestampSpec": { - "column": "timestamp" + "column": "__time", + "format": "millis" }, "metricsSpec": [ { diff --git a/integration-tests/src/test/resources/indexer/wikipedia_reindex_druid_input_source_task.json b/integration-tests/src/test/resources/indexer/wikipedia_reindex_druid_input_source_task.json index 3a5934cf4d37..cf2415c2b45c 100644 --- a/integration-tests/src/test/resources/indexer/wikipedia_reindex_druid_input_source_task.json +++ b/integration-tests/src/test/resources/indexer/wikipedia_reindex_druid_input_source_task.json @@ -24,7 +24,7 @@ }, "timestampSpec": { "column": "__time", - "format": "iso" + "format": "millis" }, "dimensionsSpec": { "dimensionExclusions" : ["robot", "continent"] diff --git a/integration-tests/src/test/resources/indexer/wikipedia_reindex_druid_input_source_task_with_transforms.json b/integration-tests/src/test/resources/indexer/wikipedia_reindex_druid_input_source_task_with_transforms.json index 3e8a44c5c592..2c2b0372a56c 100644 --- a/integration-tests/src/test/resources/indexer/wikipedia_reindex_druid_input_source_task_with_transforms.json +++ b/integration-tests/src/test/resources/indexer/wikipedia_reindex_druid_input_source_task_with_transforms.json @@ -24,7 +24,7 @@ }, "timestampSpec": { "column": "__time", - "format": "iso" + "format": "millis" }, "dimensionsSpec": { "dimensions": [ diff --git a/server/src/test/java/org/apache/druid/segment/indexing/TransformSpecTest.java b/server/src/test/java/org/apache/druid/segment/indexing/TransformSpecTest.java index 8102a719e8e6..8532ff03909f 100644 --- a/server/src/test/java/org/apache/druid/segment/indexing/TransformSpecTest.java +++ 
b/server/src/test/java/org/apache/druid/segment/indexing/TransformSpecTest.java @@ -22,6 +22,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import org.apache.druid.common.config.NullHandlingTest; import org.apache.druid.data.input.InputRow; import org.apache.druid.data.input.impl.DimensionsSpec; import org.apache.druid.data.input.impl.InputRowParser; @@ -40,7 +42,7 @@ import java.util.Map; -public class TransformSpecTest +public class TransformSpecTest extends NullHandlingTest { private static final MapInputRowParser PARSER = new MapInputRowParser( new TimeAndDimsParseSpec( @@ -79,6 +81,11 @@ public void testTransforms() ) ); + Assert.assertEquals( + ImmutableSet.of("x", "y", "a", "b", "f", "g"), + transformSpec.getRequiredColumns() + ); + final InputRowParser> parser = transformSpec.decorate(PARSER); final InputRow row = parser.parseBatch(ROW1).get(0); @@ -107,6 +114,11 @@ public void testTransformOverwriteField() ) ); + Assert.assertEquals( + ImmutableSet.of("x", "y"), + transformSpec.getRequiredColumns() + ); + final InputRowParser> parser = transformSpec.decorate(PARSER); final InputRow row = parser.parseBatch(ROW1).get(0); @@ -138,6 +150,12 @@ public void testFilterOnTransforms() ) ); + Assert.assertEquals( + ImmutableSet.of("x", "f", "g", "y", "a", "b"), + transformSpec.getRequiredColumns() + ); + + final InputRowParser> parser = transformSpec.decorate(PARSER); Assert.assertNotNull(parser.parseBatch(ROW1).get(0)); Assert.assertNull(parser.parseBatch(ROW2).get(0)); @@ -153,6 +171,12 @@ public void testTransformTimeFromOtherFields() ) ); + Assert.assertEquals( + ImmutableSet.of("a", "b"), + transformSpec.getRequiredColumns() + ); + + final InputRowParser> parser = transformSpec.decorate(PARSER); final InputRow row = parser.parseBatch(ROW1).get(0); @@ -171,6 +195,11 @@ public void testTransformTimeFromTime() ) ); + Assert.assertEquals( + ImmutableSet.of("__time"), + transformSpec.getRequiredColumns() + ); + final InputRowParser> parser = transformSpec.decorate(PARSER); final InputRow row = parser.parseBatch(ROW1).get(0); diff --git a/website/.spelling b/website/.spelling index 50e27f00c38b..02bd48f3e50b 100644 --- a/website/.spelling +++ b/website/.spelling @@ -995,6 +995,7 @@ baseDir chatHandlerNumRetries chatHandlerTimeout connectorConfig +countryName dataSchema's foldCase forceGuaranteedRollup From 94046615f31796f14a88171e42352189449c8748 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Wed, 12 Aug 2020 02:29:06 -0700 Subject: [PATCH 03/24] Uncomment incorrectly commented lines. 
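One detail of the `ExpressionTransform` change in the first patch worth calling out: it parses the expression lazily and exactly once by bridging Guava's memoizing supplier to `java.util.function.Supplier` with a `::get` method reference. A self-contained sketch of that idiom (the demo class and computation are hypothetical):

```java
import com.google.common.base.Suppliers;

import java.util.function.Supplier;

public class MemoizeDemo
{
  public static void main(String[] args)
  {
    // Guava's Suppliers.memoize gives thread-safe once-only evaluation;
    // ::get adapts the result to java.util.function.Supplier.
    final Supplier<Long> parsed = Suppliers.memoize(() -> {
      System.out.println("parsing...");  // runs only on the first get()
      return 42L;
    })::get;

    System.out.println(parsed.get()); // prints "parsing..." then 42
    System.out.println(parsed.get()); // prints 42 only
  }
}
```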
--- .../druid/indexing/common/task/InputSourceProcessor.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/InputSourceProcessor.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/InputSourceProcessor.java index 76ac510d6041..3a62a9039c75 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/InputSourceProcessor.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/InputSourceProcessor.java @@ -177,9 +177,9 @@ private void handleParseException(ParseException e) buildSegmentsMeters.incrementUnparseable(); } -// if (logParseExceptions) { + if (logParseExceptions) { LOG.error(e, "Encountered parse exception"); -// } + } if (buildSegmentsSavedParseExceptions != null) { buildSegmentsSavedParseExceptions.add(e); From 6a4a97eb8b803545fcec896d592bfb21d537db86 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Wed, 12 Aug 2020 02:30:55 -0700 Subject: [PATCH 04/24] Move TransformSpecTest to the proper module. --- .../apache/druid/segment/transform}/TransformSpecTest.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) rename {server/src/test/java/org/apache/druid/segment/indexing => processing/src/test/java/org/apache/druid/segment/transform}/TransformSpecTest.java (98%) diff --git a/server/src/test/java/org/apache/druid/segment/indexing/TransformSpecTest.java b/processing/src/test/java/org/apache/druid/segment/transform/TransformSpecTest.java similarity index 98% rename from server/src/test/java/org/apache/druid/segment/indexing/TransformSpecTest.java rename to processing/src/test/java/org/apache/druid/segment/transform/TransformSpecTest.java index 8532ff03909f..aadf39f1900d 100644 --- a/server/src/test/java/org/apache/druid/segment/indexing/TransformSpecTest.java +++ b/processing/src/test/java/org/apache/druid/segment/transform/TransformSpecTest.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.druid.segment.indexing; +package org.apache.druid.segment.transform; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.ImmutableList; @@ -35,8 +35,6 @@ import org.apache.druid.query.filter.AndDimFilter; import org.apache.druid.query.filter.SelectorDimFilter; import org.apache.druid.segment.TestHelper; -import org.apache.druid.segment.transform.ExpressionTransform; -import org.apache.druid.segment.transform.TransformSpec; import org.junit.Assert; import org.junit.Test; From df7342752157ad3581d70d0721ea1c265fd7c835 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Tue, 25 Aug 2020 00:34:07 -0700 Subject: [PATCH 05/24] Add druid.indexer.task.ignoreTimestampSpecForDruidInputSource setting. 
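The patch below also gives `ColumnsFilter` a `plus` method. A sketch of its semantics, mirroring the new unit tests (the demo class is hypothetical):

```java
import com.google.common.collect.ImmutableSet;
import org.apache.druid.data.input.ColumnsFilter;

public class PlusSketch
{
  public static void main(String[] args)
  {
    // Inclusion-based: plus() grows the allow-list.
    final ColumnsFilter inc = ColumnsFilter.inclusionBased(ImmutableSet.of("b", "c")).plus("a");
    System.out.println(inc.apply("a")); // true: "a" is now included
    System.out.println(inc.apply("z")); // false: still not included

    // Exclusion-based: plus() shrinks the deny-list.
    final ColumnsFilter exc = ColumnsFilter.exclusionBased(ImmutableSet.of("b", "c")).plus("c");
    System.out.println(exc.apply("c")); // true: "c" is no longer excluded
    System.out.println(exc.apply("b")); // false: still excluded
  }
}
```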
--- .../druid/data/input/ColumnsFilter.java | 31 ++++++++++++ .../data/input/impl/ColumnsFilterTest.java | 18 +++++++ docs/configuration/index.md | 2 + docs/ingestion/native-batch.md | 5 ++ .../indexing/common/config/TaskConfig.java | 13 ++++- .../indexing/common/task/CompactionTask.java | 3 +- .../indexing/input/DruidInputSource.java | 47 +++++++++++++++++-- .../docker/environment-configs/middlemanager | 4 ++ ...arallel_druid_input_source_index_task.json | 4 +- 9 files changed, 120 insertions(+), 7 deletions(-) diff --git a/core/src/main/java/org/apache/druid/data/input/ColumnsFilter.java b/core/src/main/java/org/apache/druid/data/input/ColumnsFilter.java index 8506ffb735f9..554f3ccf1b55 100644 --- a/core/src/main/java/org/apache/druid/data/input/ColumnsFilter.java +++ b/core/src/main/java/org/apache/druid/data/input/ColumnsFilter.java @@ -20,6 +20,7 @@ package org.apache.druid.data.input; import java.util.Collections; +import java.util.HashSet; import java.util.Objects; import java.util.Set; @@ -64,6 +65,12 @@ public static ColumnsFilter exclusionBased(final Set exclusions) */ public abstract boolean apply(String column); + /** + * Returns a new filter with a particular column added. The returned filter will return true from {@link #apply} + * on this column. + */ + public abstract ColumnsFilter plus(final String column); + public static class InclusionBased extends ColumnsFilter { private final Set inclusions; @@ -79,6 +86,18 @@ public boolean apply(String column) return inclusions.contains(column); } + @Override + public ColumnsFilter plus(String column) + { + if (inclusions.contains(column)) { + return this; + } else { + final Set copy = new HashSet<>(inclusions); + copy.add(column); + return new InclusionBased(copy); + } + } + @Override public boolean equals(Object o) { @@ -122,6 +141,18 @@ public boolean apply(String column) return !exclusions.contains(column); } + @Override + public ColumnsFilter plus(String column) + { + if (!exclusions.contains(column)) { + return this; + } else { + final Set copy = new HashSet<>(exclusions); + copy.remove(column); + return new ExclusionBased(copy); + } + } + @Override public boolean equals(Object o) { diff --git a/core/src/test/java/org/apache/druid/data/input/impl/ColumnsFilterTest.java b/core/src/test/java/org/apache/druid/data/input/impl/ColumnsFilterTest.java index d85e9278de66..00faf4ea5324 100644 --- a/core/src/test/java/org/apache/druid/data/input/impl/ColumnsFilterTest.java +++ b/core/src/test/java/org/apache/druid/data/input/impl/ColumnsFilterTest.java @@ -51,6 +51,15 @@ public void testInclusionBased() ); } + @Test + public void testInclusionBasedPlus() + { + Assert.assertEquals( + ColumnsFilter.inclusionBased(ImmutableSet.of("a", "b", "c")), + ColumnsFilter.inclusionBased(ImmutableSet.of("b", "c")).plus("a").plus("c") + ); + } + @Test public void testExclusionBased() { @@ -60,6 +69,15 @@ public void testExclusionBased() ); } + @Test + public void testExclusionBasedPlus() + { + Assert.assertEquals( + ColumnsFilter.exclusionBased(ImmutableSet.of("b")), + ColumnsFilter.exclusionBased(ImmutableSet.of("b", "c")).plus("a").plus("c") + ); + } + @Test public void testEquals() { diff --git a/docs/configuration/index.md b/docs/configuration/index.md index b02c467141a1..ee782de731c8 100644 --- a/docs/configuration/index.md +++ b/docs/configuration/index.md @@ -1228,6 +1228,7 @@ Additional peon configs include: |`druid.indexer.task.gracefulShutdownTimeout`|Wait this long on middleManager restart for restorable tasks to gracefully exit.|PT5M| 
|`druid.indexer.task.hadoopWorkingPath`|Temporary working directory for Hadoop tasks.|`/tmp/druid-indexing`| |`druid.indexer.task.restoreTasksOnRestart`|If true, MiddleManagers will attempt to stop tasks gracefully on shutdown and restore them on restart.|false| +|`druid.indexer.task.ignoreTimestampSpecForDruidInputSource`|If true, tasks using the [Druid input source](../ingestion/native-batch.md#druid-input-source) will ignore the provided timestampSpec, and will use the `__time` column of the input datasource. This option is provided for compatibility with ingestion specs written before Druid 0.20.0.|false| |`druid.indexer.server.maxChatRequests`|Maximum number of concurrent requests served by a task's chat handler. Set to 0 to disable limiting.|0| If the peon is running in remote mode, there must be an Overlord up and running. Peons in remote mode can set the following configurations: @@ -1292,6 +1293,7 @@ then the value from the configuration below is used: |`druid.indexer.task.gracefulShutdownTimeout`|Wait this long on Indexer restart for restorable tasks to gracefully exit.|PT5M| |`druid.indexer.task.hadoopWorkingPath`|Temporary working directory for Hadoop tasks.|`/tmp/druid-indexing`| |`druid.indexer.task.restoreTasksOnRestart`|If true, the Indexer will attempt to stop tasks gracefully on shutdown and restore them on restart.|false| +|`druid.indexer.task.ignoreTimestampSpecForDruidInputSource`|If true, tasks using the [Druid input source](../ingestion/native-batch.md#druid-input-source) will ignore the provided timestampSpec, and will use the `__time` column of the input datasource. This option is provided for compatibility with ingestion specs written before Druid 0.20.0.|false| |`druid.peon.taskActionClient.retry.minWait`|The minimum retry time to communicate with Overlord.|PT5S| |`druid.peon.taskActionClient.retry.maxWait`|The maximum retry time to communicate with Overlord.|PT1M| |`druid.peon.taskActionClient.retry.maxRetryCount`|The maximum number of retries to communicate with Overlord.|60| diff --git a/docs/ingestion/native-batch.md b/docs/ingestion/native-batch.md index de5e03f4a63c..c833c15e0eb1 100644 --- a/docs/ingestion/native-batch.md +++ b/docs/ingestion/native-batch.md @@ -1329,6 +1329,11 @@ rolled-up datasource `wikipedia_rollup` by grouping on hour, "countryName", and } ``` +> Note: Older versions (0.19 and earlier) did not respect the timestampSpec when using the Druid input source. If you +> have ingestion specs that rely on this and cannot rewrite them, set +> [`druid.indexer.task.ignoreTimestampSpecForDruidInputSource`](../configuration/index.md#indexer-general-configuration) +> to `true` to enable a compatibility mode where the timestampSpec is ignored. + ### SQL Input Source The SQL input source is used to read data directly from RDBMS. 
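For reference, the recommended `{"column": "__time", "format": "millis"}` spec corresponds to the following Java construction, matching the `TimestampSpec` usage elsewhere in this patch (the demo class is hypothetical):

```java
import org.apache.druid.data.input.impl.TimestampSpec;
import org.apache.druid.segment.column.ColumnHolder;

public class RecommendedTimestampSpec
{
  public static void main(String[] args)
  {
    // Read the input datasource's __time column as milliseconds since the
    // epoch; ColumnHolder.TIME_COLUMN_NAME is "__time".
    final TimestampSpec spec = new TimestampSpec(ColumnHolder.TIME_COLUMN_NAME, "millis", null);
    System.out.println(spec.getTimestampColumn() + " / " + spec.getTimestampFormat());
  }
}
```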
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/config/TaskConfig.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/config/TaskConfig.java index 7c22dad5b62b..bf887e500e6e 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/config/TaskConfig.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/config/TaskConfig.java @@ -67,6 +67,9 @@ public class TaskConfig @JsonProperty private final List shuffleDataLocations; + @JsonProperty + private final boolean ignoreTimestampSpecForDruidInputSource; + @JsonCreator public TaskConfig( @JsonProperty("baseDir") String baseDir, @@ -77,7 +80,8 @@ public TaskConfig( @JsonProperty("restoreTasksOnRestart") boolean restoreTasksOnRestart, @JsonProperty("gracefulShutdownTimeout") Period gracefulShutdownTimeout, @JsonProperty("directoryLockTimeout") Period directoryLockTimeout, - @JsonProperty("shuffleDataLocations") List shuffleDataLocations + @JsonProperty("shuffleDataLocations") List shuffleDataLocations, + @JsonProperty("ignoreTimestampSpecForDruidInputSource") boolean ignoreTimestampSpecForDruidInputSource ) { this.baseDir = baseDir == null ? System.getProperty("java.io.tmpdir") : baseDir; @@ -102,6 +106,7 @@ public TaskConfig( } else { this.shuffleDataLocations = shuffleDataLocations; } + this.ignoreTimestampSpecForDruidInputSource = ignoreTimestampSpecForDruidInputSource; } @JsonProperty @@ -178,6 +183,12 @@ public List getShuffleDataLocations() return shuffleDataLocations; } + @JsonProperty + public boolean isIgnoreTimestampSpecForDruidInputSource() + { + return ignoreTimestampSpecForDruidInputSource; + } + private String defaultDir(@Nullable String configParameter, final String defaultVal) { if (configParameter == null) { diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CompactionTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CompactionTask.java index d7883b94d627..73cb12b2e2f4 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CompactionTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/CompactionTask.java @@ -625,7 +625,8 @@ private static ParallelIndexIOConfig createIoConfig( toolbox.getIndexIO(), coordinatorClient, segmentLoaderFactory, - retryPolicyFactory + retryPolicyFactory, + toolbox.getConfig() ), null, false diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java index 6119b37d2e77..377d224fd5e1 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java @@ -26,6 +26,7 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; import com.google.common.collect.FluentIterable; +import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterators; import org.apache.druid.client.coordinator.CoordinatorClient; import org.apache.druid.data.input.AbstractInputSource; @@ -39,9 +40,11 @@ import org.apache.druid.data.input.SplitHintSpec; import org.apache.druid.data.input.impl.InputEntityIteratingReader; import org.apache.druid.data.input.impl.SplittableInputSource; +import org.apache.druid.data.input.impl.TimestampSpec; import org.apache.druid.indexing.common.RetryPolicy; import 
org.apache.druid.indexing.common.RetryPolicyFactory; import org.apache.druid.indexing.common.SegmentLoaderFactory; +import org.apache.druid.indexing.common.config.TaskConfig; import org.apache.druid.indexing.firehose.WindowedSegmentId; import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.ISE; @@ -49,6 +52,7 @@ import org.apache.druid.java.util.common.logger.Logger; import org.apache.druid.query.filter.DimFilter; import org.apache.druid.segment.IndexIO; +import org.apache.druid.segment.column.ColumnHolder; import org.apache.druid.segment.loading.SegmentLoader; import org.apache.druid.timeline.DataSegment; import org.apache.druid.timeline.TimelineObjectHolder; @@ -68,6 +72,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; import java.util.concurrent.ThreadLocalRandom; @@ -82,6 +87,11 @@ public class DruidInputSource extends AbstractInputSource implements SplittableI { private static final Logger LOG = new Logger(DruidInputSource.class); + /** + * Timestamp formats that the standard __time column can be parsed with. + */ + private static final Set STANDARD_TIME_COLUMN_FORMATS = ImmutableSet.of("millis", "__time"); + private final String dataSource; // Exactly one of interval and segmentIds should be non-null. Typically 'interval' is specified directly // by the user creating this firehose and 'segmentIds' is used for sub-tasks if it is split for parallel @@ -95,6 +105,7 @@ public class DruidInputSource extends AbstractInputSource implements SplittableI private final CoordinatorClient coordinatorClient; private final SegmentLoaderFactory segmentLoaderFactory; private final RetryPolicyFactory retryPolicyFactory; + private final TaskConfig taskConfig; /** * Included for serde backwards-compatibility only. Not used. @@ -119,7 +130,8 @@ public DruidInputSource( @JacksonInject IndexIO indexIO, @JacksonInject CoordinatorClient coordinatorClient, @JacksonInject SegmentLoaderFactory segmentLoaderFactory, - @JacksonInject RetryPolicyFactory retryPolicyFactory + @JacksonInject RetryPolicyFactory retryPolicyFactory, + @JacksonInject TaskConfig taskConfig ) { Preconditions.checkNotNull(dataSource, "dataSource"); @@ -136,6 +148,7 @@ public DruidInputSource( this.coordinatorClient = Preconditions.checkNotNull(coordinatorClient, "null CoordinatorClient"); this.segmentLoaderFactory = Preconditions.checkNotNull(segmentLoaderFactory, "null SegmentLoaderFactory"); this.retryPolicyFactory = Preconditions.checkNotNull(retryPolicyFactory, "null RetryPolicyFactory"); + this.taskConfig = Preconditions.checkNotNull(taskConfig, "null taskConfig"); } @JsonProperty @@ -206,8 +219,35 @@ protected InputSourceReader fixedFormatReader(InputRowSchema inputRowSchema, @Nu final DruidSegmentInputFormat inputFormat = new DruidSegmentInputFormat(indexIO, dimFilter); + final InputRowSchema inputRowSchemaToUse; + + if (taskConfig.isIgnoreTimestampSpecForDruidInputSource()) { + // Legacy compatibility mode; see https://github.com/apache/druid/pull/10267. + LOG.warn("Ignoring the provided timestampSpec and reading the __time column instead. 
To use timestampSpecs with " + + "the 'druid' input source, set druid.indexer.task.ignoreTimestampSpecForDruidInputSource to false."); + + inputRowSchemaToUse = new InputRowSchema( + new TimestampSpec(ColumnHolder.TIME_COLUMN_NAME, "millis", null), + inputRowSchema.getDimensionsSpec(), + inputRowSchema.getColumnsFilter().plus(ColumnHolder.TIME_COLUMN_NAME) + ); + } else { + inputRowSchemaToUse = inputRowSchema; + } + + if (ColumnHolder.TIME_COLUMN_NAME.equals(inputRowSchemaToUse.getTimestampSpec().getTimestampColumn()) + && !STANDARD_TIME_COLUMN_FORMATS.contains(inputRowSchemaToUse.getTimestampSpec().getTimestampFormat())) { + // Slight chance the user did this intentionally, but not likely. Log a warning. + LOG.warn( + "The provided timestampSpec refers to the %s column without using format %s. If you wanted to read the " + + "column as-is, switch formats.", + inputRowSchemaToUse.getTimestampSpec().getTimestampColumn(), + STANDARD_TIME_COLUMN_FORMATS + ); + } + return new InputEntityIteratingReader( - inputRowSchema, + inputRowSchemaToUse, inputFormat, entityIterator, temporaryDirectory @@ -279,7 +319,8 @@ public SplittableInputSource> withSplit(InputSplit Date: Tue, 25 Aug 2020 00:39:44 -0700 Subject: [PATCH 06/24] Fix. --- .../org/apache/druid/indexing/input/DruidInputSource.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java index 377d224fd5e1..ff78d79683f6 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java @@ -26,7 +26,7 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; import com.google.common.collect.FluentIterable; -import com.google.common.collect.ImmutableSet; +import com.google.common.collect.ImmutableSortedSet; import com.google.common.collect.Iterators; import org.apache.druid.client.coordinator.CoordinatorClient; import org.apache.druid.data.input.AbstractInputSource; @@ -90,7 +90,7 @@ public class DruidInputSource extends AbstractInputSource implements SplittableI /** * Timestamp formats that the standard __time column can be parsed with. */ - private static final Set STANDARD_TIME_COLUMN_FORMATS = ImmutableSet.of("millis", "__time"); + private static final Set STANDARD_TIME_COLUMN_FORMATS = ImmutableSortedSet.of("auto", "millis"); private final String dataSource; // Exactly one of interval and segmentIds should be non-null. Typically 'interval' is specified directly @@ -227,7 +227,7 @@ protected InputSourceReader fixedFormatReader(InputRowSchema inputRowSchema, @Nu + "the 'druid' input source, set druid.indexer.task.ignoreTimestampSpecForDruidInputSource to false."); inputRowSchemaToUse = new InputRowSchema( - new TimestampSpec(ColumnHolder.TIME_COLUMN_NAME, "millis", null), + new TimestampSpec(ColumnHolder.TIME_COLUMN_NAME, STANDARD_TIME_COLUMN_FORMATS.iterator().next(), null), inputRowSchema.getDimensionsSpec(), inputRowSchema.getColumnsFilter().plus(ColumnHolder.TIME_COLUMN_NAME) ); From cf68ace26924bb910038b57e4dd4d1d0b064e7de Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Tue, 25 Aug 2020 01:24:24 -0700 Subject: [PATCH 07/24] Fix build. 
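The test changes below are mechanical: every `TaskConfig` constructor call gains a trailing boolean for the new flag. For orientation, one representative call follows; the comment labels on the middle arguments are a best-effort reading of the constructor and should be treated as assumptions:

```java
import org.apache.druid.indexing.common.config.TaskConfig;

public class TaskConfigAritySketch
{
  public static void main(String[] args)
  {
    // Same shape as the test diffs below: nine existing arguments plus the
    // new trailing ignoreTimestampSpecForDruidInputSource flag.
    final TaskConfig taskConfig = new TaskConfig(
        "/tmp/task",  // baseDir
        null,         // baseTaskDir (label assumed)
        null,         // hadoopWorkingPath (label assumed)
        50000,        // defaultRowFlushBoundary (label assumed)
        null,         // defaultHadoopCoordinates (label assumed)
        false,        // restoreTasksOnRestart
        null,         // gracefulShutdownTimeout
        null,         // directoryLockTimeout
        null,         // shuffleDataLocations
        false         // ignoreTimestampSpecForDruidInputSource (new)
    );
    System.out.println(taskConfig.isIgnoreTimestampSpecForDruidInputSource());
  }
}
```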
--- .../druid/indexing/kafka/KafkaIndexTaskTest.java | 3 ++- .../indexing/kinesis/KinesisIndexTaskTest.java | 3 ++- .../druid/indexing/common/TaskToolboxTest.java | 2 +- .../AppenderatorDriverRealtimeIndexTaskTest.java | 13 ++++++++++++- .../druid/indexing/common/task/HadoopTaskTest.java | 3 ++- .../indexing/common/task/RealtimeIndexTaskTest.java | 13 ++++++++++++- .../AbstractParallelIndexSupervisorTaskTest.java | 3 ++- .../overlord/SingleTaskBackgroundRunnerTest.java | 3 ++- .../druid/indexing/overlord/TaskLifecycleTest.java | 2 +- .../IntermediaryDataManagerAutoCleanupTest.java | 3 ++- ...termediaryDataManagerManualAddAndDeleteTest.java | 3 ++- .../worker/ShuffleDataSegmentPusherTest.java | 3 ++- .../indexing/worker/WorkerTaskManagerTest.java | 3 ++- .../indexing/worker/WorkerTaskMonitorTest.java | 3 ++- 14 files changed, 46 insertions(+), 14 deletions(-) diff --git a/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/KafkaIndexTaskTest.java b/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/KafkaIndexTaskTest.java index c257df296790..e158801b4a99 100644 --- a/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/KafkaIndexTaskTest.java +++ b/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/KafkaIndexTaskTest.java @@ -2600,7 +2600,8 @@ private void makeToolboxFactory() throws IOException true, null, null, - null + null, + false ); final TestDerbyConnector derbyConnector = derby.getConnector(); derbyConnector.createDataSourceTable(); diff --git a/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/KinesisIndexTaskTest.java b/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/KinesisIndexTaskTest.java index c893c9c738ee..33bfb769427f 100644 --- a/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/KinesisIndexTaskTest.java +++ b/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/KinesisIndexTaskTest.java @@ -2859,7 +2859,8 @@ private void makeToolboxFactory() throws IOException true, null, null, - null + null, + false ); final TestDerbyConnector derbyConnector = derby.getConnector(); derbyConnector.createDataSourceTable(); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/TaskToolboxTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/TaskToolboxTest.java index e99e4287441f..d84eb46d6777 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/TaskToolboxTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/TaskToolboxTest.java @@ -97,7 +97,7 @@ public void setUp() throws IOException EasyMock.replay(task, mockHandoffNotifierFactory); taskToolbox = new TaskToolboxFactory( - new TaskConfig(temporaryFolder.newFile().toString(), null, null, 50000, null, false, null, null, null), + new TaskConfig(temporaryFolder.newFile().toString(), null, null, 50000, null, false, null, null, null, false), new DruidNode("druid/middlemanager", "localhost", false, 8091, null, true, false), mockTaskActionClientFactory, mockEmitter, diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/AppenderatorDriverRealtimeIndexTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/AppenderatorDriverRealtimeIndexTaskTest.java index 576235ee4f38..93884de6cdbc 100644 --- 
a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/AppenderatorDriverRealtimeIndexTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/AppenderatorDriverRealtimeIndexTaskTest.java @@ -1514,7 +1514,18 @@ public SegmentPublishResult announceHistoricalSegments( }; taskLockbox = new TaskLockbox(taskStorage, mdc); - final TaskConfig taskConfig = new TaskConfig(directory.getPath(), null, null, 50000, null, true, null, null, null); + final TaskConfig taskConfig = new TaskConfig( + directory.getPath(), + null, + null, + 50000, + null, + true, + null, + null, + null, + false + ); final TaskActionToolbox taskActionToolbox = new TaskActionToolbox( taskLockbox, diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/HadoopTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/HadoopTaskTest.java index 990888e5a96c..caaeea253c73 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/HadoopTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/HadoopTaskTest.java @@ -116,7 +116,8 @@ public TaskStatus runTask(TaskToolbox toolbox) false, null, null, - null + null, + false )).once(); EasyMock.replay(toolbox); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RealtimeIndexTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RealtimeIndexTaskTest.java index 432457674635..6b18ce0a145a 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RealtimeIndexTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/RealtimeIndexTaskTest.java @@ -885,7 +885,18 @@ private TaskToolbox makeToolbox( final File directory ) { - final TaskConfig taskConfig = new TaskConfig(directory.getPath(), null, null, 50000, null, true, null, null, null); + final TaskConfig taskConfig = new TaskConfig( + directory.getPath(), + null, + null, + 50000, + null, + true, + null, + null, + null, + false + ); final TaskLockbox taskLockbox = new TaskLockbox(taskStorage, mdc); try { taskStorage.insert(task, TaskStatus.running(task.getId())); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/AbstractParallelIndexSupervisorTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/AbstractParallelIndexSupervisorTaskTest.java index 3ba9441b2f3f..d7f111f430ea 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/AbstractParallelIndexSupervisorTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/AbstractParallelIndexSupervisorTaskTest.java @@ -203,7 +203,8 @@ public void setUpAbstractParallelIndexSupervisorTaskTest() throws IOException false, null, null, - ImmutableList.of(new StorageLocationConfig(temporaryFolder.newFolder(), null, null)) + ImmutableList.of(new StorageLocationConfig(temporaryFolder.newFolder(), null, null)), + false ), null ); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/SingleTaskBackgroundRunnerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/SingleTaskBackgroundRunnerTest.java index 4cfa87d8a360..826c09e606e7 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/SingleTaskBackgroundRunnerTest.java +++ 
b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/SingleTaskBackgroundRunnerTest.java @@ -78,7 +78,8 @@ public void setup() throws IOException true, null, null, - null + null, + false ); final ServiceEmitter emitter = new NoopServiceEmitter(); final TaskToolboxFactory toolboxFactory = new TaskToolboxFactory( diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/TaskLifecycleTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/TaskLifecycleTest.java index 2bdc0b7e6107..020f0063b84c 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/TaskLifecycleTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/TaskLifecycleTest.java @@ -594,7 +594,7 @@ private TaskToolboxFactory setUpTaskToolboxFactory( new TaskAuditLogConfig(true) ); File tmpDir = temporaryFolder.newFolder(); - taskConfig = new TaskConfig(tmpDir.toString(), null, null, 50000, null, false, null, null, null); + taskConfig = new TaskConfig(tmpDir.toString(), null, null, 50000, null, false, null, null, null, false); SegmentLoaderConfig segmentLoaderConfig = new SegmentLoaderConfig() { diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/worker/IntermediaryDataManagerAutoCleanupTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/worker/IntermediaryDataManagerAutoCleanupTest.java index 7d0233b6b16d..3c59e943d464 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/worker/IntermediaryDataManagerAutoCleanupTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/worker/IntermediaryDataManagerAutoCleanupTest.java @@ -87,7 +87,8 @@ public Period getIntermediaryPartitionTimeout() false, null, null, - ImmutableList.of(new StorageLocationConfig(tempDir.newFolder(), null, null)) + ImmutableList.of(new StorageLocationConfig(tempDir.newFolder(), null, null)), + false ); final IndexingServiceClient indexingServiceClient = new NoopIndexingServiceClient() { diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/worker/IntermediaryDataManagerManualAddAndDeleteTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/worker/IntermediaryDataManagerManualAddAndDeleteTest.java index 15aad92b6a3c..fe6d615714fd 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/worker/IntermediaryDataManagerManualAddAndDeleteTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/worker/IntermediaryDataManagerManualAddAndDeleteTest.java @@ -70,7 +70,8 @@ public void setup() throws IOException false, null, null, - ImmutableList.of(new StorageLocationConfig(intermediarySegmentsLocation, 600L, null)) + ImmutableList.of(new StorageLocationConfig(intermediarySegmentsLocation, 600L, null)), + false ); final IndexingServiceClient indexingServiceClient = new NoopIndexingServiceClient(); intermediaryDataManager = new IntermediaryDataManager(workerConfig, taskConfig, indexingServiceClient); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/worker/ShuffleDataSegmentPusherTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/worker/ShuffleDataSegmentPusherTest.java index 153192633967..509fa39ab06b 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/worker/ShuffleDataSegmentPusherTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/worker/ShuffleDataSegmentPusherTest.java @@ -69,7 +69,8 @@ public void setup() throws IOException false, null, null, - ImmutableList.of(new 
StorageLocationConfig(temporaryFolder.newFolder(), null, null)) + ImmutableList.of(new StorageLocationConfig(temporaryFolder.newFolder(), null, null)), + false ); final IndexingServiceClient indexingServiceClient = new NoopIndexingServiceClient(); intermediaryDataManager = new IntermediaryDataManager(workerConfig, taskConfig, indexingServiceClient); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/worker/WorkerTaskManagerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/worker/WorkerTaskManagerTest.java index 3ae7d96e5da3..2da185bef93c 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/worker/WorkerTaskManagerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/worker/WorkerTaskManagerTest.java @@ -87,7 +87,8 @@ private WorkerTaskManager createWorkerTaskManager() false, null, null, - null + null, + false ); TaskActionClientFactory taskActionClientFactory = EasyMock.createNiceMock(TaskActionClientFactory.class); TaskActionClient taskActionClient = EasyMock.createNiceMock(TaskActionClient.class); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/worker/WorkerTaskMonitorTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/worker/WorkerTaskMonitorTest.java index 2fdca5f2c335..099755147ef7 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/worker/WorkerTaskMonitorTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/worker/WorkerTaskMonitorTest.java @@ -157,7 +157,8 @@ private WorkerTaskMonitor createTaskMonitor() false, null, null, - null + null, + false ); TaskActionClientFactory taskActionClientFactory = EasyMock.createNiceMock(TaskActionClientFactory.class); TaskActionClient taskActionClient = EasyMock.createNiceMock(TaskActionClient.class); From 6369cc0aa5e756c661ce9767262aa0fa255d9148 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Tue, 25 Aug 2020 11:18:44 -0700 Subject: [PATCH 08/24] Checkstyle. --- .../main/java/org/apache/druid/data/input/ColumnsFilter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/druid/data/input/ColumnsFilter.java b/core/src/main/java/org/apache/druid/data/input/ColumnsFilter.java index 554f3ccf1b55..b01001f8eec1 100644 --- a/core/src/main/java/org/apache/druid/data/input/ColumnsFilter.java +++ b/core/src/main/java/org/apache/druid/data/input/ColumnsFilter.java @@ -69,7 +69,7 @@ public static ColumnsFilter exclusionBased(final Set exclusions) * Returns a new filter with a particular column added. The returned filter will return true from {@link #apply} * on this column. */ - public abstract ColumnsFilter plus(final String column); + public abstract ColumnsFilter plus(String column); public static class InclusionBased extends ColumnsFilter { From 14efe00d0546ab06b026c21f7ce118d9b72f35e7 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Tue, 25 Aug 2020 14:54:28 -0700 Subject: [PATCH 09/24] Misc fixes. 
--- .../druid/indexing/common/task/CompactionTaskRunTest.java | 3 ++- .../apache/druid/indexing/common/task/CompactionTaskTest.java | 2 +- .../apache/druid/indexing/common/task/IngestionTestBase.java | 3 ++- .../parallel/AbstractParallelIndexSupervisorTaskTest.java | 2 +- website/.spelling | 1 + 5 files changed, 7 insertions(+), 4 deletions(-) diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskRunTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskRunTest.java index e08361450fb9..6301c15de212 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskRunTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskRunTest.java @@ -41,6 +41,7 @@ import org.apache.druid.indexing.common.SegmentLoaderFactory; import org.apache.druid.indexing.common.TaskToolbox; import org.apache.druid.indexing.common.TestUtils; +import org.apache.druid.indexing.common.config.TaskConfig; import org.apache.druid.indexing.common.stats.RowIngestionMetersFactory; import org.apache.druid.indexing.common.task.CompactionTask.Builder; import org.apache.druid.indexing.firehose.IngestSegmentFirehoseFactory; @@ -873,7 +874,7 @@ public List getLocations() ); return new TaskToolbox( - null, + new TaskConfig(null, null, null, null, null, false, null, null, null, false), null, createActionClient(task), null, diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskTest.java index 88850031ca99..984c464f80f4 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskTest.java @@ -1284,7 +1284,7 @@ private static class TestTaskToolbox extends TaskToolbox ) { super( - null, + new TaskConfig(null, null, null, null, null, false, null, null, null, false), null, taskActionClient, null, diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IngestionTestBase.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IngestionTestBase.java index 881c44d5f1c4..86c304476fcf 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IngestionTestBase.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IngestionTestBase.java @@ -32,6 +32,7 @@ import org.apache.druid.indexing.common.actions.SegmentTransactionalInsertAction; import org.apache.druid.indexing.common.actions.TaskAction; import org.apache.druid.indexing.common.actions.TaskActionToolbox; +import org.apache.druid.indexing.common.config.TaskConfig; import org.apache.druid.indexing.common.config.TaskStorageConfig; import org.apache.druid.indexing.common.stats.RowIngestionMetersFactory; import org.apache.druid.indexing.overlord.HeapMemoryTaskStorage; @@ -292,7 +293,7 @@ public ListenableFuture run(Task task) ); final TaskToolbox box = new TaskToolbox( - null, + new TaskConfig(null, null, null, null, null, false, null, null, null, false), new DruidNode("druid/middlemanager", "localhost", false, 8091, null, true, false), taskActionClient, null, diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/AbstractParallelIndexSupervisorTaskTest.java 
b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/AbstractParallelIndexSupervisorTaskTest.java index d7f111f430ea..ac287660fe1f 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/AbstractParallelIndexSupervisorTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/AbstractParallelIndexSupervisorTaskTest.java @@ -570,7 +570,7 @@ public static void prepareObjectMapper( protected TaskToolbox createTaskToolbox(Task task, TaskActionClient actionClient) throws IOException { return new TaskToolbox( - null, + new TaskConfig(null, null, null, null, null, false, null, null, null, false), new DruidNode("druid/middlemanager", "localhost", false, 8091, null, true, false), actionClient, null, diff --git a/website/.spelling b/website/.spelling index dd0ad828d5db..888d26c9332f 100644 --- a/website/.spelling +++ b/website/.spelling @@ -1724,6 +1724,7 @@ successfulSending taskBlackListCleanupPeriod tasklogs timeBoundary +timestampSpec tmp tmpfs truststore From 7c6cf83327feef6deea1369f630b519a26f47d7f Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Thu, 27 Aug 2020 15:32:56 -0700 Subject: [PATCH 10/24] Fix test. --- .../parallel/AbstractParallelIndexSupervisorTaskTest.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/AbstractParallelIndexSupervisorTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/AbstractParallelIndexSupervisorTaskTest.java index 2140ee954592..b232fee9c314 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/AbstractParallelIndexSupervisorTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/AbstractParallelIndexSupervisorTaskTest.java @@ -513,6 +513,8 @@ public Set getPublishedSegments(Task task) public void prepareObjectMapper(ObjectMapper objectMapper, IndexIO indexIO) { + final TaskConfig taskConfig = new TaskConfig(null, null, null, null, null, false, null, null, null, false); + objectMapper.setInjectableValues( new InjectableValues.Std() .addValue(ExprMacroTable.class, LookupEnabledTestExprMacroTable.INSTANCE) @@ -529,6 +531,7 @@ public void prepareObjectMapper(ObjectMapper objectMapper, IndexIO indexIO) .addValue(CoordinatorClient.class, coordinatorClient) .addValue(SegmentLoaderFactory.class, new SegmentLoaderFactory(indexIO, objectMapper)) .addValue(RetryPolicyFactory.class, new RetryPolicyFactory(new RetryPolicyConfig())) + .addValue(TaskConfig.class, taskConfig) ); objectMapper.registerSubtypes( new NamedType(ParallelIndexSupervisorTask.class, ParallelIndexSupervisorTask.TYPE), From 530eb3280a04fa9db720e8d07e69126cd1583d22 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Sat, 29 Aug 2020 14:21:59 -0700 Subject: [PATCH 11/24] Move config. 
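The `InjectableValues` fix above is needed because `DruidInputSource` now receives its `TaskConfig` via `@JacksonInject`; deserializing a 'druid' input source spec in a test therefore requires registering an injectable value. A condensed, hypothetical sketch of that wiring:

```java
import com.fasterxml.jackson.databind.InjectableValues;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.druid.indexing.common.config.TaskConfig;

public class InjectableTaskConfigSketch
{
  public static void main(String[] args)
  {
    // Without this, deserializing a 'druid' input source fails, because
    // Jackson has no TaskConfig instance to satisfy the @JacksonInject
    // constructor parameter added in PATCH 05.
    final ObjectMapper mapper = new ObjectMapper();
    mapper.setInjectableValues(
        new InjectableValues.Std()
            .addValue(
                TaskConfig.class,
                new TaskConfig(null, null, null, null, null, false, null, null, null, false)
            )
    );
  }
}
```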
--- integration-tests/docker/environment-configs/common | 6 +++++- integration-tests/docker/environment-configs/middlemanager | 4 ---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/integration-tests/docker/environment-configs/common b/integration-tests/docker/environment-configs/common index aba937b9f541..26b36ab45259 100644 --- a/integration-tests/docker/environment-configs/common +++ b/integration-tests/docker/environment-configs/common @@ -66,4 +66,8 @@ druid_zk_service_host=druid-zookeeper-kafka druid_auth_basic_common_maxSyncRetries=20 druid_indexer_logs_directory=/shared/tasklogs druid_sql_enable=true -druid_extensions_hadoopDependenciesDir=/shared/hadoop-dependencies \ No newline at end of file +druid_extensions_hadoopDependenciesDir=/shared/hadoop-dependencies + +# Testing the legacy config from https://github.com/apache/druid/pull/10267 +# Can remove this when the flag is no longer needed +druid_indexer_task_ignoreTimestampSpecForDruidInputSource=true diff --git a/integration-tests/docker/environment-configs/middlemanager b/integration-tests/docker/environment-configs/middlemanager index 1888d7a17751..c92cfd783caa 100644 --- a/integration-tests/docker/environment-configs/middlemanager +++ b/integration-tests/docker/environment-configs/middlemanager @@ -37,7 +37,3 @@ druid_auth_basic_common_cacheDirectory=/tmp/authCache/middleManager druid_startup_logging_logProperties=true druid_server_https_crlPath=/tls/revocations.crl druid_worker_capacity=20 - -# Testing the legacy config from https://github.com/apache/druid/pull/10267 -# Can remove this when the flag is no longer needed -druid_indexer_task_ignoreTimestampSpecForDruidInputSource=true From 94f293017b51c2ff2fa03bcdb1f794df199f3078 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Tue, 22 Sep 2020 21:27:22 -0700 Subject: [PATCH 12/24] Fix imports. --- .../apache/druid/indexing/common/task/InputSourceProcessor.java | 1 - 1 file changed, 1 deletion(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/InputSourceProcessor.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/InputSourceProcessor.java index 1cc3388fefd9..63ebba8daf7c 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/InputSourceProcessor.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/InputSourceProcessor.java @@ -28,7 +28,6 @@ import org.apache.druid.indexer.partitions.DynamicPartitionsSpec; import org.apache.druid.indexer.partitions.PartitionsSpec; import org.apache.druid.indexing.common.task.batch.parallel.iterator.IndexTaskInputRowIteratorBuilder; -import org.apache.druid.indexing.input.InputRowSchemas; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.logger.Logger; import org.apache.druid.java.util.common.parsers.CloseableIterator; From 34d47920168624071c5e8bf17a5bab3ce3ec4b45 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Tue, 22 Sep 2020 22:20:28 -0700 Subject: [PATCH 13/24] Fixup. 
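The fixup below routes `DruidSegmentReader` through the `InputRowSchema`-aware `MapInputRowParser.parse` entry point. A hypothetical sketch of that call, with an invented intermediate row (the real reader builds it from a segment's columns):

```java
import com.google.common.collect.ImmutableMap;
import org.apache.druid.data.input.ColumnsFilter;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.InputRowSchema;
import org.apache.druid.data.input.impl.DimensionsSpec;
import org.apache.druid.data.input.impl.MapInputRowParser;
import org.apache.druid.data.input.impl.TimestampSpec;

public class ParseSketch
{
  public static void main(String[] args)
  {
    final InputRowSchema schema = new InputRowSchema(
        new TimestampSpec("__time", "millis", null),
        DimensionsSpec.EMPTY,
        ColumnsFilter.all()
    );

    // The intermediate row here is invented for illustration.
    final InputRow row = MapInputRowParser.parse(
        schema,
        ImmutableMap.of("__time", 1577836800000L, "page", "Main_Page")
    );
    System.out.println(row.getTimestamp());
  }
}
```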
--- .../druid/indexing/input/DruidSegmentReader.java | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidSegmentReader.java b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidSegmentReader.java index 3eb57b30597d..6e91b3a1a8fa 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidSegmentReader.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidSegmentReader.java @@ -28,6 +28,7 @@ import org.apache.druid.data.input.InputEntity; import org.apache.druid.data.input.InputEntity.CleanableFile; import org.apache.druid.data.input.InputRow; +import org.apache.druid.data.input.InputRowSchema; import org.apache.druid.data.input.IntermediateRowParsingReader; import org.apache.druid.data.input.impl.DimensionsSpec; import org.apache.druid.data.input.impl.MapInputRowParser; @@ -146,7 +147,16 @@ protected CloseableIterator> intermediateRowIterator() throw @Override protected List parseInputRows(Map intermediateRow) throws ParseException { - return Collections.singletonList(MapInputRowParser.parse(timestampSpec, dimensionsSpec, intermediateRow)); + return Collections.singletonList( + MapInputRowParser.parse( + new InputRowSchema( + timestampSpec, + dimensionsSpec, + columnsFilter + ), + intermediateRow + ) + ); } @Override From 96747f4cb0fede4f850a4517d97644788a2dd689 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Mon, 19 Oct 2020 16:31:09 -0700 Subject: [PATCH 14/24] Fix ShuffleResourceTest. --- .../druid/indexing/worker/shuffle/ShuffleResourceTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/worker/shuffle/ShuffleResourceTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/worker/shuffle/ShuffleResourceTest.java index bd1b2117042f..741956a53f7b 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/worker/shuffle/ShuffleResourceTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/worker/shuffle/ShuffleResourceTest.java @@ -95,7 +95,8 @@ public Period getIntermediaryPartitionTimeout() false, null, null, - ImmutableList.of(new StorageLocationConfig(tempDir.newFolder(), null, null)) + ImmutableList.of(new StorageLocationConfig(tempDir.newFolder(), null, null)), + false ); final IndexingServiceClient indexingServiceClient = new NoopIndexingServiceClient() { From 2e753d12e5dd3028a1c0ca3c21ec975b1f34b05f Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Mon, 19 Oct 2020 17:00:03 -0700 Subject: [PATCH 15/24] Add import. --- .../java/org/apache/druid/data/input/avro/AvroOCFReaderTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/avro/AvroOCFReaderTest.java b/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/avro/AvroOCFReaderTest.java index ed039072dfcc..4841483c4e1a 100644 --- a/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/avro/AvroOCFReaderTest.java +++ b/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/avro/AvroOCFReaderTest.java @@ -45,6 +45,7 @@ import java.io.File; import java.io.IOException; +import java.util.List; import java.util.Map; public class AvroOCFReaderTest From be8a38950524036e25013e52088405fd3eaeb58e Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Fri, 4 Dec 2020 00:29:24 -0800 Subject: [PATCH 16/24] Smarter exclusions. 
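A worked example of the exclusion computation introduced below, using invented field names; plain JDK sets stand in for the real `DataSchema` logic:

```java
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class ExclusionsSketch
{
  public static void main(String[] args)
  {
    // Suppose: timestamp column "ts", declared dimension "page", and one
    // aggregator longSum("added_sum", "added").
    final Set<String> inputFieldNames = new HashSet<>(Arrays.asList("ts", "page", "added"));
    final Set<String> outputFieldNames = new HashSet<>(Arrays.asList("__time", "page", "added_sum"));
    final Set<String> declaredDimensions = new HashSet<>(Arrays.asList("page"));

    // exclusions = union of inputs and outputs, minus declared dimensions.
    final Set<String> exclusions = new HashSet<>();
    exclusions.addAll(inputFieldNames);
    exclusions.addAll(outputFieldNames);
    exclusions.removeAll(declaredDimensions);

    System.out.println(exclusions); // ts, added, __time, added_sum (in some order)
  }
}
```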
--- .../overlord/sampler/InputSourceSampler.java | 1 - .../druid/segment/indexing/DataSchema.java | 67 +++++++++++-------- .../segment/indexing/DataSchemaTest.java | 6 +- 3 files changed, 42 insertions(+), 32 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java index b306b3b1195c..94a3e52e82b2 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java @@ -50,7 +50,6 @@ import javax.annotation.Nullable; import java.io.File; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; diff --git a/server/src/main/java/org/apache/druid/segment/indexing/DataSchema.java b/server/src/main/java/org/apache/druid/segment/indexing/DataSchema.java index 3fc0fa26b272..42b19157577f 100644 --- a/server/src/main/java/org/apache/druid/segment/indexing/DataSchema.java +++ b/server/src/main/java/org/apache/druid/segment/indexing/DataSchema.java @@ -27,8 +27,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; -import com.google.common.collect.Sets; import org.apache.druid.common.utils.IdUtils; +import org.apache.druid.data.input.impl.DimensionSchema; import org.apache.druid.data.input.impl.DimensionsSpec; import org.apache.druid.data.input.impl.InputRowParser; import org.apache.druid.data.input.impl.ParseSpec; @@ -36,6 +36,7 @@ import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.logger.Logger; import org.apache.druid.query.aggregation.AggregatorFactory; +import org.apache.druid.segment.column.ColumnHolder; import org.apache.druid.segment.indexing.granularity.GranularitySpec; import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec; import org.apache.druid.segment.transform.TransformSpec; @@ -45,8 +46,7 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; -import java.util.regex.Pattern; -import java.util.stream.Collectors; +import java.util.stream.Stream; /** @@ -55,7 +55,6 @@ public class DataSchema { private static final Logger log = new Logger(DataSchema.class); - private static final Pattern INVALIDCHARS = Pattern.compile("(?s).*[^\\S ].*"); private final String dataSource; private final AggregatorFactory[] aggregators; private final GranularitySpec granularitySpec; @@ -150,35 +149,47 @@ private static void validateDatasourceName(String dataSource) IdUtils.validateId("dataSource", dataSource); } + /** + * Computes the {@link DimensionsSpec} that we will actually use. It is derived from, but not necessarily identical + * to, the one that we were given. 
+ */ private static DimensionsSpec computeDimensionsSpec( - TimestampSpec timestampSpec, - DimensionsSpec dimensionsSpec, - AggregatorFactory[] aggregators + final TimestampSpec timestampSpec, + final DimensionsSpec dimensionsSpec, + final AggregatorFactory[] aggregators ) { - final Set<String> dimensionExclusions = new HashSet<>(); - - final String timestampColumn = timestampSpec.getTimestampColumn(); - if (!(dimensionsSpec.hasCustomDimensions() && dimensionsSpec.getDimensionNames().contains(timestampColumn))) { - dimensionExclusions.add(timestampColumn); - } - - for (AggregatorFactory aggregator : aggregators) { - dimensionExclusions.addAll(aggregator.requiredFields()); - dimensionExclusions.add(aggregator.getName()); - } + final Set<String> inputFieldNames = new HashSet<>(); + final Set<String> outputFieldNames = new HashSet<>(); + + // Populate inputFieldNames. + inputFieldNames.add(timestampSpec.getTimestampColumn()); + inputFieldNames.addAll(dimensionsSpec.getDimensionNames()); + Arrays.stream(aggregators) + .flatMap(aggregator -> aggregator.requiredFields().stream()) + .forEach(inputFieldNames::add); + + // Populate outputFieldNames, validating along the way for lack of duplicates. + outputFieldNames.add(ColumnHolder.TIME_COLUMN_NAME); + + Stream.concat( + dimensionsSpec.getDimensions().stream().map(DimensionSchema::getName), + Arrays.stream(aggregators).map(AggregatorFactory::getName) + ).forEach( + field -> { + if (!outputFieldNames.add(field)) { + throw new IAE("Cannot specify field [%s] more than once", field); + } + } + ); - final Set<String> metSet = Arrays.stream(aggregators).map(AggregatorFactory::getName).collect(Collectors.toSet()); - final Set<String> dimSet = new HashSet<>(dimensionsSpec.getDimensionNames()); - final Set<String> overlap = Sets.intersection(metSet, dimSet); - if (!overlap.isEmpty()) { - throw new IAE( - "Cannot have overlapping dimensions and metrics of the same name. Please change the name of the metric. Overlap: %s", - overlap - ); - } + // Set up additional exclusions: all inputs and outputs, minus defined dimensions.
+ final Set additionalDimensionExclusions = new HashSet<>(); + additionalDimensionExclusions.addAll(inputFieldNames); + additionalDimensionExclusions.addAll(outputFieldNames); + additionalDimensionExclusions.removeAll(dimensionsSpec.getDimensionNames()); - return dimensionsSpec.withDimensionExclusions(Sets.difference(dimensionExclusions, dimSet)); + return dimensionsSpec.withDimensionExclusions(additionalDimensionExclusions); } @JsonProperty diff --git a/server/src/test/java/org/apache/druid/segment/indexing/DataSchemaTest.java b/server/src/test/java/org/apache/druid/segment/indexing/DataSchemaTest.java index 13bf27d2c178..67bed5084b5f 100644 --- a/server/src/test/java/org/apache/druid/segment/indexing/DataSchemaTest.java +++ b/server/src/test/java/org/apache/druid/segment/indexing/DataSchemaTest.java @@ -97,7 +97,7 @@ public void testDefaultExclusions() ); Assert.assertEquals( - ImmutableSet.of("time", "col1", "col2", "metric1", "metric2"), + ImmutableSet.of("__time", "time", "col1", "col2", "metric1", "metric2"), schema.getDimensionsSpec().getDimensionExclusions() ); } @@ -135,7 +135,7 @@ public void testExplicitInclude() ); Assert.assertEquals( - ImmutableSet.of("dimC", "col1", "metric1", "metric2"), + ImmutableSet.of("__time", "dimC", "col1", "metric1", "metric2"), schema.getParser().getParseSpec().getDimensionsSpec().getDimensionExclusions() ); } @@ -405,7 +405,7 @@ public void testSerde() throws Exception actual.getParser().getParseSpec(), new JSONParseSpec( new TimestampSpec("xXx", null, null), - new DimensionsSpec(null, Arrays.asList("metric1", "xXx", "col1"), null), + new DimensionsSpec(null, Arrays.asList("__time", "metric1", "xXx", "col1"), null), null, null, null From 76ccfd3f65ef65d0f14d694581d7231c7246fa1b Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Sat, 5 Dec 2020 16:26:09 -0800 Subject: [PATCH 17/24] Fixes based on tests. Also, add TIME_COLUMN constant in the web console. --- .../indexing/input/DruidInputSource.java | 5 ++--- .../duty/ITAutoCompactionTest.java | 4 ++-- .../src/druid-models/timestamp-spec.tsx | 11 +++++++++-- web-console/src/utils/sampler.ts | 18 +++++++++++------- .../views/load-data-view/load-data-view.tsx | 19 +++++++++++-------- 5 files changed, 35 insertions(+), 22 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java index ff78d79683f6..f01f46dde965 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java @@ -26,7 +26,7 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; import com.google.common.collect.FluentIterable; -import com.google.common.collect.ImmutableSortedSet; +import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterators; import org.apache.druid.client.coordinator.CoordinatorClient; import org.apache.druid.data.input.AbstractInputSource; @@ -72,7 +72,6 @@ import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; import java.util.concurrent.ThreadLocalRandom; @@ -90,7 +89,7 @@ public class DruidInputSource extends AbstractInputSource implements SplittableI /** * Timestamp formats that the standard __time column can be parsed with. 
*/ - private static final Set STANDARD_TIME_COLUMN_FORMATS = ImmutableSortedSet.of("auto", "millis"); + private static final List STANDARD_TIME_COLUMN_FORMATS = ImmutableList.of("millis", "auto"); private final String dataSource; // Exactly one of interval and segmentIds should be non-null. Typically 'interval' is specified directly diff --git a/integration-tests/src/test/java/org/apache/druid/tests/coordinator/duty/ITAutoCompactionTest.java b/integration-tests/src/test/java/org/apache/druid/tests/coordinator/duty/ITAutoCompactionTest.java index 5d1d55ba7b63..5fbe36cef2ff 100644 --- a/integration-tests/src/test/java/org/apache/druid/tests/coordinator/duty/ITAutoCompactionTest.java +++ b/integration-tests/src/test/java/org/apache/druid/tests/coordinator/duty/ITAutoCompactionTest.java @@ -121,7 +121,7 @@ public void testAutoCompactionDutySubmitAndVerifyCompaction() throws Exception fullDatasourceName, AutoCompactionSnapshot.AutoCompactionScheduleStatus.RUNNING, 0, - 22489, + 22488, 0, 0, 3, @@ -267,7 +267,7 @@ public void testAutoCompactionDutyCanUpdateTaskSlots() throws Exception fullDatasourceName, AutoCompactionSnapshot.AutoCompactionScheduleStatus.RUNNING, 0, - 22489, + 22488, 0, 0, 3, diff --git a/web-console/src/druid-models/timestamp-spec.tsx b/web-console/src/druid-models/timestamp-spec.tsx index b6c595b17dab..f6a8263998fa 100644 --- a/web-console/src/druid-models/timestamp-spec.tsx +++ b/web-console/src/druid-models/timestamp-spec.tsx @@ -32,11 +32,18 @@ import { Transform } from './transform-spec'; const NO_SUCH_COLUMN = '!!!_no_such_column_!!!'; +export const TIME_COLUMN = '__time'; + export const PLACEHOLDER_TIMESTAMP_SPEC: TimestampSpec = { column: NO_SUCH_COLUMN, missingValue: '1970-01-01T00:00:00Z', }; +export const REINDEX_TIMESTAMP_SPEC: TimestampSpec = { + column: TIME_COLUMN, + format: 'millis', +}; + export const CONSTANT_TIMESTAMP_SPEC: TimestampSpec = { column: NO_SUCH_COLUMN, missingValue: '2010-01-01T00:00:00Z', @@ -48,7 +55,7 @@ export function getTimestampSchema(spec: IngestionSpec): TimestampSchema { const transforms: Transform[] = deepGet(spec, 'spec.dataSchema.transformSpec.transforms') || EMPTY_ARRAY; - const timeTransform = transforms.find(transform => transform.name === '__time'); + const timeTransform = transforms.find(transform => transform.name === TIME_COLUMN); if (timeTransform) return 'expression'; const timestampSpec = deepGet(spec, 'spec.dataSchema.timestampSpec') || EMPTY_OBJECT; @@ -74,7 +81,7 @@ export function getTimestampSpecExpressionFromSpec(spec: IngestionSpec): string const transforms: Transform[] = deepGet(spec, 'spec.dataSchema.transformSpec.transforms') || EMPTY_ARRAY; - const timeTransform = transforms.find(transform => transform.name === '__time'); + const timeTransform = transforms.find(transform => transform.name === TIME_COLUMN); if (!timeTransform) return; return timeTransform.expression; } diff --git a/web-console/src/utils/sampler.ts b/web-console/src/utils/sampler.ts index 5f8b97eb34e5..43fe5666df56 100644 --- a/web-console/src/utils/sampler.ts +++ b/web-console/src/utils/sampler.ts @@ -27,6 +27,8 @@ import { isDruidSource, MetricSpec, PLACEHOLDER_TIMESTAMP_SPEC, + REINDEX_TIMESTAMP_SPEC, + TIME_COLUMN, TimestampSpec, Transform, TransformSpec, @@ -150,13 +152,13 @@ export function headerFromSampleResponse(options: HeaderFromSampleResponseOption let columns = sortWithPrefixSuffix( dedupe(sampleResponse.data.flatMap(s => (s.parsed ? 
Object.keys(s.parsed) : []))).sort(), - columnOrder || ['__time'], + columnOrder || [TIME_COLUMN], suffixColumnOrder || [], alphanumericCompare, ); if (ignoreTimeColumn) { - columns = columns.filter(c => c !== '__time'); + columns = columns.filter(c => c !== TIME_COLUMN); } return columns; @@ -287,7 +289,7 @@ export async function sampleForConnect( ioConfig, dataSchema: { dataSource: 'sample', - timestampSpec: PLACEHOLDER_TIMESTAMP_SPEC, + timestampSpec: reingestMode ? REINDEX_TIMESTAMP_SPEC : PLACEHOLDER_TIMESTAMP_SPEC, dimensionsSpec: {}, }, } as any, @@ -335,13 +337,15 @@ export async function sampleForParser( sampleStrategy, ); + const reingestMode = isDruidSource(spec); + const sampleSpec: SampleSpec = { type: samplerType, spec: { ioConfig, dataSchema: { dataSource: 'sample', - timestampSpec: PLACEHOLDER_TIMESTAMP_SPEC, + timestampSpec: reingestMode ? REINDEX_TIMESTAMP_SPEC : PLACEHOLDER_TIMESTAMP_SPEC, dimensionsSpec: {}, }, }, @@ -395,7 +399,7 @@ export async function sampleForTimestamp( dimensionsSpec: {}, timestampSpec, transformSpec: { - transforms: transforms.filter(transform => transform.name === '__time'), + transforms: transforms.filter(transform => transform.name === TIME_COLUMN), }, }, }, @@ -456,7 +460,7 @@ export async function sampleForTransform( headerFromSampleResponse({ sampleResponse: sampleResponseHack, ignoreTimeColumn: true, - columnOrder: ['__time'].concat(inputFormatColumns), + columnOrder: [TIME_COLUMN].concat(inputFormatColumns), }).concat(transforms.map(t => t.name)), ); } @@ -515,7 +519,7 @@ export async function sampleForFilter( headerFromSampleResponse({ sampleResponse: sampleResponseHack, ignoreTimeColumn: true, - columnOrder: ['__time'].concat(inputFormatColumns), + columnOrder: [TIME_COLUMN].concat(inputFormatColumns), }).concat(transforms.map(t => t.name)), ); } diff --git a/web-console/src/views/load-data-view/load-data-view.tsx b/web-console/src/views/load-data-view/load-data-view.tsx index a9d6a6274c3c..0cfa0bc881e2 100644 --- a/web-console/src/views/load-data-view/load-data-view.tsx +++ b/web-console/src/views/load-data-view/load-data-view.tsx @@ -54,6 +54,7 @@ import { } from '../../components'; import { FormGroupWithInfo } from '../../components/form-group-with-info/form-group-with-info'; import { AsyncActionDialog } from '../../dialogs'; +import { TIME_COLUMN } from '../../druid-models'; import { addTimestampTransform, CONSTANT_TIMESTAMP_SPEC, @@ -1221,7 +1222,7 @@ export class LoadDataView extends React.PureComponent k !== '__time' && !aggregators[k]) + .filter(k => k !== TIME_COLUMN && !aggregators[k]) .map(k => ({ name: k, type: String(inputData.columns![k].type || 'string').toLowerCase(), @@ -1453,7 +1454,7 @@ export class LoadDataView extends React.PureComponent Date: Sun, 6 Dec 2020 19:13:12 -0800 Subject: [PATCH 18/24] Adjustments for tests. 
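Two test-facing adjustments: InputSourceSampler now builds each sampled row in a LinkedHashMap with `__time` inserted first, so sampled rows serialize with a stable, time-leading key order; the reindexing e2e test expectations are reordered to match.

A sketch of the ordering behavior this relies on (values borrowed from the test data below; the class name is invented for the example):

```java
import java.util.LinkedHashMap;
import java.util.Map;

class SamplerOrderSketch
{
  public static void main(String[] args)
  {
    // LinkedHashMap iterates in insertion order, so serialization lists
    // __time first, then the remaining columns in the order they were added.
    Map<String, Object> parsed = new LinkedHashMap<>();
    parsed.put("__time", 1442018818771L);
    parsed.put("channel", "#en.wikipedia");
    parsed.put("added", "36");
    System.out.println(parsed);
    // {__time=1442018818771, channel=#en.wikipedia, added=36}
  }
}
```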
--- .../overlord/sampler/InputSourceSampler.java | 6 +-- web-console/e2e-tests/reindexing.spec.ts | 52 +++++++++---------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java index 94a3e52e82b2..05e31b9a18ee 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java @@ -50,7 +50,7 @@ import javax.annotation.Nullable; import java.io.File; import java.util.ArrayList; -import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Objects; @@ -162,10 +162,10 @@ public SamplerResponse sample( columnNames.remove(SamplerInputRow.SAMPLER_ORDERING_COLUMN); for (Row row : index) { - Map parsed = new HashMap<>(); + Map parsed = new LinkedHashMap<>(); - columnNames.forEach(k -> parsed.put(k, row.getRaw(k))); parsed.put(ColumnHolder.TIME_COLUMN_NAME, row.getTimestampFromEpoch()); + columnNames.forEach(k -> parsed.put(k, row.getRaw(k))); Number sortKey = row.getMetric(SamplerInputRow.SAMPLER_ORDERING_COLUMN); if (sortKey != null) { diff --git a/web-console/e2e-tests/reindexing.spec.ts b/web-console/e2e-tests/reindexing.spec.ts index ae45b735965f..23be54cf74e6 100644 --- a/web-console/e2e-tests/reindexing.spec.ts +++ b/web-console/e2e-tests/reindexing.spec.ts @@ -115,50 +115,50 @@ function validateConnectLocalData(preview: string) { expect(firstLine).toBe( 'Druid row: {' + '"__time":1442018818771' + - ',"isRobot":"false"' + - ',"countryIsoCode":null' + - ',"added":"36"' + - ',"regionName":null' + ',"channel":"#en.wikipedia"' + - ',"delta":"36"' + - ',"isUnpatrolled":"false"' + - ',"isNew":"false"' + - ',"isMinor":"false"' + - ',"isAnonymous":"false"' + - ',"deleted":"0"' + ',"cityName":null' + - ',"metroCode":null' + - ',"namespace":"Talk"' + ',"comment":"added project"' + + ',"countryIsoCode":null' + ',"countryName":null' + + ',"isAnonymous":"false"' + + ',"isMinor":"false"' + + ',"isNew":"false"' + + ',"isRobot":"false"' + + ',"isUnpatrolled":"false"' + + ',"metroCode":null' + + ',"namespace":"Talk"' + ',"page":"Talk:Oswald Tilghman"' + - ',"user":"GELongstreet"' + ',"regionIsoCode":null' + + ',"regionName":null' + + ',"user":"GELongstreet"' + + ',"added":"36"' + + ',"deleted":"0"' + + ',"delta":"36"' + '}', ); const lastLine = lines[lines.length - 1]; expect(lastLine).toBe( 'Druid row: {' + '"__time":1442020314823' + - ',"isRobot":"false"' + - ',"countryIsoCode":null' + - ',"added":"1"' + - ',"regionName":null' + ',"channel":"#en.wikipedia"' + - ',"delta":"1"' + - ',"isUnpatrolled":"false"' + - ',"isNew":"false"' + - ',"isMinor":"true"' + - ',"isAnonymous":"false"' + - ',"deleted":"0"' + ',"cityName":null' + - ',"metroCode":null' + - ',"namespace":"Main"' + ',"comment":"/* History */[[WP:AWB/T|Typo fixing]], [[WP:AWB/T|typo(s) fixed]]: nothern → northern using [[Project:AWB|AWB]]"' + + ',"countryIsoCode":null' + ',"countryName":null' + + ',"isAnonymous":"false"' + + ',"isMinor":"true"' + + ',"isNew":"false"' + + ',"isRobot":"false"' + + ',"isUnpatrolled":"false"' + + ',"metroCode":null' + + ',"namespace":"Main"' + ',"page":"Hapoel Katamon Jerusalem F.C."' + - ',"user":"The Quixotic Potato"' + ',"regionIsoCode":null' + + ',"regionName":null' + + ',"user":"The Quixotic Potato"' + + 
',"added":"1"' + + ',"deleted":"0"' + + ',"delta":"1"' + '}', ); } From 7695d8a5e67ad7a8a8414805b7ff389ad93724cd Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Sun, 6 Dec 2020 21:15:47 -0800 Subject: [PATCH 19/24] Reorder test data. --- web-console/e2e-tests/reindexing.spec.ts | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/web-console/e2e-tests/reindexing.spec.ts b/web-console/e2e-tests/reindexing.spec.ts index 23be54cf74e6..6c6f68f6f5cc 100644 --- a/web-console/e2e-tests/reindexing.spec.ts +++ b/web-console/e2e-tests/reindexing.spec.ts @@ -116,24 +116,24 @@ function validateConnectLocalData(preview: string) { 'Druid row: {' + '"__time":1442018818771' + ',"channel":"#en.wikipedia"' + - ',"cityName":null' + ',"comment":"added project"' + - ',"countryIsoCode":null' + - ',"countryName":null' + ',"isAnonymous":"false"' + ',"isMinor":"false"' + ',"isNew":"false"' + ',"isRobot":"false"' + ',"isUnpatrolled":"false"' + - ',"metroCode":null' + ',"namespace":"Talk"' + ',"page":"Talk:Oswald Tilghman"' + - ',"regionIsoCode":null' + - ',"regionName":null' + ',"user":"GELongstreet"' + ',"added":"36"' + ',"deleted":"0"' + ',"delta":"36"' + + ',"cityName":null' + + ',"countryIsoCode":null' + + ',"countryName":null' + + ',"regionIsoCode":null' + + ',"regionName":null' + + ',"metroCode":null' + '}', ); const lastLine = lines[lines.length - 1]; @@ -141,24 +141,24 @@ function validateConnectLocalData(preview: string) { 'Druid row: {' + '"__time":1442020314823' + ',"channel":"#en.wikipedia"' + - ',"cityName":null' + ',"comment":"/* History */[[WP:AWB/T|Typo fixing]], [[WP:AWB/T|typo(s) fixed]]: nothern → northern using [[Project:AWB|AWB]]"' + - ',"countryIsoCode":null' + - ',"countryName":null' + ',"isAnonymous":"false"' + ',"isMinor":"true"' + ',"isNew":"false"' + ',"isRobot":"false"' + ',"isUnpatrolled":"false"' + - ',"metroCode":null' + ',"namespace":"Main"' + ',"page":"Hapoel Katamon Jerusalem F.C."' + - ',"regionIsoCode":null' + - ',"regionName":null' + ',"user":"The Quixotic Potato"' + ',"added":"1"' + ',"deleted":"0"' + ',"delta":"1"' + + ',"cityName":null' + + ',"countryIsoCode":null' + + ',"countryName":null' + + ',"regionIsoCode":null' + + ',"regionName":null' + + ',"metroCode":null' + '}', ); } From 02dfb64633adee53d3fef0e7b915ccbf795b51e1 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Mon, 7 Dec 2020 18:28:35 -0800 Subject: [PATCH 20/24] Update docs. --- docs/configuration/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/configuration/index.md b/docs/configuration/index.md index 7b1c4e3db2b6..2812ec73414a 100644 --- a/docs/configuration/index.md +++ b/docs/configuration/index.md @@ -1249,7 +1249,7 @@ Additional peon configs include: |`druid.indexer.task.gracefulShutdownTimeout`|Wait this long on middleManager restart for restorable tasks to gracefully exit.|PT5M| |`druid.indexer.task.hadoopWorkingPath`|Temporary working directory for Hadoop tasks.|`/tmp/druid-indexing`| |`druid.indexer.task.restoreTasksOnRestart`|If true, MiddleManagers will attempt to stop tasks gracefully on shutdown and restore them on restart.|false| -|`druid.indexer.task.ignoreTimestampSpecForDruidInputSource`|If true, tasks using the [Druid input source](../ingestion/native-batch.md#druid-input-source) will ignore the provided timestampSpec, and will use the `__time` column of the input datasource. 
This option is provided for compatibility with ingestion specs written before Druid 0.20.0.|false| +|`druid.indexer.task.ignoreTimestampSpecForDruidInputSource`|If true, tasks using the [Druid input source](../ingestion/native-batch.md#druid-input-source) will ignore the provided timestampSpec, and will use the `__time` column of the input datasource. This option is provided for compatibility with ingestion specs written before Druid 0.21.0.|false| |`druid.indexer.server.maxChatRequests`|Maximum number of concurrent requests served by a task's chat handler. Set to 0 to disable limiting.|0| If the peon is running in remote mode, there must be an Overlord up and running. Peons in remote mode can set the following configurations: @@ -1314,7 +1314,7 @@ then the value from the configuration below is used: |`druid.indexer.task.gracefulShutdownTimeout`|Wait this long on Indexer restart for restorable tasks to gracefully exit.|PT5M| |`druid.indexer.task.hadoopWorkingPath`|Temporary working directory for Hadoop tasks.|`/tmp/druid-indexing`| |`druid.indexer.task.restoreTasksOnRestart`|If true, the Indexer will attempt to stop tasks gracefully on shutdown and restore them on restart.|false| -|`druid.indexer.task.ignoreTimestampSpecForDruidInputSource`|If true, tasks using the [Druid input source](../ingestion/native-batch.md#druid-input-source) will ignore the provided timestampSpec, and will use the `__time` column of the input datasource. This option is provided for compatibility with ingestion specs written before Druid 0.20.0.|false| +|`druid.indexer.task.ignoreTimestampSpecForDruidInputSource`|If true, tasks using the [Druid input source](../ingestion/native-batch.md#druid-input-source) will ignore the provided timestampSpec, and will use the `__time` column of the input datasource. This option is provided for compatibility with ingestion specs written before Druid 0.21.0.|false| |`druid.peon.taskActionClient.retry.minWait`|The minimum retry time to communicate with Overlord.|PT5S| |`druid.peon.taskActionClient.retry.maxWait`|The maximum retry time to communicate with Overlord.|PT1M| |`druid.peon.taskActionClient.retry.maxRetryCount`|The maximum number of retries to communicate with Overlord.|60| From 8ce44bd2c43eea9dcfdde9b498bb1416ee32f112 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Fri, 29 Jan 2021 10:30:40 -0800 Subject: [PATCH 21/24] Update docs to say Druid 0.22.0 instead of 0.21.0. --- docs/configuration/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/configuration/index.md b/docs/configuration/index.md index 95159a11c0a9..a0a33b07e2b6 100644 --- a/docs/configuration/index.md +++ b/docs/configuration/index.md @@ -1251,7 +1251,7 @@ Additional peon configs include: |`druid.indexer.task.gracefulShutdownTimeout`|Wait this long on middleManager restart for restorable tasks to gracefully exit.|PT5M| |`druid.indexer.task.hadoopWorkingPath`|Temporary working directory for Hadoop tasks.|`/tmp/druid-indexing`| |`druid.indexer.task.restoreTasksOnRestart`|If true, MiddleManagers will attempt to stop tasks gracefully on shutdown and restore them on restart.|false| -|`druid.indexer.task.ignoreTimestampSpecForDruidInputSource`|If true, tasks using the [Druid input source](../ingestion/native-batch.md#druid-input-source) will ignore the provided timestampSpec, and will use the `__time` column of the input datasource. 
This option is provided for compatibility with ingestion specs written before Druid 0.21.0.|false| +|`druid.indexer.task.ignoreTimestampSpecForDruidInputSource`|If true, tasks using the [Druid input source](../ingestion/native-batch.md#druid-input-source) will ignore the provided timestampSpec, and will use the `__time` column of the input datasource. This option is provided for compatibility with ingestion specs written before Druid 0.22.0.|false| |`druid.indexer.server.maxChatRequests`|Maximum number of concurrent requests served by a task's chat handler. Set to 0 to disable limiting.|0| If the peon is running in remote mode, there must be an Overlord up and running. Peons in remote mode can set the following configurations: @@ -1316,7 +1316,7 @@ then the value from the configuration below is used: |`druid.indexer.task.gracefulShutdownTimeout`|Wait this long on Indexer restart for restorable tasks to gracefully exit.|PT5M| |`druid.indexer.task.hadoopWorkingPath`|Temporary working directory for Hadoop tasks.|`/tmp/druid-indexing`| |`druid.indexer.task.restoreTasksOnRestart`|If true, the Indexer will attempt to stop tasks gracefully on shutdown and restore them on restart.|false| -|`druid.indexer.task.ignoreTimestampSpecForDruidInputSource`|If true, tasks using the [Druid input source](../ingestion/native-batch.md#druid-input-source) will ignore the provided timestampSpec, and will use the `__time` column of the input datasource. This option is provided for compatibility with ingestion specs written before Druid 0.21.0.|false| +|`druid.indexer.task.ignoreTimestampSpecForDruidInputSource`|If true, tasks using the [Druid input source](../ingestion/native-batch.md#druid-input-source) will ignore the provided timestampSpec, and will use the `__time` column of the input datasource. This option is provided for compatibility with ingestion specs written before Druid 0.22.0.|false| |`druid.peon.taskActionClient.retry.minWait`|The minimum retry time to communicate with Overlord.|PT5S| |`druid.peon.taskActionClient.retry.maxWait`|The maximum retry time to communicate with Overlord.|PT1M| |`druid.peon.taskActionClient.retry.maxRetryCount`|The maximum number of retries to communicate with Overlord.|60| From ade207e73bbc5410f3c7fa577e1fabce4d82000a Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Thu, 25 Feb 2021 08:54:55 -0800 Subject: [PATCH 22/24] Fix test. 
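The segment-loading interface picked up a SegmentLazyLoadFailCallback parameter on `getSegment` while this branch was in flight, so the stubbed loader in DruidSegmentReaderTest must override the new three-argument signature. The override itself remains a no-op, as shown in the diff below.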
--- .../apache/druid/indexing/input/DruidSegmentReaderTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/input/DruidSegmentReaderTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/input/DruidSegmentReaderTest.java index 3a01ec939fc3..9270f5f8a573 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/input/DruidSegmentReaderTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/input/DruidSegmentReaderTest.java @@ -47,6 +47,7 @@ import org.apache.druid.segment.IndexIO; import org.apache.druid.segment.IndexSpec; import org.apache.druid.segment.Segment; +import org.apache.druid.segment.SegmentLazyLoadFailCallback; import org.apache.druid.segment.TestHelper; import org.apache.druid.segment.incremental.IncrementalIndex; import org.apache.druid.segment.incremental.IncrementalIndexSchema; @@ -597,7 +598,7 @@ public boolean isSegmentLoaded(DataSegment segment) } @Override - public Segment getSegment(DataSegment segment, boolean lazy) + public Segment getSegment(DataSegment segment, boolean lazy, SegmentLazyLoadFailCallback loadFailed) { throw new UnsupportedOperationException("unused"); } From 8fb44d7cd1e7f7f19ada2357baf18ae773a2d454 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Thu, 25 Feb 2021 13:05:20 -0800 Subject: [PATCH 23/24] Fix ITAutoCompactionTest. --- .../druid/tests/coordinator/duty/ITAutoCompactionTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration-tests/src/test/java/org/apache/druid/tests/coordinator/duty/ITAutoCompactionTest.java b/integration-tests/src/test/java/org/apache/druid/tests/coordinator/duty/ITAutoCompactionTest.java index a68e09cd7796..cdf7c396c70d 100644 --- a/integration-tests/src/test/java/org/apache/druid/tests/coordinator/duty/ITAutoCompactionTest.java +++ b/integration-tests/src/test/java/org/apache/druid/tests/coordinator/duty/ITAutoCompactionTest.java @@ -129,7 +129,7 @@ public void testAutoCompactionDutySubmitAndVerifyCompaction() throws Exception fullDatasourceName, AutoCompactionSnapshot.AutoCompactionScheduleStatus.RUNNING, 0, - 22488, + 22481, 0, 0, 3, @@ -275,7 +275,7 @@ public void testAutoCompactionDutyCanUpdateTaskSlots() throws Exception fullDatasourceName, AutoCompactionSnapshot.AutoCompactionScheduleStatus.RUNNING, 0, - 22488, + 22481, 0, 0, 3, From 9bc0481e42ebcdc31f466871fea408b114da6a33 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Wed, 24 Mar 2021 18:30:10 -0700 Subject: [PATCH 24/24] Changes from review & from merging. 
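Review follow-ups plus merge fallout: test callers construct InputRowSchema with `ColumnsFilter.all()` in place of the removed metric-name list; the native-batch docs point readers at compaction as a simpler alternative to reindexing with the Druid input source; DruidInputSource gains equals/hashCode/toString and a class-level `@JsonInclude(NON_NULL)` so its serde round-trips cleanly (covered by the new DruidInputSourceTest); and DruidSegmentReader holds a single InputRowSchema instead of separate timestampSpec/dimensionsSpec/columnsFilter fields.

A minimal sketch of the updated InputRowSchema call pattern, drawn from the JsonReaderTest change below (the wrapper class is invented for the example):

```java
import com.google.common.collect.ImmutableList;
import org.apache.druid.data.input.ColumnsFilter;
import org.apache.druid.data.input.InputRowSchema;
import org.apache.druid.data.input.impl.DimensionsSpec;
import org.apache.druid.data.input.impl.TimestampSpec;

class SchemaSketch
{
  static InputRowSchema example()
  {
    // The third argument is now a ColumnsFilter rather than a list of metric
    // names. ColumnsFilter.all() reads every column; inclusionBased(...) and
    // exclusionBased(...) narrow the set for columnar readers.
    return new InputRowSchema(
        new TimestampSpec("timestamp", "iso", null),
        new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("bar", "foo"))),
        ColumnsFilter.all()
    );
  }
}
```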
--- .../druid/data/input/impl/JsonReaderTest.java | 2 +- docs/ingestion/native-batch.md | 5 +- .../indexing/input/DruidInputSource.java | 45 +++- .../indexing/input/DruidSegmentReader.java | 21 +- .../indexing/input/DruidInputSourceTest.java | 224 ++++++++++++++++++ .../seekablestream/StreamChunkParserTest.java | 4 +- 6 files changed, 276 insertions(+), 25 deletions(-) create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/input/DruidInputSourceTest.java diff --git a/core/src/test/java/org/apache/druid/data/input/impl/JsonReaderTest.java b/core/src/test/java/org/apache/druid/data/input/impl/JsonReaderTest.java index 7e5e71b672a1..7ab52a095d51 100644 --- a/core/src/test/java/org/apache/druid/data/input/impl/JsonReaderTest.java +++ b/core/src/test/java/org/apache/druid/data/input/impl/JsonReaderTest.java @@ -379,7 +379,7 @@ public void testEmptyJSONText() throws IOException new InputRowSchema( new TimestampSpec("timestamp", "iso", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("bar", "foo"))), - Collections.emptyList() + ColumnsFilter.all() ), source, null diff --git a/docs/ingestion/native-batch.md b/docs/ingestion/native-batch.md index b4c7ff302e65..dece5bf260c4 100644 --- a/docs/ingestion/native-batch.md +++ b/docs/ingestion/native-batch.md @@ -1308,6 +1308,8 @@ and the format to `auto` or `millis`. It is OK for the input and output datasources to be the same. In this case, newly generated data will overwrite the previous data for the intervals specified in the `granularitySpec`. Generally, if you are going to do this, it is a good idea to test out your reindexing by writing to a separate datasource before overwriting your main one. +Alternatively, if your goals can be satisfied by [compaction](compaction.md), consider that instead as a simpler +approach. An example task spec is shown below. It reads from a hypothetical raw datasource `wikipedia_raw` and creates a new rolled-up datasource `wikipedia_rollup` by grouping on hour, "countryName", and "page". @@ -1353,8 +1355,7 @@ rolled-up datasource `wikipedia_rollup` by grouping on hour, "countryName", and "tuningConfig": { "type": "index_parallel", "partitionsSpec": { - "type": "hashed", - "numShards": 1 + "type": "hashed" }, "forceGuaranteedRollup": true, "maxNumConcurrentSubTasks": 1 diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java index bf08c42f785d..c9d0f4e464b2 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java @@ -22,7 +22,6 @@ import com.fasterxml.jackson.annotation.JacksonInject; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonInclude; -import com.fasterxml.jackson.annotation.JsonInclude.Include; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; import com.google.common.collect.FluentIterable; @@ -73,6 +72,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.SortedMap; import java.util.TreeMap; import java.util.concurrent.ThreadLocalRandom; @@ -83,6 +83,7 @@ * * Used internally by {@link org.apache.druid.indexing.common.task.CompactionTask}, and can also be used directly. 
*/ +@JsonInclude(JsonInclude.Include.NON_NULL) public class DruidInputSource extends AbstractInputSource implements SplittableInputSource> { private static final Logger LOG = new Logger(DruidInputSource.class); @@ -184,7 +185,6 @@ public String getDataSource() @Nullable @JsonProperty - @JsonInclude(Include.NON_NULL) public Interval getInterval() { return interval; @@ -192,14 +192,12 @@ public Interval getInterval() @Nullable @JsonProperty("segments") - @JsonInclude(Include.NON_NULL) public List getSegmentIds() { return segmentIds; } @JsonProperty("filter") - @JsonInclude(Include.NON_NULL) public DimFilter getDimFilter() { return dimFilter; @@ -209,7 +207,6 @@ public DimFilter getDimFilter() * Included for serde backwards-compatibility only. Not used. */ @JsonProperty - @JsonInclude(Include.NON_NULL) public List getDimensions() { return dimensions; @@ -219,7 +216,6 @@ public List getDimensions() * Included for serde backwards-compatibility only. Not used. */ @JsonProperty - @JsonInclude(Include.NON_NULL) public List getMetrics() { return metrics; @@ -355,6 +351,43 @@ public boolean needsFormat() return false; } + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + DruidInputSource that = (DruidInputSource) o; + return Objects.equals(dataSource, that.dataSource) + && Objects.equals(interval, that.interval) + && Objects.equals(segmentIds, that.segmentIds) + && Objects.equals(dimFilter, that.dimFilter) + && Objects.equals(dimensions, that.dimensions) + && Objects.equals(metrics, that.metrics); + } + + @Override + public int hashCode() + { + return Objects.hash(dataSource, interval, segmentIds, dimFilter, dimensions, metrics); + } + + @Override + public String toString() + { + return "DruidInputSource{" + + "dataSource='" + dataSource + '\'' + + ", interval=" + interval + + ", segmentIds=" + segmentIds + + ", dimFilter=" + dimFilter + + (dimensions != null ? ", dimensions=" + dimensions : "") + + (metrics != null ? 
", metrics=" + metrics : "") + + '}'; + } + public static Iterator>> createSplits( CoordinatorClient coordinatorClient, RetryPolicyFactory retryPolicyFactory, diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidSegmentReader.java b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidSegmentReader.java index c1ddc26eff77..8e3bfe7108a2 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidSegmentReader.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidSegmentReader.java @@ -75,9 +75,8 @@ public class DruidSegmentReader extends IntermediateRowParsingReader> intermediateRowIterator() throw @Override protected List parseInputRows(Map intermediateRow) throws ParseException { - return Collections.singletonList( - MapInputRowParser.parse( - new InputRowSchema( - timestampSpec, - dimensionsSpec, - columnsFilter - ), - intermediateRow - ) - ); + return Collections.singletonList(MapInputRowParser.parse(inputRowSchema, intermediateRow)); } @Override diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/input/DruidInputSourceTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/input/DruidInputSourceTest.java new file mode 100644 index 000000000000..dcdc537e9cd8 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/input/DruidInputSourceTest.java @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.indexing.input; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.InjectableValues; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.ImmutableList; +import org.apache.druid.client.coordinator.CoordinatorClient; +import org.apache.druid.data.input.InputSource; +import org.apache.druid.guice.IndexingServiceInputSourceModule; +import org.apache.druid.indexing.common.RetryPolicyFactory; +import org.apache.druid.indexing.common.SegmentLoaderFactory; +import org.apache.druid.indexing.common.config.TaskConfig; +import org.apache.druid.indexing.firehose.WindowedSegmentId; +import org.apache.druid.java.util.common.Intervals; +import org.apache.druid.segment.IndexIO; +import org.apache.druid.segment.TestHelper; +import org.easymock.EasyMock; +import org.hamcrest.CoreMatchers; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +public class DruidInputSourceTest +{ + private final IndexIO indexIO = EasyMock.createMock(IndexIO.class); + private final CoordinatorClient coordinatorClient = EasyMock.createMock(CoordinatorClient.class); + private final SegmentLoaderFactory segmentLoaderFactory = EasyMock.createMock(SegmentLoaderFactory.class); + private final RetryPolicyFactory retryPolicyFactory = EasyMock.createMock(RetryPolicyFactory.class); + private final TaskConfig taskConfig = EasyMock.createMock(TaskConfig.class); + + private ObjectMapper mapper = null; + + @Rule + public ExpectedException expectedException = ExpectedException.none(); + + @Before + public void setUp() + { + mapper = TestHelper.makeJsonMapper(); + mapper.registerModules(new IndexingServiceInputSourceModule().getJacksonModules()); + + final InjectableValues.Std injectableValues = (InjectableValues.Std) mapper.getInjectableValues(); + injectableValues.addValue(IndexIO.class, indexIO); + injectableValues.addValue(CoordinatorClient.class, coordinatorClient); + injectableValues.addValue(SegmentLoaderFactory.class, segmentLoaderFactory); + injectableValues.addValue(RetryPolicyFactory.class, retryPolicyFactory); + injectableValues.addValue(TaskConfig.class, taskConfig); + } + + @Test + public void testSerdeUsingIntervals() throws Exception + { + final String json = "{" + + "\"type\":\"druid\"," + + "\"dataSource\":\"foo\"," + + "\"interval\":\"2000-01-01T00:00:00.000Z/2001-01-01T00:00:00.000Z\"" + + "}"; + + final InputSource inputSource = mapper.readValue(json, InputSource.class); + + Assert.assertThat(inputSource, CoreMatchers.instanceOf(DruidInputSource.class)); + Assert.assertEquals( + new DruidInputSource( + "foo", + Intervals.of("2000/2001"), + null, + null, + null, + null, + indexIO, + coordinatorClient, + segmentLoaderFactory, + retryPolicyFactory, + taskConfig + ), + inputSource + ); + + Assert.assertEquals(json, mapper.writeValueAsString(inputSource)); + } + + @Test + public void testSerdeUsingIntervalsAndLegacyDimensionsMetrics() throws Exception + { + final String json = "{" + + "\"type\":\"druid\"," + + "\"dataSource\":\"foo\"," + + "\"interval\":\"2000-01-01T00:00:00.000Z/2001-01-01T00:00:00.000Z\"," + + "\"dimensions\":[\"a\"]," + + "\"metrics\":[\"b\"]" + + "}"; + + final InputSource inputSource = mapper.readValue(json, InputSource.class); + + Assert.assertThat(inputSource, CoreMatchers.instanceOf(DruidInputSource.class)); + Assert.assertEquals( + new DruidInputSource( + "foo", + Intervals.of("2000/2001"), + 
null, + null, + ImmutableList.of("a"), + ImmutableList.of("b"), + indexIO, + coordinatorClient, + segmentLoaderFactory, + retryPolicyFactory, + taskConfig + ), + inputSource + ); + + Assert.assertEquals(json, mapper.writeValueAsString(inputSource)); + } + + @Test + public void testSerdeUsingSegments() throws Exception + { + final String json = "{" + + "\"type\":\"druid\"," + + "\"dataSource\":\"foo\"," + + "\"segments\":[" + + "{\"segmentId\":\"foo_2000-01-01T00:00:00.000Z_2000-01-01T01:00:00.000Z_abc123\"," + + "\"intervals\":[\"2000-01-01T00:00:00.000Z/2000-01-01T12:00:00.000Z\"]}" + + "]" + + "}"; + + final InputSource inputSource = mapper.readValue(json, InputSource.class); + + Assert.assertThat(inputSource, CoreMatchers.instanceOf(DruidInputSource.class)); + Assert.assertEquals( + new DruidInputSource( + "foo", + null, + ImmutableList.of( + new WindowedSegmentId( + "foo_2000-01-01T00:00:00.000Z_2000-01-01T01:00:00.000Z_abc123", + ImmutableList.of(Intervals.of("2000-01-01T00/2000-01-01T12")) + ) + ), + null, + null, + null, + indexIO, + coordinatorClient, + segmentLoaderFactory, + retryPolicyFactory, + taskConfig + ), + inputSource + ); + + Assert.assertEquals(json, mapper.writeValueAsString(inputSource)); + } + + @Test + public void testSerdeUsingBothIntervalsAndSegments() throws Exception + { + final String json = "{" + + "\"type\":\"druid\"," + + "\"dataSource\":\"foo\"," + + "\"interval\":\"2000-01-01T00:00:00.000Z/2001-01-01T00:00:00.000Z\"," + + "\"segments\":[" + + " {\"segmentId\":\"foo_2000-01-01T00:00:00.000Z_2000-01-01T01:00:00.000Z_abc123\"," + + " \"intervals\":[\"2000-01-01T00:00:00.000Z/2000-01-01T12:00:00.000Z\"]}" + + "]" + + "}"; + + + expectedException.expect(JsonProcessingException.class); + expectedException.expectMessage("Specify exactly one of 'interval' and 'segments'"); + + mapper.readValue(json, InputSource.class); + } + + @Test + public void testSerdeUsingNeitherIntervalsNorSegments() throws Exception + { + final String json = "{" + + "\"type\":\"druid\"," + + "\"dataSource\":\"foo\"" + + "}"; + + expectedException.expect(JsonProcessingException.class); + expectedException.expectMessage("Specify exactly one of 'interval' and 'segments'"); + + mapper.readValue(json, InputSource.class); + } + + @Test + public void testSerdeUsingNoDataSource() throws Exception + { + final String json = "{" + + "\"type\":\"druid\"," + + "\"interval\":\"2000-01-01T00:00:00.000Z/2001-01-01T00:00:00.000Z\"" + + "}"; + + expectedException.expect(JsonProcessingException.class); + expectedException.expectMessage("dataSource"); + + mapper.readValue(json, InputSource.class); + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/StreamChunkParserTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/StreamChunkParserTest.java index 92af29e17880..1cab704a2eff 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/StreamChunkParserTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/StreamChunkParserTest.java @@ -180,7 +180,7 @@ public void parseEmptyNotEndOfShard() throws IOException final StreamChunkParser chunkParser = new StreamChunkParser<>( null, inputFormat, - new InputRowSchema(TIMESTAMP_SPEC, DimensionsSpec.EMPTY, Collections.emptyList()), + new InputRowSchema(TIMESTAMP_SPEC, DimensionsSpec.EMPTY, ColumnsFilter.all()), TransformSpec.NONE, temporaryFolder.newFolder(), row -> true, @@ -203,7 +203,7 @@ public void parseEmptyEndOfShard() throws IOException 
final StreamChunkParser chunkParser = new StreamChunkParser<>( null, inputFormat, - new InputRowSchema(TIMESTAMP_SPEC, DimensionsSpec.EMPTY, Collections.emptyList()), + new InputRowSchema(TIMESTAMP_SPEC, DimensionsSpec.EMPTY, ColumnsFilter.all()), TransformSpec.NONE, temporaryFolder.newFolder(), row -> true,