From 203fc7a376216ccd11ea7cbea39a444f83ba4b58 Mon Sep 17 00:00:00 2001 From: cryptoe Date: Thu, 10 Feb 2022 17:47:44 +0530 Subject: [PATCH 1/5] Adding new config for disabling group by on multiValue column --- .../query/groupby/GroupByQueryConfig.java | 14 +++++++++ .../query/groupby/GroupByQueryEngine.java | 8 ++++- .../epinephelinae/GroupByQueryEngineV2.java | 15 ++++++++-- .../query/groupby/GroupByQueryRunnerTest.java | 29 +++++++++++++++++++ .../CalciteMultiValueStringQueryTest.java | 25 +++++++++++++++- 5 files changed, 87 insertions(+), 4 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryConfig.java b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryConfig.java index 7bf2d0d3b445..276dabd1758d 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryConfig.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryConfig.java @@ -34,6 +34,7 @@ public class GroupByQueryConfig public static final String CTX_KEY_FORCE_PUSH_DOWN_NESTED_QUERY = "forcePushDownNestedQuery"; public static final String CTX_KEY_EXECUTING_NESTED_QUERY = "executingNestedQuery"; public static final String CTX_KEY_ARRAY_RESULT_ROWS = "resultAsArray"; + public static final String CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING = "groupByEnableMultiValueUnnesting"; private static final String CTX_KEY_IS_SINGLE_THREADED = "groupByIsSingleThreaded"; private static final String CTX_KEY_MAX_INTERMEDIATE_ROWS = "maxIntermediateRows"; private static final String CTX_KEY_MAX_RESULTS = "maxResults"; @@ -97,6 +98,9 @@ public class GroupByQueryConfig @JsonProperty private boolean vectorize = true; + @JsonProperty + private boolean enableMultiValueUnnesting = true; + public String getDefaultStrategy() { return defaultStrategy; @@ -192,6 +196,11 @@ public boolean isForcePushDownNestedQuery() return forcePushDownNestedQuery; } + public boolean isMultiValueUnnestingEnabled() + { + return enableMultiValueUnnesting; + } + public GroupByQueryConfig withOverrides(final GroupByQuery query) { final GroupByQueryConfig newConfig = new GroupByQueryConfig(); @@ -244,6 +253,10 @@ public GroupByQueryConfig withOverrides(final GroupByQuery query) getNumParallelCombineThreads() ); newConfig.vectorize = query.getContextBoolean(QueryContexts.VECTORIZE_KEY, isVectorize()); + newConfig.enableMultiValueUnnesting = query.getContextValue( + CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, + isMultiValueUnnestingEnabled() + ); return newConfig; } @@ -266,6 +279,7 @@ public String toString() ", numParallelCombineThreads=" + numParallelCombineThreads + ", vectorize=" + vectorize + ", forcePushDownNestedQuery=" + forcePushDownNestedQuery + + ", enableMultiValueUnnesting=" + enableMultiValueUnnesting + '}'; } } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java index d3e072e2e4f1..a1fdf7051497 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java @@ -91,7 +91,13 @@ public Sequence process(final GroupByQuery query, final StorageAdapter stor "Null storage adapter found. Probably trying to issue a query against a segment being memory unmapped." ); } - + if (!query.getContextValue(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, true)) { + throw new UnsupportedOperationException(String.format( + "GroupBy v1 does not support %s as false. Set %s to true", + GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, + GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING + )); + } final List intervals = query.getQuerySegmentSpec().getIntervals(); if (intervals.size() != 1) { throw new IAE("Should only have one interval, got[%s]", intervals); diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java index 5a5263efff1f..e7a9b1d62bee 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java @@ -228,6 +228,17 @@ public GroupByEngineIterator make() processingBuffer ); + final boolean allSingleValueDims = hasNoExplodingDimensions(columnSelectorFactory, + query.getDimensions()); + if (!(query.getContextValue(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, true)) + && !allSingleValueDims) { + throw new ISE( + "Group by on multi value columns not allowed." + + " Please set %s in query context to true", + GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY + ); + } + if (cardinalityForArrayAggregation >= 0) { return new ArrayAggregateIterator( query, @@ -236,7 +247,7 @@ public GroupByEngineIterator make() processingBuffer, fudgeTimestamp, dims, - hasNoExplodingDimensions(columnSelectorFactory, query.getDimensions()), + allSingleValueDims, cardinalityForArrayAggregation ); } else { @@ -247,7 +258,7 @@ public GroupByEngineIterator make() processingBuffer, fudgeTimestamp, dims, - hasNoExplodingDimensions(columnSelectorFactory, query.getDimensions()) + allSingleValueDims ); } } diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java index 5b89eb3a85b2..ae56cbb07caf 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java @@ -1310,6 +1310,35 @@ public void testMultiValueDimension() TestHelper.assertExpectedObjects(expectedResults, results, "multi-value-dim"); } + @Test + public void testMultiValueDimensionNotAllowed() + { + + if (config.getDefaultStrategy().equals(GroupByStrategySelector.STRATEGY_V1)) { + expectedException.expect(UnsupportedOperationException.class); + expectedException.expectMessage(String.format( + "GroupBy v1 does not support %s as false", + GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING + )); + } else if (!vectorize) { + expectedException.expect(RuntimeException.class); + expectedException.expectMessage("Group by on multi value columns not allowed"); + } else { + cannotVectorize(); + } + + GroupByQuery query = makeQueryBuilder() + .setDataSource(QueryRunnerTestHelper.DATA_SOURCE) + .setQuerySegmentSpec(QueryRunnerTestHelper.FIRST_TO_THIRD) + .setDimensions(new DefaultDimensionSpec("placementish", "alias")) + .setAggregatorSpecs(QueryRunnerTestHelper.ROWS_COUNT, new LongSumAggregatorFactory("idx", "index")) + .setGranularity(QueryRunnerTestHelper.ALL_GRAN) + .overrideContext(ImmutableMap.of(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, false)) + .build(); + + GroupByQueryRunnerTestHelper.runQuery(factory, runner, query); + } + @Test public void testMultiValueDimensionAsArray() { diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java index e7203331ccc1..bd1ccd6bb2dd 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java @@ -32,6 +32,7 @@ import org.apache.druid.query.filter.LikeDimFilter; import org.apache.druid.query.filter.SelectorDimFilter; import org.apache.druid.query.groupby.GroupByQuery; +import org.apache.druid.query.groupby.GroupByQueryConfig; import org.apache.druid.query.groupby.orderby.DefaultLimitSpec; import org.apache.druid.query.groupby.orderby.OrderByColumnSpec; import org.apache.druid.query.ordering.StringComparators; @@ -43,7 +44,9 @@ import org.apache.druid.sql.calcite.util.CalciteTests; import org.junit.Test; +import java.util.HashMap; import java.util.List; +import java.util.Map; public class CalciteMultiValueStringQueryTest extends BaseCalciteQueryTest { @@ -53,7 +56,8 @@ public void testMultiValueStringWorksLikeStringGroupBy() throws Exception { // Cannot vectorize due to usage of expressions. cannotVectorize(); - + Map groupByOnMultiValueColumnEnabled = new HashMap<>(QUERY_CONTEXT_DEFAULT); + groupByOnMultiValueColumnEnabled.put(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, true); List expected; if (NullHandling.replaceWithDefault()) { expected = ImmutableList.of( @@ -76,6 +80,7 @@ public void testMultiValueStringWorksLikeStringGroupBy() throws Exception } testQuery( "SELECT concat(dim3, 'foo'), SUM(cnt) FROM druid.numfoo GROUP BY 1 ORDER BY 2 DESC", + groupByOnMultiValueColumnEnabled, ImmutableList.of( GroupByQuery.builder() .setDataSource(CalciteTests.DATASOURCE3) @@ -103,6 +108,24 @@ public void testMultiValueStringWorksLikeStringGroupBy() throws Exception ); } + @Test + public void testMultiValueStringGroupByDoesNotWork() throws Exception + { + // Cannot vectorize due to usage of expressions. + cannotVectorize(); + Map groupByOnMultiValueColumnDisabled = new HashMap<>(QUERY_CONTEXT_DEFAULT); + groupByOnMultiValueColumnDisabled.put(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, false); + testQueryThrows( + "SELECT concat(dim3, 'foo'), SUM(cnt) FROM druid.numfoo GROUP BY 1 ORDER BY 2 DESC", + groupByOnMultiValueColumnDisabled, + ImmutableList.of(), + exception -> { + exception.expect(RuntimeException.class); + exception.expectMessage("Group by on multi value columns not allowed"); + } + ); + } + @Test public void testMultiValueStringWorksLikeStringGroupByWithFilter() throws Exception { From f4f07408ae8d0f1a9c67cfe435bf053365b8b483 Mon Sep 17 00:00:00 2001 From: cryptoe Date: Fri, 11 Feb 2022 15:07:30 +0530 Subject: [PATCH 2/5] Addressing review comments and doc changes --- docs/querying/multi-value-dimensions.md | 5 ++ .../query/groupby/GroupByQueryConfig.java | 2 +- .../query/groupby/GroupByQueryEngine.java | 5 +- .../epinephelinae/GroupByQueryEngineV2.java | 56 ++++++++++++------- .../query/groupby/GroupByQueryRunnerTest.java | 10 +++- .../CalciteMultiValueStringQueryTest.java | 9 ++- 6 files changed, 62 insertions(+), 25 deletions(-) diff --git a/docs/querying/multi-value-dimensions.md b/docs/querying/multi-value-dimensions.md index b33a2959795c..44101b19867f 100644 --- a/docs/querying/multi-value-dimensions.md +++ b/docs/querying/multi-value-dimensions.md @@ -375,3 +375,8 @@ This query returns the following result: Note that, for groupBy queries, you could get similar result with a [having spec](having.md) but using a filtered `dimensionSpec` is much more efficient because that gets applied at the lowest level in the query processing pipeline. Having specs are applied at the outermost level of groupBy query processing. + +## Disable GroupBy on multivalue columns + +As grouping on multivalue columns causes implicit unnest, users can avoid this behaviour by setting +`groupByEnableMultiValueUnnesting` in the query context to `false`. This will result the query to error out. \ No newline at end of file diff --git a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryConfig.java b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryConfig.java index 276dabd1758d..5e32e88f2303 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryConfig.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryConfig.java @@ -253,7 +253,7 @@ public GroupByQueryConfig withOverrides(final GroupByQuery query) getNumParallelCombineThreads() ); newConfig.vectorize = query.getContextBoolean(QueryContexts.VECTORIZE_KEY, isVectorize()); - newConfig.enableMultiValueUnnesting = query.getContextValue( + newConfig.enableMultiValueUnnesting = query.getContextBoolean( CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, isMultiValueUnnestingEnabled() ); diff --git a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java index a1fdf7051497..2ec557ce45dc 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java @@ -32,6 +32,7 @@ import org.apache.druid.guice.annotations.Global; import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.ISE; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.guava.BaseSequence; import org.apache.druid.java.util.common.guava.FunctionalIterator; import org.apache.druid.java.util.common.guava.Sequence; @@ -92,8 +93,8 @@ public Sequence process(final GroupByQuery query, final StorageAdapter stor ); } if (!query.getContextValue(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, true)) { - throw new UnsupportedOperationException(String.format( - "GroupBy v1 does not support %s as false. Set %s to true", + throw new UnsupportedOperationException(StringUtils.format( + "GroupBy v1 does not support %s as false. Set %s to true or use groupBy v2", GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING )); diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java index e7a9b1d62bee..a055c34b273d 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java @@ -80,6 +80,7 @@ import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; +import java.util.stream.Collectors; import java.util.stream.Stream; /** @@ -228,13 +229,19 @@ public GroupByEngineIterator make() processingBuffer ); - final boolean allSingleValueDims = hasNoExplodingDimensions(columnSelectorFactory, - query.getDimensions()); + final List explodingDimensions = findAllProbableExplodingDimensions( + columnSelectorFactory, + query.getDimensions() + ); + + final boolean allSingleValueDims = explodingDimensions.size() == 0; if (!(query.getContextValue(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, true)) && !allSingleValueDims) { throw new ISE( - "Group by on multi value columns not allowed." - + " Please set %s in query context to true", + "Encountered multi-value dimensions %s that cannot be processed with %s set to false." + + " Consider changing these dimensions to arrays or setting %s to true.", + explodingDimensions.toString(), + GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY, GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY ); } @@ -338,35 +345,46 @@ public static int getCardinalityForArrayAggregation( } /** - * Checks whether all "dimensions" are either single-valued, or if the input column or output dimension spec has - * specified a type that {@link ColumnType#isArray()}. Both cases indicate we don't want to explode the under-lying - * multi value column. Since selectors on non-existent columns will show up as full of nulls, they are effectively - * single valued, however capabilites on columns can also be null, for example during broker merge with an 'inline' - * datasource subquery, so we only return true from this method when capabilities are fully known. + * Returns all dimension names that are or could be multi valued, or if the input column specified a type that + * {@link ColumnType#isArray()}. Both cases indicate we want to explode the under-lying multi value column. Since + * selectors on non-existent columns will show up as full of nulls, they are effectively single valued, however + * capabilites on columns can also be null, for example during broker merge with an 'inline' datasource subquery. We + * mark columns with null capabilites as candidates for explosion. */ - public static boolean hasNoExplodingDimensions( + public static List findAllProbableExplodingDimensions( final ColumnInspector inspector, final List dimensions ) { return dimensions .stream() - .allMatch( + .filter( dimension -> { if (dimension.mustDecorate()) { // DimensionSpecs that decorate may turn singly-valued columns into multi-valued selectors. - // To be safe, we must return false here. + // To be safe, we must return true here. + return true; + } + + // DimensionSpecs of type arrays do not explode + if (dimension.getOutputType().isArray()) { return false; } - // Now check column capabilities, which must be present and explicitly not multi-valued and not arrays + // Now check column capabilities, which if present may be multi-valued or explicitly arrays final ColumnCapabilities columnCapabilities = inspector.getColumnCapabilities(dimension.getDimension()); - return dimension.getOutputType().isArray() - || (columnCapabilities != null - && columnCapabilities.hasMultipleValues().isFalse() - && !columnCapabilities.isArray() - ); - }); + + // if the column capabilites are null then the col might be multi value + if (columnCapabilities == null) { + return true; + } else if (columnCapabilities.hasMultipleValues().isMaybeTrue()) { + return true; + } else if (columnCapabilities.isArray()) { + return true; + } else { + return false; + } + }).map(dimension -> dimension.getDimension()).collect(Collectors.toList()); } public static void convertRowTypesToOutputTypes( diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java index ae56cbb07caf..4889e2a55bb6 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java @@ -1316,13 +1316,19 @@ public void testMultiValueDimensionNotAllowed() if (config.getDefaultStrategy().equals(GroupByStrategySelector.STRATEGY_V1)) { expectedException.expect(UnsupportedOperationException.class); - expectedException.expectMessage(String.format( + expectedException.expectMessage(StringUtils.format( "GroupBy v1 does not support %s as false", GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING )); } else if (!vectorize) { expectedException.expect(RuntimeException.class); - expectedException.expectMessage("Group by on multi value columns not allowed"); + expectedException.expectMessage(StringUtils.format( + "Encountered multi-value dimensions %s that cannot be processed with %s set to false." + + " Consider changing these dimensions to arrays or setting %s to true.", + ImmutableList.of("placementish").toString(), + GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY, + GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY + )); } else { cannotVectorize(); } diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java index bd1ccd6bb2dd..4f268fbe6251 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java @@ -22,6 +22,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import org.apache.druid.common.config.NullHandling; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.math.expr.ExpressionProcessing; import org.apache.druid.query.Druids; @@ -121,7 +122,13 @@ public void testMultiValueStringGroupByDoesNotWork() throws Exception ImmutableList.of(), exception -> { exception.expect(RuntimeException.class); - exception.expectMessage("Group by on multi value columns not allowed"); + exception.expectMessage(StringUtils.format( + "Encountered multi-value dimensions %s that cannot be processed with %s set to false." + + " Consider changing these dimensions to arrays or setting %s to true.", + "[v0]", + GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY, + GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY + )); } ); } From 26308747a2fa5036eeb213656be7786036a29ab1 Mon Sep 17 00:00:00 2001 From: cryptoe Date: Tue, 15 Feb 2022 00:07:26 +0530 Subject: [PATCH 3/5] Addressing review comments and doc changes --- docs/querying/groupbyquery.md | 1 + docs/querying/multi-value-dimensions.md | 6 +- .../epinephelinae/GroupByQueryEngineV2.java | 94 +++++++++---------- .../query/groupby/GroupByQueryRunnerTest.java | 6 +- .../CalciteMultiValueStringQueryTest.java | 8 +- 5 files changed, 58 insertions(+), 57 deletions(-) diff --git a/docs/querying/groupbyquery.md b/docs/querying/groupbyquery.md index 6128b2bcbc78..79250e593c93 100644 --- a/docs/querying/groupbyquery.md +++ b/docs/querying/groupbyquery.md @@ -436,6 +436,7 @@ Supported query contexts: |`sortByDimsFirst`|Sort the results first by dimension values and then by timestamp.|false| |`forceLimitPushDown`|When all fields in the orderby are part of the grouping key, the Broker will push limit application down to the Historical processes. When the sorting order uses fields that are not in the grouping key, applying this optimization can result in approximate results with unknown accuracy, so this optimization is disabled by default in that case. Enabling this context flag turns on limit push down for limit/orderbys that contain non-grouping key columns.|false| |`applyLimitPushDownToSegment`|If Broker pushes limit down to queryable nodes (historicals, peons) then limit results during segment scan. This context value can be used to override `druid.query.groupBy.applyLimitPushDownToSegment`.|true| +|`groupByEnableMultiValueUnnesting`|Safety flag to enable/disable the implicit unnesting on multi value column's as part of the grouping key. 'true' indicates multi-value grouping keys are unnested. 'false' returns an error if a multi value column is found as part of the grouping key.|true| #### GroupBy v1 configurations diff --git a/docs/querying/multi-value-dimensions.md b/docs/querying/multi-value-dimensions.md index 44101b19867f..2e5e360dabd3 100644 --- a/docs/querying/multi-value-dimensions.md +++ b/docs/querying/multi-value-dimensions.md @@ -378,5 +378,7 @@ Having specs are applied at the outermost level of groupBy query processing. ## Disable GroupBy on multivalue columns -As grouping on multivalue columns causes implicit unnest, users can avoid this behaviour by setting -`groupByEnableMultiValueUnnesting` in the query context to `false`. This will result the query to error out. \ No newline at end of file +You can disable the implicit unnesting behavior for groupBy by setting groupByEnableMultiValueUnnesting: false in your +query context. In this mode, the groupBy engine will return an error instead of completing the query. This is a safety +feature for situations where you believe that all dimensions are singly-valued and want the engine to reject any +multi-valued dimensions that were inadvertently included. \ No newline at end of file diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java index a055c34b273d..a3be85c40ed3 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java @@ -80,7 +80,6 @@ import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; -import java.util.stream.Collectors; import java.util.stream.Stream; /** @@ -229,23 +228,6 @@ public GroupByEngineIterator make() processingBuffer ); - final List explodingDimensions = findAllProbableExplodingDimensions( - columnSelectorFactory, - query.getDimensions() - ); - - final boolean allSingleValueDims = explodingDimensions.size() == 0; - if (!(query.getContextValue(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, true)) - && !allSingleValueDims) { - throw new ISE( - "Encountered multi-value dimensions %s that cannot be processed with %s set to false." - + " Consider changing these dimensions to arrays or setting %s to true.", - explodingDimensions.toString(), - GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY, - GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY - ); - } - if (cardinalityForArrayAggregation >= 0) { return new ArrayAggregateIterator( query, @@ -254,7 +236,7 @@ public GroupByEngineIterator make() processingBuffer, fudgeTimestamp, dims, - allSingleValueDims, + hasNoExplodingDimensions(columnSelectorFactory, query.getDimensions()), cardinalityForArrayAggregation ); } else { @@ -265,7 +247,7 @@ public GroupByEngineIterator make() processingBuffer, fudgeTimestamp, dims, - allSingleValueDims + hasNoExplodingDimensions(columnSelectorFactory, query.getDimensions()) ); } } @@ -345,46 +327,35 @@ public static int getCardinalityForArrayAggregation( } /** - * Returns all dimension names that are or could be multi valued, or if the input column specified a type that - * {@link ColumnType#isArray()}. Both cases indicate we want to explode the under-lying multi value column. Since - * selectors on non-existent columns will show up as full of nulls, they are effectively single valued, however - * capabilites on columns can also be null, for example during broker merge with an 'inline' datasource subquery. We - * mark columns with null capabilites as candidates for explosion. + * Checks whether all "dimensions" are either single-valued, or if the input column or output dimension spec has + * specified a type that {@link ColumnType#isArray()}. Both cases indicate we don't want to explode the under-lying + * multi value column. Since selectors on non-existent columns will show up as full of nulls, they are effectively + * single valued, however capabilites on columns can also be null, for example during broker merge with an 'inline' + * datasource subquery, so we only return true from this method when capabilities are fully known. */ - public static List findAllProbableExplodingDimensions( + public static boolean hasNoExplodingDimensions( final ColumnInspector inspector, final List dimensions ) { return dimensions .stream() - .filter( + .allMatch( dimension -> { if (dimension.mustDecorate()) { // DimensionSpecs that decorate may turn singly-valued columns into multi-valued selectors. - // To be safe, we must return true here. - return true; - } - - // DimensionSpecs of type arrays do not explode - if (dimension.getOutputType().isArray()) { + // To be safe, we must return false here. return false; } - // Now check column capabilities, which if present may be multi-valued or explicitly arrays + // Now check column capabilities, which must be present and explicitly not multi-valued and not arrays final ColumnCapabilities columnCapabilities = inspector.getColumnCapabilities(dimension.getDimension()); - - // if the column capabilites are null then the col might be multi value - if (columnCapabilities == null) { - return true; - } else if (columnCapabilities.hasMultipleValues().isMaybeTrue()) { - return true; - } else if (columnCapabilities.isArray()) { - return true; - } else { - return false; - } - }).map(dimension -> dimension.getDimension()).collect(Collectors.toList()); + return dimension.getOutputType().isArray() + || (columnCapabilities != null + && columnCapabilities.hasMultipleValues().isFalse() + && !columnCapabilities.isArray() + ); + }); } public static void convertRowTypesToOutputTypes( @@ -488,6 +459,8 @@ private abstract static class GroupByEngineIterator implements Iterator @Nullable protected CloseableGrouperIterator delegate = null; protected final boolean allSingleValueDims; + protected final boolean allowMultiValueGrouping; + public GroupByEngineIterator( final GroupByQuery query, @@ -509,6 +482,10 @@ public GroupByEngineIterator( // Time is the same for every row in the cursor this.timestamp = fudgeTimestamp != null ? fudgeTimestamp : cursor.getTime(); this.allSingleValueDims = allSingleValueDims; + this.allowMultiValueGrouping = query.getContextBoolean( + GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, + true + ); } private CloseableGrouperIterator initNewDelegate() @@ -622,6 +599,19 @@ protected int getSingleValue(IndexedInts indexedInts) return indexedInts.size() == 1 ? indexedInts.get(0) : GroupByColumnSelectorStrategy.GROUP_BY_MISSING_VALUE; } + protected void checkIfMultiValueGroupingIsAllowed(String dimName) + { + if (!allowMultiValueGrouping) { + throw new ISE( + "Encountered multi-value dimension %s that cannot be processed with %s set to false." + + " Consider setting %s to true.", + dimName, + GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY, + GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY + ); + } + } + } private static class HashAggregateIterator extends GroupByEngineIterator @@ -793,6 +783,9 @@ protected void aggregateMultiValueDims(Grouper grouper) ); if (doAggregate) { + // this check is done during the row aggregation as a dimension can become multi-value col if + // {@link org.apache.druid.segment.column.ColumnCapabilities} is unkown. + checkIfMultiValueGroupingIsAllowed(dims[stackPointer].getName()); stack[stackPointer]++; for (int i = stackPointer + 1; i < stack.length; i++) { dims[i].getColumnSelectorStrategy().initGroupingKeyColumnValue( @@ -918,12 +911,17 @@ private void aggregateMultiValueDims(IntGrouper grouper) } while (!cursor.isDone()) { - int multiValuesSize = multiValues.size(); + final int multiValuesSize = multiValues.size(); if (multiValuesSize == 0) { if (!grouper.aggregate(GroupByColumnSelectorStrategy.GROUP_BY_MISSING_VALUE).isOk()) { return; } } else { + if (multiValuesSize > 1) { + // this check is done during the row aggregation as a dimension can become multi-value col if + // {@link org.apache.druid.segment.column.ColumnCapabilities} is unkown. + checkIfMultiValueGroupingIsAllowed(dim.getName()); + } for (; nextValIndex < multiValuesSize; nextValIndex++) { if (!grouper.aggregate(multiValues.get(nextValIndex)).isOk()) { return; @@ -1060,4 +1058,4 @@ public void reset() // No state, nothing to reset } } -} +} \ No newline at end of file diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java index 4889e2a55bb6..a5053825a56f 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java @@ -1323,9 +1323,9 @@ public void testMultiValueDimensionNotAllowed() } else if (!vectorize) { expectedException.expect(RuntimeException.class); expectedException.expectMessage(StringUtils.format( - "Encountered multi-value dimensions %s that cannot be processed with %s set to false." - + " Consider changing these dimensions to arrays or setting %s to true.", - ImmutableList.of("placementish").toString(), + "Encountered multi-value dimension %s that cannot be processed with %s set to false." + + " Consider setting %s to true.", + "placementish", GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY, GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY )); diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java index 4f268fbe6251..b28f418e9643 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java @@ -122,10 +122,10 @@ public void testMultiValueStringGroupByDoesNotWork() throws Exception ImmutableList.of(), exception -> { exception.expect(RuntimeException.class); - exception.expectMessage(StringUtils.format( - "Encountered multi-value dimensions %s that cannot be processed with %s set to false." - + " Consider changing these dimensions to arrays or setting %s to true.", - "[v0]", + expectedException.expectMessage(StringUtils.format( + "Encountered multi-value dimension %s that cannot be processed with %s set to false." + + " Consider setting %s to true.", + "v0", GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY, GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY )); From dc09ce0453867b59f41f6b1c3de6abbbf5c07979 Mon Sep 17 00:00:00 2001 From: cryptoe Date: Wed, 16 Feb 2022 10:12:27 +0530 Subject: [PATCH 4/5] Fixing static checks --- docs/querying/multi-value-dimensions.md | 2 +- .../org/apache/druid/query/groupby/GroupByQueryEngine.java | 6 +++--- .../query/groupby/epinephelinae/GroupByQueryEngineV2.java | 2 +- .../apache/druid/query/groupby/GroupByQueryRunnerTest.java | 3 ++- website/.spelling | 4 ++++ 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/docs/querying/multi-value-dimensions.md b/docs/querying/multi-value-dimensions.md index 2e5e360dabd3..2ecf4f75846d 100644 --- a/docs/querying/multi-value-dimensions.md +++ b/docs/querying/multi-value-dimensions.md @@ -376,7 +376,7 @@ Note that, for groupBy queries, you could get similar result with a [having spec `dimensionSpec` is much more efficient because that gets applied at the lowest level in the query processing pipeline. Having specs are applied at the outermost level of groupBy query processing. -## Disable GroupBy on multivalue columns +## Disable GroupBy on multi-value columns You can disable the implicit unnesting behavior for groupBy by setting groupByEnableMultiValueUnnesting: false in your query context. In this mode, the groupBy engine will return an error instead of completing the query. This is a safety diff --git a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java index 2ec557ce45dc..62a82dc84413 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java @@ -32,7 +32,7 @@ import org.apache.druid.guice.annotations.Global; import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.ISE; -import org.apache.druid.java.util.common.StringUtils; +import org.apache.druid.java.util.common.UOE; import org.apache.druid.java.util.common.guava.BaseSequence; import org.apache.druid.java.util.common.guava.FunctionalIterator; import org.apache.druid.java.util.common.guava.Sequence; @@ -93,11 +93,11 @@ public Sequence process(final GroupByQuery query, final StorageAdapter stor ); } if (!query.getContextValue(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, true)) { - throw new UnsupportedOperationException(StringUtils.format( + throw new UOE( "GroupBy v1 does not support %s as false. Set %s to true or use groupBy v2", GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING - )); + ); } final List intervals = query.getQuerySegmentSpec().getIntervals(); if (intervals.size() != 1) { diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java index a3be85c40ed3..ddc2c4a8cb9e 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java @@ -1058,4 +1058,4 @@ public void reset() // No state, nothing to reset } } -} \ No newline at end of file +} diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java index a5053825a56f..4b958d3c45e9 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java @@ -40,6 +40,7 @@ import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.Pair; import org.apache.druid.java.util.common.StringUtils; +import org.apache.druid.java.util.common.UOE; import org.apache.druid.java.util.common.concurrent.Execs; import org.apache.druid.java.util.common.granularity.DurationGranularity; import org.apache.druid.java.util.common.granularity.Granularities; @@ -1315,7 +1316,7 @@ public void testMultiValueDimensionNotAllowed() { if (config.getDefaultStrategy().equals(GroupByStrategySelector.STRATEGY_V1)) { - expectedException.expect(UnsupportedOperationException.class); + expectedException.expect(UOE.class); expectedException.expectMessage(StringUtils.format( "GroupBy v1 does not support %s as false", GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING diff --git a/website/.spelling b/website/.spelling index 54fb7d01a667..4a24183081bb 100644 --- a/website/.spelling +++ b/website/.spelling @@ -1472,6 +1472,8 @@ outputName pushdown row1 subtotalsSpec +unnested +unnesting - ../docs/querying/having.md HavingSpec HavingSpecs @@ -1502,6 +1504,8 @@ row4 t3 t4 t5 +groupByEnableMultiValueUnnesting +unnesting - ../docs/querying/multitenancy.md 500ms tenant_id From e8916466f39d62a9a21bf228a4df1864ad3a4b8c Mon Sep 17 00:00:00 2001 From: cryptoe Date: Wed, 16 Feb 2022 12:31:15 +0530 Subject: [PATCH 5/5] Fixing typos --- .../query/groupby/epinephelinae/GroupByQueryEngineV2.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java index ddc2c4a8cb9e..f38f6ce2c104 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java @@ -783,8 +783,8 @@ protected void aggregateMultiValueDims(Grouper grouper) ); if (doAggregate) { - // this check is done during the row aggregation as a dimension can become multi-value col if - // {@link org.apache.druid.segment.column.ColumnCapabilities} is unkown. + // this check is done during the row aggregation as a dimension can become multi-value col if column + // capabilities is unknown. checkIfMultiValueGroupingIsAllowed(dims[stackPointer].getName()); stack[stackPointer]++; for (int i = stackPointer + 1; i < stack.length; i++) { @@ -918,8 +918,8 @@ private void aggregateMultiValueDims(IntGrouper grouper) } } else { if (multiValuesSize > 1) { - // this check is done during the row aggregation as a dimension can become multi-value col if - // {@link org.apache.druid.segment.column.ColumnCapabilities} is unkown. + // this check is done during the row aggregation as a dimension can become multi-value col if column + // capabilities is unknown. checkIfMultiValueGroupingIsAllowed(dim.getName()); } for (; nextValIndex < multiValuesSize; nextValIndex++) {