From 86c7ba6d934dccab3f97330dfa79e80be19c548e Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Tue, 2 Jul 2019 15:45:05 -0700 Subject: [PATCH 1/3] GroupBy: Fix improper uses of StorageAdapter#getColumnCapabilities. 1) A usage in "isArrayAggregateApplicable" that would potentially incorrectly use array-based aggregation on a virtual column that shadows a real column. 2) A usage in "process" that would potentially use the more expensive multi-value aggregation path on a singly-valued virtual column. (No correctness issue, but a performance issue.) --- .../epinephelinae/GroupByQueryEngineV2.java | 55 +++++++++++++------ 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java index e3c609e22c83..7594b9048c4f 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java @@ -48,11 +48,13 @@ import org.apache.druid.query.groupby.epinephelinae.column.NullableValueGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.StringGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.strategy.GroupByStrategyV2; +import org.apache.druid.segment.ColumnSelectorFactory; import org.apache.druid.segment.ColumnValueSelector; import org.apache.druid.segment.Cursor; import org.apache.druid.segment.DimensionHandlerUtils; import org.apache.druid.segment.DimensionSelector; import org.apache.druid.segment.StorageAdapter; +import org.apache.druid.segment.VirtualColumns; import org.apache.druid.segment.column.ColumnCapabilities; import org.apache.druid.segment.column.ValueType; import org.apache.druid.segment.data.IndexedInts; @@ -115,14 +117,6 @@ public static Sequence process( null ); - final boolean allSingleValueDims = query - .getDimensions() - .stream() - .allMatch(dimension -> { - final ColumnCapabilities columnCapabilities = storageAdapter.getColumnCapabilities(dimension.getDimension()); - return columnCapabilities != null && !columnCapabilities.hasMultipleValues(); - }); - final ResourceHolder bufferHolder = intermediateResultsBufferPool.take(); final String fudgeTimestampString = NullHandling.emptyToNullIfNeeded( @@ -140,18 +134,38 @@ public static Sequence process( @Override public GroupByEngineIterator make() { + final ColumnSelectorFactory columnSelectorFactory = cursor.getColumnSelectorFactory(); + final boolean allSingleValueDims = query + .getDimensions() + .stream() + .allMatch(dimension -> { + final ColumnCapabilities columnCapabilities = columnSelectorFactory.getColumnCapabilities( + dimension.getDimension() + ); + return columnCapabilities != null && !columnCapabilities.hasMultipleValues(); + }); + ColumnSelectorPlus[] selectorPlus = DimensionHandlerUtils .createColumnSelectorPluses( STRATEGY_FACTORY, query.getDimensions(), - cursor.getColumnSelectorFactory() + columnSelectorFactory ); GroupByColumnSelectorPlus[] dims = createGroupBySelectorPlus(selectorPlus); final ByteBuffer buffer = bufferHolder.get(); - // Check array-based aggregation is applicable - if (isArrayAggregateApplicable(querySpecificConfig, query, dims, storageAdapter, buffer)) { + // Check if array-based aggregation is applicable + final boolean useArrayAggregation = isArrayAggregateApplicable( + querySpecificConfig, + query, + dims, + storageAdapter, + query.getVirtualColumns(), + buffer + ); + + if (useArrayAggregation) { return new ArrayAggregateIterator( query, querySpecificConfig, @@ -191,6 +205,7 @@ private static boolean isArrayAggregateApplicable( GroupByQuery query, GroupByColumnSelectorPlus[] dims, StorageAdapter storageAdapter, + VirtualColumns virtualColumns, ByteBuffer buffer ) { @@ -206,11 +221,19 @@ private static boolean isArrayAggregateApplicable( columnCapabilities = null; cardinality = 1; } else if (dims.length == 1) { + // Only real columns can use array-based aggregation, since virtual columns cannot currently report their + // cardinality. We need to check if a virtual column exists with the same name, since virtual columns can shadow + // real columns, and we might miss that since we're going directly to the StorageAdapter (which only knows about + // real columns). + if (virtualColumns.exists(dims[0].getName())) { + return false; + } + columnCapabilities = storageAdapter.getColumnCapabilities(dims[0].getName()); cardinality = storageAdapter.getDimensionCardinality(dims[0].getName()); } else { - columnCapabilities = null; - cardinality = -1; // ArrayAggregateIterator is not available + // Cannot use array-based aggregation with more than one dimension. + return false; } // Choose array-based aggregation if the grouping key is a single string dimension of a @@ -225,11 +248,11 @@ private static boolean isArrayAggregateApplicable( aggregatorFactories ); - // Check that all keys and aggregated values can be contained the buffer + // Check that all keys and aggregated values can be contained in the buffer return requiredBufferCapacity <= buffer.capacity(); + } else { + return false; } - - return false; } private static class GroupByStrategyFactory implements ColumnSelectorStrategyFactory From eb334482c34119d401c5cce669ba76912b01ad5c Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Tue, 2 Jul 2019 15:58:08 -0700 Subject: [PATCH 2/3] Add addl javadoc. --- .../main/java/org/apache/druid/segment/StorageAdapter.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/processing/src/main/java/org/apache/druid/segment/StorageAdapter.java b/processing/src/main/java/org/apache/druid/segment/StorageAdapter.java index 3319d09c0075..1dfc00ec12c9 100644 --- a/processing/src/main/java/org/apache/druid/segment/StorageAdapter.java +++ b/processing/src/main/java/org/apache/druid/segment/StorageAdapter.java @@ -56,6 +56,11 @@ public interface StorageAdapter extends CursorFactory * the column does exist but the capabilities are unknown. The latter is possible with dynamically discovered * columns. * + * Note that StorageAdapters are representations of "real" segments, so they are not aware of any virtual columns + * that may be involved in a query. In general, query engines should instead use the method + * {@link ColumnSelectorFactory#getColumnCapabilities(String)}, which returns capabilities for virtual columns as + * well. + * * @param column column name * * @return capabilities, or null From 0be85139e846ec2cfb020d0ded0dad8759dd31c0 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Thu, 4 Jul 2019 13:51:27 -0700 Subject: [PATCH 3/3] ExpressionVirtualColumn: Set multi-value flag. --- .../druid/segment/virtual/ExpressionVirtualColumn.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/processing/src/main/java/org/apache/druid/segment/virtual/ExpressionVirtualColumn.java b/processing/src/main/java/org/apache/druid/segment/virtual/ExpressionVirtualColumn.java index d71ac28f7db2..c5780b79c67c 100644 --- a/processing/src/main/java/org/apache/druid/segment/virtual/ExpressionVirtualColumn.java +++ b/processing/src/main/java/org/apache/druid/segment/virtual/ExpressionVirtualColumn.java @@ -105,7 +105,10 @@ public ColumnValueSelector makeColumnValueSelector(String columnName, ColumnS @Override public ColumnCapabilities capabilities(String columnName) { - return new ColumnCapabilitiesImpl().setType(outputType); + // Note: Ideally we would only "setHasMultipleValues(true)" if the expression in question could potentially return + // multiple values. However, we don't currently have a good way of determining this, so to be safe we always + // set the flag. + return new ColumnCapabilitiesImpl().setType(outputType).setHasMultipleValues(true); } @Override