apache · gianm · Feb 26, 2020 · Jan 3, 2020 · Jan 3, 2020 · Feb 21, 2020
diff --git a/docs/querying/sql.md b/docs/querying/sql.md
@@ -65,7 +65,7 @@ Druid SQL supports SELECT queries with the following structure:
 SELECT [ ALL | DISTINCT ] { * | exprs }
 FROM table
 [ WHERE expr ]
-[ GROUP BY exprs ]
+[ GROUP BY [ exprs | GROUPING SETS ( (exprs), ... ) | ROLLUP (exprs) | CUBE (exprs) ] ]
-[ GROUP BY [ exprs | GROUPING SETS ( (exprs), ... ) | ROLLUP (exprs) | CUBE (exprs) ] ]
+[ GROUP BY [ exprs | GROUPING SETS ( \(exprs\), ... ) | ROLLUP \(exprs\) | CUBE \(exprs\) ] ]
-[ GROUP BY [ exprs | GROUPING SETS ( (exprs), ... ) | ROLLUP (exprs) | CUBE (exprs) ] ]
+[ GROUP BY [ exprs | GROUPING SETS ( \(exprs\), ... ) | ROLLUP \(exprs\) | CUBE \(exprs\) ] ]
 [ HAVING expr ]
 [ ORDER BY expr [ ASC | DESC ], expr [ ASC | DESC ], ... ]
 [ LIMIT limit ]
@@ -86,6 +86,22 @@ trigger an aggregation query using one of Druid's [three native aggregation quer
 can refer to an expression or a select clause ordinal position (like `GROUP BY 2` to group by the second selected
 column).
 
+The GROUP BY clause can also refer to multiple grouping sets in three ways. The most flexible is GROUP BY GROUPING SETS,
+for example `GROUP BY GROUPING SETS ( (country, city), () )`. This example is equivalent to a `GROUP BY country, city`
+followed by `GROUP BY ()` (a grand total). With GROUPING SETS, the underlying data is only scanned one time, leading to
+better efficiency. Second, GROUP BY ROLLUP computes a grouping set for each level of the grouping expressions. For
+example, `GROUP BY ROLLUP (country, city)` is equivalent to `GROUP BY GROUPING SETS ( (country, city), (country), () )`
+and will produce grouped rows for each country / city pair, along with subtotals for each country and a grand
+total. Finally, GROUP BY CUBE computes a grouping set for each combination of grouping expressions. For example,
+`GROUP BY CUBE (country, city)` is equivalent to `GROUP BY GROUPING SETS ( (country, city), (country), (city), () )`.
+Grouping columns that do not apply to a particular row will contain `NULL`. For example, when computing
+`GROUP BY GROUPING SETS ( (country, city), () )`, the grand total row corresponding to `()` will have `NULL` for the
+"country" and "city" columns.
+
+When using GROUP BY GROUPING SETS, GROUP BY ROLLUP, or GROUP BY CUBE, be aware that results may not be generated in the
-When using GROUP BY GROUPING SETS, GROUP BY ROLLUP, or GROUP BY CUBE, be aware that results may not be generated in the
+When using `GROUP BY GROUPING SETS`, `GROUP BY ROLLUP`, or `GROUP BY CUBE`, be aware that results may not be generated in the
-When using GROUP BY GROUPING SETS, GROUP BY ROLLUP, or GROUP BY CUBE, be aware that results may not be generated in the
+When using `GROUP BY GROUPING SETS`, `GROUP BY ROLLUP`, or `GROUP BY CUBE`, be aware that results may not be generated in the
+order that you specify your grouping sets in the query. If you need results to be generated in a particular order, use
+the ORDER BY clause.
+
 The HAVING clause refers to columns that are present after execution of GROUP BY. It can be used to filter on either
 grouping expressions or aggregated values. It can only be used together with GROUP BY.
 

diff --git a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQuery.java b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQuery.java
@@ -794,6 +794,11 @@ public GroupByQuery withQuerySegmentSpec(QuerySegmentSpec spec)
     return new Builder(this).setQuerySegmentSpec(spec).build();
   }
 
+  public GroupByQuery withVirtualColumns(final VirtualColumns virtualColumns)
+  {
+    return new Builder(this).setVirtualColumns(virtualColumns).build();
+  }
+
   public GroupByQuery withDimFilter(@Nullable final DimFilter dimFilter)
   {
     return new Builder(this).setDimFilter(dimFilter).build();
@@ -1198,6 +1203,7 @@ public String toString()
            ", dimensions=" + dimensions +
            ", aggregatorSpecs=" + aggregatorSpecs +
            ", postAggregatorSpecs=" + postAggregatorSpecs +
+           (subtotalsSpec != null ? (", subtotalsSpec=" + subtotalsSpec) : "") +
            ", havingSpec=" + havingSpec +
            ", context=" + getContext() +
            '}';
@@ -1222,7 +1228,8 @@ public boolean equals(final Object o)
            Objects.equals(dimFilter, that.dimFilter) &&
            Objects.equals(dimensions, that.dimensions) &&
            Objects.equals(aggregatorSpecs, that.aggregatorSpecs) &&
-           Objects.equals(postAggregatorSpecs, that.postAggregatorSpecs);
+           Objects.equals(postAggregatorSpecs, that.postAggregatorSpecs) &&
+           Objects.equals(subtotalsSpec, that.subtotalsSpec);
   }
 
   @Override
@@ -1236,7 +1243,8 @@ public int hashCode()
         dimFilter,
         dimensions,
         aggregatorSpecs,
-        postAggregatorSpecs
+        postAggregatorSpecs,
+        subtotalsSpec
     );
   }
 }
diff --git a/processing/src/main/java/org/apache/druid/query/groupby/strategy/GroupByStrategyV2.java b/processing/src/main/java/org/apache/druid/query/groupby/strategy/GroupByStrategyV2.java
@@ -67,6 +67,7 @@
 import org.apache.druid.query.groupby.resource.GroupByQueryResource;
 import org.apache.druid.query.spec.MultipleIntervalSegmentSpec;
 import org.apache.druid.segment.StorageAdapter;
+import org.apache.druid.segment.VirtualColumns;
 
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
@@ -355,7 +356,11 @@ public Sequence<ResultRow> processSubtotalsSpec(
     GroupByRowProcessor.ResultSupplier resultSupplierOne = null;
 
     try {
-      GroupByQuery queryWithoutSubtotalsSpec = query
+      // baseSubtotalQuery is the original query with dimensions and aggregators rewritten to apply to the *results*
+      // rather than *inputs* of that query. It has its virtual columns and dim filter removed, because those only
+      // make sense when applied to inputs. Finally, it has subtotalsSpec removed, since we'll be computing them
+      // one-by-one soon enough.
+      GroupByQuery baseSubtotalQuery = query
           .withDimensionSpecs(query.getDimensions().stream().map(
               dimSpec -> new DefaultDimensionSpec(
                   dimSpec.getOutputName(),
@@ -369,13 +374,13 @@ public Sequence<ResultRow> processSubtotalsSpec(
                    .map(AggregatorFactory::getCombiningFactory)
                    .collect(Collectors.toList())
           )
-          .withSubtotalsSpec(null)
-          .withDimFilter(null);
-
+          .withVirtualColumns(VirtualColumns.EMPTY)
+          .withDimFilter(null)
+          .withSubtotalsSpec(null);
 
       resultSupplierOne = GroupByRowProcessor.process(
-          queryWithoutSubtotalsSpec,
-          queryWithoutSubtotalsSpec,
+          baseSubtotalQuery,
+          baseSubtotalQuery,
           queryResult,
           configSupplier.get(),
           resource,
@@ -384,13 +389,13 @@ public Sequence<ResultRow> processSubtotalsSpec(
           processingConfig.intermediateComputeSizeBytes()
       );
 
-      List<String> queryDimNames = queryWithoutSubtotalsSpec.getDimensions().stream().map(DimensionSpec::getOutputName)
-                                                            .collect(Collectors.toList());
+      List<String> queryDimNames = baseSubtotalQuery.getDimensions().stream().map(DimensionSpec::getOutputName)
+                                                 .collect(Collectors.toList());
 
       // Only needed to make LimitSpec.filterColumns(..) call later in case base query has a non default LimitSpec.
       Set<String> aggsAndPostAggs = null;
-      if (queryWithoutSubtotalsSpec.getLimitSpec() != null && !(queryWithoutSubtotalsSpec.getLimitSpec() instanceof NoopLimitSpec)) {
-        aggsAndPostAggs = getAggregatorAndPostAggregatorNames(queryWithoutSubtotalsSpec);
+      if (!(baseSubtotalQuery.getLimitSpec() instanceof NoopLimitSpec)) {
+        aggsAndPostAggs = getAggregatorAndPostAggregatorNames(baseSubtotalQuery);
       }
 
       List<List<String>> subtotals = query.getSubtotalsSpec();
@@ -425,14 +430,14 @@ public Sequence<ResultRow> processSubtotalsSpec(
 
         // Create appropriate LimitSpec for subtotal query
         LimitSpec subtotalQueryLimitSpec = NoopLimitSpec.instance();
-        if (queryWithoutSubtotalsSpec.getLimitSpec() != null && !(queryWithoutSubtotalsSpec.getLimitSpec() instanceof NoopLimitSpec)) {
-          Set<String> columns = new HashSet(aggsAndPostAggs);
+        if (!(baseSubtotalQuery.getLimitSpec() instanceof NoopLimitSpec)) {
+          Set<String> columns = new HashSet<>(aggsAndPostAggs);
           columns.addAll(subtotalSpec);
 
-          subtotalQueryLimitSpec = queryWithoutSubtotalsSpec.getLimitSpec().filterColumns(columns);
+          subtotalQueryLimitSpec = baseSubtotalQuery.getLimitSpec().filterColumns(columns);
         }
 
-        GroupByQuery subtotalQuery = queryWithoutSubtotalsSpec
+        GroupByQuery subtotalQuery = baseSubtotalQuery
             .withLimitSpec(subtotalQueryLimitSpec)
             .withDimensionSpecs(newDimensions);
 
@@ -451,7 +456,7 @@ public Sequence<ResultRow> processSubtotalsSpec(
           // Also note, we can't create the ResultSupplier eagerly here, as we don't want to eagerly allocate
           // merge buffers for processing subtotal.
           Supplier<GroupByRowProcessor.ResultSupplier> resultSupplierTwo = () -> GroupByRowProcessor.process(
-              queryWithoutSubtotalsSpec,
+              baseSubtotalQuery,
               subtotalQuery,
               resultSupplierOneFinal.results(subtotalSpec),
               configSupplier.get(),
@@ -468,7 +473,7 @@ public Sequence<ResultRow> processSubtotalsSpec(
       }
 
       return Sequences.withBaggage(
-          Sequences.concat(subtotalsResults),
+          query.postProcess(Sequences.concat(subtotalsResults)),
           resultSupplierOne //this will close resources allocated by resultSupplierOne after sequence read
       );
     }
@@ -489,21 +494,17 @@ private Sequence<ResultRow> processSubtotalsResultAndOptionallyClose(
     // on sequence read if closeOnSequenceRead is true.
     try {
       Supplier<GroupByRowProcessor.ResultSupplier> memoizedSupplier = Suppliers.memoize(baseResultsSupplier);
-      return applyPostProcessing(
-          mergeResults(
-              (queryPlus, responseContext) ->
-                  new LazySequence<>(
-                      () -> Sequences.withBaggage(
-                          memoizedSupplier.get().results(dimsToInclude),
-                          closeOnSequenceRead ? () -> CloseQuietly.close(memoizedSupplier.get()) : () -> {}
-                      )
-                  ),
-              subtotalQuery,
-              null
-          ),
-          subtotalQuery
+      return mergeResults(
+          (queryPlus, responseContext) ->
+              new LazySequence<>(
+                  () -> Sequences.withBaggage(
+                      memoizedSupplier.get().results(dimsToInclude),
+                      closeOnSequenceRead ? () -> CloseQuietly.close(memoizedSupplier.get()) : () -> {}
+                  )
+              ),
+          subtotalQuery,
+          null
       );
-
     }
     catch (Exception ex) {
       CloseQuietly.close(baseResultsSupplier.get());

diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java
@@ -412,6 +412,8 @@ public ByteBuffer get()
   @Parameterized.Parameters(name = "{0}")
   public static Collection<Object[]> constructorFeeder()
   {
+    NullHandling.initializeForTests();
+
     final List<Object[]> constructors = new ArrayList<>();
     for (GroupByQueryConfig config : testConfigs()) {
       final Pair<GroupByQueryRunnerFactory, Closer> factoryAndCloser = makeQueryRunnerFactory(config);
@@ -7196,38 +7198,13 @@ public void testGroupByWithSubtotalsSpecWithOrderLimit()
         .addOrderByColumn("idx")
         .addOrderByColumn("alias")
         .addOrderByColumn("market")
-        .setLimit(1)
+        .setLimit(3)
         .build();
 
     List<ResultRow> expectedResults = Arrays.asList(
-        makeRow(
-            query,
-            "2011-04-01",
-            "alias",
-            "technology",
-            "rows",
-            1L,
-            "idx",
-            78L
-        ),
-        makeRow(
-            query,
-            "2011-04-01T00:00:00.000Z",
-            "market",
-            "spot",
-            "rows",
-            9L,
-            "idx",
-            1102L
-        ),
-        makeRow(
-            query,
-            "2011-04-01T00:00:00.000Z",
-            "rows",
-            13L,
-            "idx",
-            6619L
-        )
+        makeRow(query, "2011-04-01", "alias", "technology", "rows", 1L, "idx", 78L),
+        makeRow(query, "2011-04-01", "alias", "business", "rows", 1L, "idx", 118L),
+        makeRow(query, "2011-04-01", "alias", "travel", "rows", 1L, "idx", 119L)
     );
 
     Iterable<ResultRow> results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query);

diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryTest.java
@@ -22,6 +22,8 @@
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Ordering;
+import nl.jqno.equalsverifier.EqualsVerifier;
+import nl.jqno.equalsverifier.Warning;
 import org.apache.druid.java.util.common.Intervals;
 import org.apache.druid.java.util.common.granularity.Granularities;
 import org.apache.druid.query.BaseQuery;
@@ -121,4 +123,23 @@ public void testSegmentLookUpForNestedQueries()
         .build();
     Assert.assertEquals(innerQuerySegmentSpec, BaseQuery.getQuerySegmentSpecForLookUp(query));
   }
+
+  @Test
+  public void testEquals()
+  {
+    EqualsVerifier.forClass(GroupByQuery.class)
+                  .usingGetClass()
+                  // The 'duration' field is used by equals via getDuration(), which computes it lazily in a way
+                  // that confuses EqualsVerifier.
+                  .suppress(Warning.NULL_FIELDS, Warning.NONFINAL_FIELDS)
+                  // Fields derived from other fields are not included in equals/hashCode
+                  .withIgnoredFields(
+                      "applyLimitPushDown",
+                      "postProcessingFn",
+                      "resultRowOrder",
+                      "resultRowPositionLookup",
+                      "universalTimestamp"
+                  )
+                  .verify();
+  }
 }
diff --git a/sql/src/main/java/org/apache/druid/sql/calcite/planner/Rules.java b/sql/src/main/java/org/apache/druid/sql/calcite/planner/Rules.java
@@ -231,8 +231,8 @@ private static List<RelOptRule> baseRuleSet(final PlannerContext plannerContext)
     rules.addAll(SUB_QUERY_REMOVE_RULES);
 
     if (!plannerConfig.isUseApproximateCountDistinct()) {
-      // We'll need this to expand COUNT DISTINCTs.
-      // Avoid AggregateExpandDistinctAggregatesRule.INSTANCE; it uses grouping sets and we don't support those.
+      // For some reason, even though we support grouping sets, using AggregateExpandDistinctAggregatesRule.INSTANCE
+      // here causes CalciteQueryTest#testExactCountDistinctWithGroupingAndOtherAggregators to fail.
       rules.add(AggregateExpandDistinctAggregatesRule.JOIN);
     }