diff --git a/docs/content/querying/sql.md b/docs/content/querying/sql.md index c662b99d8524..f8b87758dc10 100644 --- a/docs/content/querying/sql.md +++ b/docs/content/querying/sql.md @@ -106,15 +106,16 @@ The following SQL queries and features may be executed using approximate algorit - `COUNT(DISTINCT col)` and `APPROX_COUNT_DISTINCT(col)` aggregations use [HyperLogLog](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf), a fast approximate distinct counting -algorithm. If you need exact distinct counts, you can instead use -`SELECT COUNT(*) FROM (SELECT DISTINCT col FROM data_source)`, which will use a slower and more resource intensive exact -algorithm. +algorithm. If you need exact distinct counts, set "useApproximateCountDistinct" to "false", either through query +context or through broker configuration. - TopN-style queries with a single grouping column, like `SELECT col1, SUM(col2) FROM data_source GROUP BY col1 ORDER BY SUM(col2) DESC LIMIT 100`, by default will be executed as [TopN queries](topnquery.html), which use an approximate algorithm. To disable this behavior, and use exact algorithms for topN-style queries, set "useApproximateTopN" to "false", either through query context or through broker configuration. +In both cases, the exact algorithms are generally slower and more resource intensive. + ### Time functions Druid's SQL language supports a number of time operations, including: diff --git a/sql/src/main/java/io/druid/sql/calcite/planner/Rules.java b/sql/src/main/java/io/druid/sql/calcite/planner/Rules.java index 475af34b085c..e7ee76209627 100644 --- a/sql/src/main/java/io/druid/sql/calcite/planner/Rules.java +++ b/sql/src/main/java/io/druid/sql/calcite/planner/Rules.java @@ -31,6 +31,7 @@ import org.apache.calcite.interpreter.Bindables; import org.apache.calcite.plan.RelOptRule; import org.apache.calcite.plan.volcano.AbstractConverter; +import org.apache.calcite.rel.rules.AggregateExpandDistinctAggregatesRule; import org.apache.calcite.rel.rules.AggregateJoinTransposeRule; import org.apache.calcite.rel.rules.AggregateProjectMergeRule; import org.apache.calcite.rel.rules.AggregateProjectPullUpConstantsRule; @@ -202,6 +203,12 @@ private static List baseRuleSet( rules.addAll(VOLCANO_ABSTRACT_RULES); rules.addAll(RELOPTUTIL_ABSTRACT_RULES); + if (!plannerConfig.isUseApproximateCountDistinct()) { + // We'll need this to expand COUNT DISTINCTs. + // Avoid AggregateExpandDistinctAggregatesRule.INSTANCE; it uses grouping sets and we don't support those. + rules.add(AggregateExpandDistinctAggregatesRule.JOIN); + } + if (plannerConfig.isUseFallback()) { rules.add(DruidRelToBindableRule.instance()); } diff --git a/sql/src/test/java/io/druid/sql/calcite/CalciteQueryTest.java b/sql/src/test/java/io/druid/sql/calcite/CalciteQueryTest.java index cc406ea60a75..1c742427bc57 100644 --- a/sql/src/test/java/io/druid/sql/calcite/CalciteQueryTest.java +++ b/sql/src/test/java/io/druid/sql/calcite/CalciteQueryTest.java @@ -122,6 +122,14 @@ public int getMaxTopNLimit() return 0; } }; + private static final PlannerConfig PLANNER_CONFIG_NO_HLL = new PlannerConfig() + { + @Override + public boolean isUseApproximateCountDistinct() + { + return false; + } + }; private static final PlannerConfig PLANNER_CONFIG_SELECT_PAGING = new PlannerConfig() { @Override @@ -721,6 +729,22 @@ public void testUnplannableQueries() throws Exception } } + @Test + public void testUnplannableExactCountDistinctQueries() throws Exception + { + // All of these queries are unplannable in exact COUNT DISTINCT mode. + + final List queries = ImmutableList.of( + "SELECT COUNT(distinct dim1), COUNT(distinct dim2) FROM druid.foo", // two COUNT DISTINCTs, same query + "SELECT dim1, COUNT(distinct dim1), COUNT(distinct dim2) FROM druid.foo GROUP BY dim1", // two COUNT DISTINCTs + "SELECT COUNT(distinct unique_dim1) FROM druid.foo" // COUNT DISTINCT on sketch cannot be exact + ); + + for (final String query : queries) { + assertQueryIsUnplannable(PLANNER_CONFIG_NO_HLL, query); + } + } + private void assertQueryIsUnplannable(final String sql) { assertQueryIsUnplannable(PLANNER_CONFIG_DEFAULT, sql); @@ -1869,6 +1893,84 @@ public void testCountDistinct() throws Exception ); } + @Test + public void testExactCountDistinct() throws Exception + { + // When HLL is disabled, do exact count distinct through a nested query. + + testQuery( + PLANNER_CONFIG_NO_HLL, + "SELECT COUNT(distinct dim2) FROM druid.foo", + ImmutableList.of( + GroupByQuery.builder() + .setDataSource( + new QueryDataSource( + GroupByQuery.builder() + .setDataSource(CalciteTests.DATASOURCE1) + .setInterval(QSS(Filtration.eternity())) + .setGranularity(Granularities.ALL) + .setDimensions(DIMS(new DefaultDimensionSpec("dim2", "d0"))) + .setContext(QUERY_CONTEXT_DEFAULT) + .build() + ) + ) + .setInterval(QSS(Filtration.eternity())) + .setGranularity(Granularities.ALL) + .setAggregatorSpecs(AGGS( + new CountAggregatorFactory("a0") + )) + .setContext(QUERY_CONTEXT_DEFAULT) + .build() + ), + ImmutableList.of( + new Object[]{3L} + ) + ); + } + + @Test + public void testExactCountDistinctWithGroupingAndOtherAggregators() throws Exception + { + // When HLL is disabled, do exact count distinct through a nested query. + + testQuery( + PLANNER_CONFIG_NO_HLL, + "SELECT dim2, SUM(cnt), COUNT(distinct dim1) FROM druid.foo GROUP BY dim2", + ImmutableList.of( + GroupByQuery.builder() + .setDataSource( + new QueryDataSource( + GroupByQuery.builder() + .setDataSource(CalciteTests.DATASOURCE1) + .setInterval(QSS(Filtration.eternity())) + .setGranularity(Granularities.ALL) + .setDimensions(DIMS( + new DefaultDimensionSpec("dim2", "d0"), + new DefaultDimensionSpec("dim1", "d1") + )) + .setAggregatorSpecs(AGGS(new LongSumAggregatorFactory("a0", "cnt"))) + .setContext(QUERY_CONTEXT_DEFAULT) + .build() + ) + ) + .setInterval(QSS(Filtration.eternity())) + .setGranularity(Granularities.ALL) + .setDimensions(DIMS(new DefaultDimensionSpec("d0", "d0"))) + .setAggregatorSpecs(AGGS( + new LongSumAggregatorFactory("a0", "a0"), + new CountAggregatorFactory("a1") + )) + .setContext(QUERY_CONTEXT_DEFAULT) + .build() + ), + ImmutableList.of( + new Object[]{"", 3L, 3L}, + new Object[]{"a", 2L, 2L}, + new Object[]{"abc", 1L, 1L} + ) + ); + } + @Test public void testApproxCountDistinct() throws Exception { @@ -2844,7 +2946,12 @@ public void testGroupByFloorWithOrderBy() throws Exception .setGranularity(Granularities.ALL) .setDimensions( DIMS( - new ExtractionDimensionSpec("dim1", "d0", ValueType.FLOAT, new BucketExtractionFn(1.0, 0.0)) + new ExtractionDimensionSpec( + "dim1", + "d0", + ValueType.FLOAT, + new BucketExtractionFn(1.0, 0.0) + ) ) ) .setAggregatorSpecs(AGGS(new CountAggregatorFactory("a0")))