Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions docs/content/querying/sql.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,15 +106,16 @@ The following SQL queries and features may be executed using approximate algorit

- `COUNT(DISTINCT col)` and `APPROX_COUNT_DISTINCT(col)` aggregations use
[HyperLogLog](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf), a fast approximate distinct counting
algorithm. If you need exact distinct counts, you can instead use
`SELECT COUNT(*) FROM (SELECT DISTINCT col FROM data_source)`, which will use a slower and more resource intensive exact
algorithm.
algorithm. If you need exact distinct counts, set "useApproximateCountDistinct" to "false", either through query
context or through broker configuration.
- TopN-style queries with a single grouping column, like
`SELECT col1, SUM(col2) FROM data_source GROUP BY col1 ORDER BY SUM(col2) DESC LIMIT 100`, by default will be executed
as [TopN queries](topnquery.html), which use an approximate algorithm. To disable this behavior, and use exact
algorithms for topN-style queries, set "useApproximateTopN" to "false", either through query context or through broker
configuration.

In both cases, the exact algorithms are generally slower and more resource-intensive.

### Time functions

Druid's SQL language supports a number of time operations, including:
Expand Down
7 changes: 7 additions & 0 deletions sql/src/main/java/io/druid/sql/calcite/planner/Rules.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.apache.calcite.interpreter.Bindables;
import org.apache.calcite.plan.RelOptRule;
import org.apache.calcite.plan.volcano.AbstractConverter;
import org.apache.calcite.rel.rules.AggregateExpandDistinctAggregatesRule;
import org.apache.calcite.rel.rules.AggregateJoinTransposeRule;
import org.apache.calcite.rel.rules.AggregateProjectMergeRule;
import org.apache.calcite.rel.rules.AggregateProjectPullUpConstantsRule;
Expand Down Expand Up @@ -202,6 +203,12 @@ private static List<RelOptRule> baseRuleSet(
rules.addAll(VOLCANO_ABSTRACT_RULES);
rules.addAll(RELOPTUTIL_ABSTRACT_RULES);

if (!plannerConfig.isUseApproximateCountDistinct()) {
// We'll need this to expand COUNT DISTINCTs.
// Avoid AggregateExpandDistinctAggregatesRule.INSTANCE; it uses grouping sets and we don't support those.
rules.add(AggregateExpandDistinctAggregatesRule.JOIN);
}

if (plannerConfig.isUseFallback()) {
rules.add(DruidRelToBindableRule.instance());
}
Expand Down
109 changes: 108 additions & 1 deletion sql/src/test/java/io/druid/sql/calcite/CalciteQueryTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,14 @@ public int getMaxTopNLimit()
return 0;
}
};
private static final PlannerConfig PLANNER_CONFIG_NO_HLL = new PlannerConfig()
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor nit: rename to PLANNER_CONFIG_NO_APPROXIMATION ?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's also PLANNER_CONFIG_NO_TOPN to disable topn approximations, so I want to keep them different for that reason.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The two kinds of approximations can be toggled separately

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok.

{
@Override
public boolean isUseApproximateCountDistinct()
{
return false;
}
};
private static final PlannerConfig PLANNER_CONFIG_SELECT_PAGING = new PlannerConfig()
{
@Override
Expand Down Expand Up @@ -721,6 +729,22 @@ public void testUnplannableQueries() throws Exception
}
}

@Test
public void testUnplannableExactCountDistinctQueries() throws Exception
{
  // Each statement below is expected to be unplannable when exact COUNT DISTINCT mode is in effect
  // (PLANNER_CONFIG_NO_HLL).
  final List<String> unplannableSqls = ImmutableList.of(
      // Two COUNT DISTINCTs in the same query.
      "SELECT COUNT(distinct dim1), COUNT(distinct dim2) FROM druid.foo",
      // Two COUNT DISTINCTs together with a GROUP BY.
      "SELECT dim1, COUNT(distinct dim1), COUNT(distinct dim2) FROM druid.foo GROUP BY dim1",
      // COUNT DISTINCT on a sketch column cannot be exact.
      "SELECT COUNT(distinct unique_dim1) FROM druid.foo"
  );

  for (final String sql : unplannableSqls) {
    assertQueryIsUnplannable(PLANNER_CONFIG_NO_HLL, sql);
  }
}

private void assertQueryIsUnplannable(final String sql)
{
assertQueryIsUnplannable(PLANNER_CONFIG_DEFAULT, sql);
Expand Down Expand Up @@ -1869,6 +1893,84 @@ public void testCountDistinct() throws Exception
);
}

@Test
public void testExactCountDistinct() throws Exception
{
  // With HLL disabled, an exact COUNT(DISTINCT dim2) is expected to plan as a nested groupBy:
  // the inner query groups on dim2, and the outer query counts the rows the inner query emits.

  // Inner query: one row per distinct dim2 value.
  final GroupByQuery distinctDim2 =
      GroupByQuery.builder()
                  .setDataSource(CalciteTests.DATASOURCE1)
                  .setInterval(QSS(Filtration.eternity()))
                  .setGranularity(Granularities.ALL)
                  .setDimensions(DIMS(new DefaultDimensionSpec("dim2", "d0")))
                  .setContext(QUERY_CONTEXT_DEFAULT)
                  .build();

  // Outer query: count the rows of the inner query.
  final GroupByQuery countOverDistinct =
      GroupByQuery.builder()
                  .setDataSource(new QueryDataSource(distinctDim2))
                  .setInterval(QSS(Filtration.eternity()))
                  .setGranularity(Granularities.ALL)
                  .setAggregatorSpecs(AGGS(new CountAggregatorFactory("a0")))
                  .setContext(QUERY_CONTEXT_DEFAULT)
                  .build();

  testQuery(
      PLANNER_CONFIG_NO_HLL,
      "SELECT COUNT(distinct dim2) FROM druid.foo",
      ImmutableList.<Query>of(countOverDistinct),
      ImmutableList.of(
          new Object[]{3L}
      )
  );
}

@Test
public void testExactCountDistinctWithGroupingAndOtherAggregators() throws Exception
{
  // With HLL disabled, COUNT(DISTINCT dim1) alongside GROUP BY dim2 and SUM(cnt) is expected to plan as a
  // nested groupBy: the inner query groups on (dim2, dim1) and pre-sums cnt; the outer query regroups on the
  // inner "d0" column, sums the partial sums, and counts rows.

  // Inner query: one row per (dim2, dim1) pair, carrying the partial sum of cnt as "a0".
  final GroupByQuery perDim2AndDim1 =
      GroupByQuery.builder()
                  .setDataSource(CalciteTests.DATASOURCE1)
                  .setInterval(QSS(Filtration.eternity()))
                  .setGranularity(Granularities.ALL)
                  .setDimensions(
                      DIMS(
                          new DefaultDimensionSpec("dim2", "d0"),
                          new DefaultDimensionSpec("dim1", "d1")
                      )
                  )
                  .setAggregatorSpecs(AGGS(new LongSumAggregatorFactory("a0", "cnt")))
                  .setContext(QUERY_CONTEXT_DEFAULT)
                  .build();

  // Outer query: regroup on d0, re-sum a0, and count the inner rows per group.
  final GroupByQuery perDim2 =
      GroupByQuery.builder()
                  .setDataSource(new QueryDataSource(perDim2AndDim1))
                  .setInterval(QSS(Filtration.eternity()))
                  .setGranularity(Granularities.ALL)
                  .setDimensions(DIMS(new DefaultDimensionSpec("d0", "d0")))
                  .setAggregatorSpecs(
                      AGGS(
                          new LongSumAggregatorFactory("a0", "a0"),
                          new CountAggregatorFactory("a1")
                      )
                  )
                  .setContext(QUERY_CONTEXT_DEFAULT)
                  .build();

  testQuery(
      PLANNER_CONFIG_NO_HLL,
      "SELECT dim2, SUM(cnt), COUNT(distinct dim1) FROM druid.foo GROUP BY dim2",
      ImmutableList.<Query>of(perDim2),
      ImmutableList.of(
          new Object[]{"", 3L, 3L},
          new Object[]{"a", 2L, 2L},
          new Object[]{"abc", 1L, 1L}
      )
  );
}

@Test
public void testApproxCountDistinct() throws Exception
{
Expand Down Expand Up @@ -2844,7 +2946,12 @@ public void testGroupByFloorWithOrderBy() throws Exception
.setGranularity(Granularities.ALL)
.setDimensions(
DIMS(
new ExtractionDimensionSpec("dim1", "d0", ValueType.FLOAT, new BucketExtractionFn(1.0, 0.0))
new ExtractionDimensionSpec(
"dim1",
"d0",
ValueType.FLOAT,
new BucketExtractionFn(1.0, 0.0)
)
)
)
.setAggregatorSpecs(AGGS(new CountAggregatorFactory("a0")))
Expand Down