@@ -178,13 +178,20 @@ public String getFormatString()
// 26: group by string expr with non-expr agg
"SELECT CONCAT(string2, '-', long2), SUM(double1) FROM foo GROUP BY 1 ORDER BY 2",
// 27: group by string expr with expr agg
"SELECT CONCAT(string2, '-', long2), SUM(long1 * double4) FROM foo GROUP BY 1 ORDER BY 2"
"SELECT CONCAT(string2, '-', long2), SUM(long1 * double4) FROM foo GROUP BY 1 ORDER BY 2",
// 28: group by single input string low cardinality expr with expr agg
"SELECT CONCAT(string2, '-', 'foo'), SUM(long1 * long4) FROM foo GROUP BY 1 ORDER BY 2",
// 29: group by single input string high cardinality expr with expr agg
"SELECT CONCAT(string3, '-', 'foo'), SUM(long1 * long4) FROM foo GROUP BY 1 ORDER BY 2"
);

@Param({"5000000"})
private int rowsPerSegment;

@Param({"false", "force"})
@Param({
"false",
"force"
})
private String vectorize;

@Param({
@@ -217,7 +224,9 @@ public String getFormatString()
"24",
"25",
"26",
"27"
"27",
"28",
"29"
})
private String query;

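For orientation, here is a minimal JMH sketch of how `@Param` values like these are typically consumed; the class name, the trimmed-down `QUERIES` list, and the `querySql` body are illustrative assumptions, not the actual benchmark code.

```java
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.infra.Blackhole;

@State(Scope.Benchmark)
public class QueryBenchmarkSketch
{
  // Stand-in for the QUERIES list being extended in the diff above.
  private static final List<String> QUERIES = Arrays.asList(
      "SELECT CONCAT(string2, '-', 'foo'), SUM(long1 * long4) FROM foo GROUP BY 1 ORDER BY 2",
      "SELECT CONCAT(string3, '-', 'foo'), SUM(long1 * long4) FROM foo GROUP BY 1 ORDER BY 2"
  );

  @Param({"0", "1"})
  private String query;

  @Param({"false", "force"})
  private String vectorize;

  @Benchmark
  @BenchmarkMode(Mode.AverageTime)
  @OutputTimeUnit(TimeUnit.MILLISECONDS)
  public void querySql(Blackhole blackhole)
  {
    // The 'query' @Param is a string index into QUERIES, which is why the diff
    // adds "28" and "29" alongside the two new benchmark queries.
    final String sql = QUERIES.get(Integer.parseInt(query));
    // A real benchmark would plan and execute sql against a Druid segment,
    // honoring the 'vectorize' setting; consuming output keeps the JIT honest.
    blackhole.consume(sql + vectorize);
  }
}
```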
@@ -257,7 +257,7 @@ public boolean canStop()
}

/**
* Finalizes stopping the the LifecycleLock. This method must be called before exit from stop() on this object,
* Finalizes stopping the LifecycleLock. This method must be called before exit from stop() on this object,
* usually in a finally block. If you're using a restartable object, use {@link #exitStopAndReset()} instead.
*
* @throws IllegalMonitorStateException if {@link #canStop()} is not yet called on this LifecycleLock
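As a usage note, a minimal sketch of the `stop()` pattern this javadoc describes, assuming `canStop()` and `exitStop()` behave as documented here; the surrounding service class is hypothetical.

```java
import org.apache.druid.concurrent.LifecycleLock;

class MyRestartableService
{
  private final LifecycleLock lifecycleLock = new LifecycleLock();

  public void stop()
  {
    if (!lifecycleLock.canStop()) {
      throw new IllegalStateException("not started");
    }
    try {
      // release resources held by the service here
    }
    finally {
      // finalize stopping before exiting stop(), as the javadoc requires
      lifecycleLock.exitStop();
    }
  }
}
```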
@@ -647,7 +647,7 @@ public ExprEval apply(List<Expr> args, Expr.ObjectBinding bindings)
if (evals.isEmpty()) {
// The GREATEST/LEAST functions are not in the SQL standard. Emulate the behavior of postgres (return null if
// all expressions are null, otherwise skip null values) since it is used as a base for a wide number of
// databases. This also matches the behavior the the long/double greatest/least post aggregators. Some other
// databases. This also matches the behavior of the long/double greatest/least post aggregators. Some other
// databases (e.g., MySQL) return null if any expression is null.
// https://www.postgresql.org/docs/9.5/functions-conditional.html
// https://dev.mysql.com/doc/refman/8.0/en/comparison-operators.html#function_least
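For clarity, a standalone sketch of the PostgreSQL-style GREATEST semantics described in that comment — nulls are skipped, and null is returned only when every argument is null. This is illustrative Java, not Druid's actual implementation.

```java
import java.util.Arrays;
import java.util.List;

final class GreatestSketch
{
  static Double greatest(List<Double> args)
  {
    Double max = null;
    for (Double arg : args) {
      if (arg == null) {
        continue; // skip nulls, per postgres behavior
      }
      if (max == null || arg > max) {
        max = arg;
      }
    }
    return max; // null only if all arguments were null
  }

  public static void main(String[] args)
  {
    System.out.println(greatest(Arrays.asList(null, 1.0, 3.0))); // 3.0
    System.out.println(greatest(Arrays.asList((Double) null)));  // null
  }
}
```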
@@ -68,6 +68,11 @@ public List<String> getIdentifiers()
return args.stream().map(IdentifierExpr::toString).collect(Collectors.toList());
}

public List<String> stringifyIdentifiers()
{
return args.stream().map(IdentifierExpr::stringify).collect(Collectors.toList());
}

ImmutableList<IdentifierExpr> getIdentifierExprs()
{
return args;
@@ -99,7 +104,7 @@ public ExprEval eval(ObjectBinding bindings)
@Override
public String stringify()
{
return StringUtils.format("(%s) -> %s", ARG_JOINER.join(getIdentifiers()), expr.stringify());
return StringUtils.format("(%s) -> %s", ARG_JOINER.join(stringifyIdentifiers()), expr.stringify());
}

@Override
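A hedged round-trip sketch of why `stringify()` must produce re-parseable output, which is what switching the lambda arguments to `stringifyIdentifiers()` preserves. `Parser.parse` and `ExprMacroTable.nil()` are from Druid's expression module; the expression and column name below are illustrative.

```java
import org.apache.druid.math.expr.Expr;
import org.apache.druid.math.expr.ExprMacroTable;
import org.apache.druid.math.expr.Parser;

final class StringifyRoundTrip
{
  public static void main(String[] args)
  {
    // Parse a lambda-bearing expression, stringify it, and parse it again.
    final Expr expr = Parser.parse("map((x) -> x + 1, \"some_column\")", ExprMacroTable.nil());
    final String stringified = expr.stringify();
    // stringify() should yield an equivalent, re-parseable expression, which is
    // why lambda arguments must use their stringify form rather than toString.
    final Expr reparsed = Parser.parse(stringified, ExprMacroTable.nil());
    System.out.println(stringified);
    System.out.println(reparsed.stringify());
  }
}
```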
2 changes: 1 addition & 1 deletion docs/operations/security-overview.md
@@ -122,7 +122,7 @@ For more information, see [TLS support](tls-support.md) and [Simple SSLContext P

## Authentication and authorization

You can configure authentication and authorization to control access to the the Druid APIs. Then configure users, roles, and permissions, as described in the following sections. Make the configuration changes in the `common.runtime.properties` file on all Druid servers in the cluster.
You can configure authentication and authorization to control access to the Druid APIs. Then configure users, roles, and permissions, as described in the following sections. Make the configuration changes in the `common.runtime.properties` file on all Druid servers in the cluster.

Within Druid's operating context, authenticators control the way user identities are verified. Authorizers employ user roles to relate authenticated users to the datasources they are permitted to access. You can set the finest-grained permissions on a per-datasource basis.

2 changes: 1 addition & 1 deletion docs/querying/caching.md
@@ -63,7 +63,7 @@ For instance, whole-query caching is a good option when you have queries that in

- On Historicals, the default. Enable segment-level cache population on Historicals for larger production clusters to prevent Brokers from having to merge all query results. When you enable cache population on Historicals instead of Brokers, the Historicals merge their own local results and put less strain on the Brokers.

- On ingestion tasks in the Peon or Indexer service. Larger production clusters should enable segment-level cache population on task services only to prevent Brokers from having to merge all query results. When you enable cache population on task execution services instead of Brokers, the the task execution services to merge their own local results and put less strain on the Brokers.
- On ingestion tasks in the Peon or Indexer service. Larger production clusters should enable segment-level cache population on task services only to prevent Brokers from having to merge all query results. When you enable cache population on task execution services instead of Brokers, the task execution services merge their own local results and put less strain on the Brokers.

Task executor services only support caches that store data locally, such as the `caffeine` cache. This restriction exists because the cache stores results at the level of intermediate partial segments generated by the ingestion tasks. These intermediate partial segments may not be identical across task replicas. Therefore task executor services ignore remote cache types such as `memcached`.

4 changes: 2 additions & 2 deletions docs/querying/datasource.md
@@ -54,7 +54,7 @@ The table datasource is the most common type. This is the kind of datasource you
[data ingestion](../ingestion/index.md). They are split up into segments, distributed around the cluster,
and queried in parallel.

In [Druid SQL](sql.md#from), table datasources reside in the the `druid` schema. This is the default schema, so table
In [Druid SQL](sql.md#from), table datasources reside in the `druid` schema. This is the default schema, so table
datasources can be referenced as either `druid.dataSourceName` or simply `dataSourceName`.

In native queries, table datasources can be referenced using their names as strings (as in the example above), or by
@@ -92,7 +92,7 @@ SELECT k, v FROM lookup.countries
<!--END_DOCUSAURUS_CODE_TABS-->

Lookup datasources correspond to Druid's key-value [lookup](lookups.md) objects. In [Druid SQL](sql.md#from),
they reside in the the `lookup` schema. They are preloaded in memory on all servers, so they can be accessed rapidly.
they reside in the `lookup` schema. They are preloaded in memory on all servers, so they can be accessed rapidly.
They can be joined onto regular tables using the [join operator](#join).

Lookup datasources are key-value oriented and always have exactly two columns: `k` (the key) and `v` (the value), and
2 changes: 1 addition & 1 deletion docs/querying/sorting-orders.md
@@ -49,7 +49,7 @@ This sorting order will try to parse all string values as numbers. Unparseable v
When comparing two unparseable values (e.g., "hello" and "world"), this ordering will sort by comparing the unparsed strings lexicographically.

## Strlen
Sorts values by the their string lengths. When there is a tie, this comparator falls back to using the String compareTo method.
Sorts values by their string lengths. When there is a tie, this comparator falls back to using the String compareTo method.
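A hedged sketch of this comparator's behavior in plain Java (illustrative only, not Druid's actual comparator implementation):

```java
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

final class StrlenOrderSketch
{
  public static void main(String[] args)
  {
    // Sort by length first; fall back to natural (compareTo) order on ties.
    final Comparator<String> strlen =
        Comparator.comparingInt(String::length).thenComparing(Comparator.naturalOrder());
    final List<String> values = Arrays.asList("apple", "banana", "pear", "fig");
    values.sort(strlen);
    System.out.println(values); // [fig, pear, apple, banana]
  }
}
```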

## Version
Sorts values as versions, e.g.: "10.0 sorts after 9.0", "1.0.0-SNAPSHOT sorts after 1.0.0".
@@ -77,7 +77,7 @@ interface IntIterator extends org.roaringbitmap.IntIterator
int next();

/**
* Skips all the elements before the the specified element, so that
* Skips all the elements before the specified element, so that
* {@link #next()} gives the given element or, if it does not exist, the
* element immediately after according to the sorting provided by this
* set.
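A usage sketch of the skip semantics described above, written against RoaringBitmap's `PeekableIntIterator`, which documents the same `advanceIfNeeded` contract; the Druid-side interface here is assumed to mirror it.

```java
import org.roaringbitmap.PeekableIntIterator;
import org.roaringbitmap.RoaringBitmap;

final class AdvanceIfNeededSketch
{
  public static void main(String[] args)
  {
    final RoaringBitmap bitmap = RoaringBitmap.bitmapOf(3, 7, 12, 40);
    final PeekableIntIterator it = bitmap.getIntIterator();
    // Skip all elements before 10; next() then returns 12, the first element
    // at or after 10 in the set's sorted order.
    it.advanceIfNeeded(10);
    System.out.println(it.next()); // 12
  }
}
```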
@@ -26,7 +26,7 @@
import java.lang.reflect.Method;

/**
* This wrapper class is created to be able to access some of the the "protected" methods inside Hadoop's
* This wrapper class is created to be able to access some of the "protected" methods inside Hadoop's
* FileSystem class. Those are supposed to become public eventually or more appropriate alternatives would be
* provided.
* This is a hack and should be removed when no longer necessary.
@@ -1520,7 +1520,7 @@ public double sum(final float b)
*
* @param probabilities array of probabilities
*
* @return an array of length probabilities.length representing the the approximate sample quantiles
* @return an array of length probabilities.length representing the approximate sample quantiles
* corresponding to the given probabilities
*/

@@ -94,7 +94,7 @@ public class FixedBucketsHistogram
public static final byte SPARSE_ENCODING_MODE = 0x02;

/**
* Determines how the the histogram handles outliers.
* Determines how the histogram handles outliers.
*
* Ignore: do not track outliers at all
* Overflow: track outlier counts in upperOutlierCount and lowerOutlierCount.
@@ -123,8 +123,6 @@ public DruidOperatorTable createOperatorTable()
@Test
public void testQuantileOnFloatAndLongs() throws Exception
{
cannotVectorize();

final List<Object[]> expectedResults = ImmutableList.of(
new Object[]{
1.0299999713897705,
@@ -238,8 +236,6 @@ public void testQuantileOnFloatAndLongs() throws Exception
@Test
public void testQuantileOnCastedString() throws Exception
{
cannotVectorize();

testQuery(
"SELECT\n"
+ "APPROX_QUANTILE_FIXED_BUCKETS(CAST(dim1 AS DOUBLE), 0.01, 20, 0.0, 10.0),\n"
@@ -121,7 +121,6 @@ public DruidOperatorTable createOperatorTable()
@Test
public void testQuantileOnFloatAndLongs() throws Exception
{
cannotVectorize();
testQuery(
"SELECT\n"
+ "APPROX_QUANTILE(m1, 0.01),\n"
@@ -117,4 +117,25 @@ public GroupByVectorColumnSelector makeObjectProcessor(
}
return NilGroupByVectorColumnSelector.INSTANCE;
}

/**
* The group by engine vector processor is more relaxed than some of the other
* {@link VectorColumnProcessorFactory} implementations about choosing a dictionary encoded string selector over
* an object selector.
*
* Basically, if a valid dictionary exists, we will use it to group on dictionary ids (so that we can use
* {@link SingleValueStringGroupByVectorColumnSelector} whenever possible instead of
* {@link DictionaryBuildingSingleValueStringGroupByVectorColumnSelector}).
*
* We do this even for things like virtual columns that have a single string input, because it defers accessing
* any of the actual string values, which involves at minimum reading utf8 byte values and converting them to
* string form (if not already cached), and in the case of expressions, computing the expression output for the
* string input.
*/
@Override
public boolean useDictionaryEncodedSelector(ColumnCapabilities capabilities)
Contributor: Please add @Nullable for capabilities.

Member (author): hmm, it should never be null for this method though (nor any of the other VectorColumnProcessorFactory methods). ColumnProcessors will return a nil vector selector if the capabilities are null, since null capabilities in the vectorized engine means that the column doesn't exist.

{
Preconditions.checkArgument(capabilities != null, "Capabilities must not be null");
Preconditions.checkArgument(capabilities.getType() == ValueType.STRING, "Must only be called on a STRING column");
return capabilities.isDictionaryEncoded().isTrue();
Contributor: Just for the record (and to help myself remember later), this means that the groupBy vector engine will use the dictionary IDs to compute per-segment results, and decode them when merging those results, which is what the non-vectorized engine does today. When the column is dictionary encoded but not unique, this optimization might not always be good because there could be some sort of tradeoff depending on the column cardinality after expression evaluation. Even though I think this optimization is likely good in most cases, it could be worth investigating further later to understand the tradeoff better.

}
}
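To make the tradeoff in the comment above concrete, here is a self-contained toy showing what grouping on dictionary ids buys: rows are aggregated as ints, and strings are decoded once per group at "merge" time. Names are descriptive stand-ins, not Druid's actual grouper classes.

```java
import java.util.HashMap;
import java.util.Map;

final class DictionaryGroupingSketch
{
  public static void main(String[] args)
  {
    final String[] dictionary = {"apple", "banana", "cherry"}; // sorted, unique
    final int[] rowDictIds = {0, 1, 0, 2, 1, 0};               // per-row dictionary ids
    final long[] rowValues = {1, 2, 3, 4, 5, 6};               // values to aggregate

    // Aggregate keyed by int dictionary id; no string is touched in this loop.
    final Map<Integer, Long> sums = new HashMap<>();
    for (int row = 0; row < rowDictIds.length; row++) {
      sums.merge(rowDictIds[row], rowValues[row], Long::sum);
    }

    // Decode ids to strings only once per group, at merge time.
    sums.forEach((dictId, sum) -> System.out.println(dictionary[dictId] + " = " + sum));
  }
}
```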
@@ -42,6 +42,7 @@
import org.apache.druid.query.groupby.epinephelinae.HashVectorGrouper;
import org.apache.druid.query.groupby.epinephelinae.VectorGrouper;
import org.apache.druid.query.vector.VectorCursorGranularizer;
import org.apache.druid.segment.ColumnInspector;
import org.apache.druid.segment.ColumnProcessors;
import org.apache.druid.segment.StorageAdapter;
import org.apache.druid.segment.VirtualColumns;
@@ -59,7 +60,6 @@
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.function.Function;
import java.util.stream.Collectors;

public class VectorGroupByEngine
@@ -75,25 +75,29 @@ public static boolean canVectorize(
@Nullable final Filter filter
)
{
Function<String, ColumnCapabilities> capabilitiesFunction = name ->
query.getVirtualColumns().getColumnCapabilitiesWithFallback(adapter, name);
final ColumnInspector inspector = query.getVirtualColumns().wrapInspector(adapter);

return canVectorizeDimensions(capabilitiesFunction, query.getDimensions())
&& query.getDimensions().stream().allMatch(DimensionSpec::canVectorize)
&& query.getAggregatorSpecs().stream().allMatch(aggregatorFactory -> aggregatorFactory.canVectorize(adapter))
return adapter.canVectorize(filter, query.getVirtualColumns(), false)
&& canVectorizeDimensions(inspector, query.getDimensions())
&& VirtualColumns.shouldVectorize(query, query.getVirtualColumns(), adapter)
&& adapter.canVectorize(filter, query.getVirtualColumns(), false);
&& query.getAggregatorSpecs()
.stream()
.allMatch(aggregatorFactory -> aggregatorFactory.canVectorize(inspector));
}

public static boolean canVectorizeDimensions(
final Function<String, ColumnCapabilities> capabilitiesFunction,
final ColumnInspector inspector,
final List<DimensionSpec> dimensions
)
{
return dimensions
.stream()
.allMatch(
dimension -> {
if (!dimension.canVectorize()) {
return false;
}

if (dimension.mustDecorate()) {
// group by on multi value dimensions are not currently supported
// DimensionSpecs that decorate may turn singly-valued columns into multi-valued selectors.
@@ -102,7 +106,7 @@ public static boolean canVectorizeDimensions(
}

// Now check column capabilities.
final ColumnCapabilities columnCapabilities = capabilitiesFunction.apply(dimension.getDimension());
final ColumnCapabilities columnCapabilities = inspector.getColumnCapabilities(dimension.getDimension());
// null here currently means the column does not exist, nil columns can be vectorized
if (columnCapabilities == null) {
return true;
@@ -39,6 +39,7 @@
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.query.filter.Filter;
import org.apache.druid.query.vector.VectorCursorGranularizer;
import org.apache.druid.segment.ColumnInspector;
import org.apache.druid.segment.SegmentMissingException;
import org.apache.druid.segment.StorageAdapter;
import org.apache.druid.segment.VirtualColumns;
@@ -66,7 +67,7 @@ public class TimeseriesQueryEngine
@VisibleForTesting
public TimeseriesQueryEngine()
{
this.bufferPool = new StupidPool<>("dummy", () -> ByteBuffer.allocate(1000000));
this.bufferPool = new StupidPool<>("dummy", () -> ByteBuffer.allocate(10000000));
}

@Inject
@@ -94,10 +95,12 @@ public Sequence<Result<TimeseriesResultValue>> process(final TimeseriesQuery que
final Granularity gran = query.getGranularity();
final boolean descending = query.isDescending();

final ColumnInspector inspector = query.getVirtualColumns().wrapInspector(adapter);

final boolean doVectorize = QueryContexts.getVectorize(query).shouldVectorize(
query.getAggregatorSpecs().stream().allMatch(aggregatorFactory -> aggregatorFactory.canVectorize(adapter))
adapter.canVectorize(filter, query.getVirtualColumns(), descending)
&& VirtualColumns.shouldVectorize(query, query.getVirtualColumns(), adapter)
&& adapter.canVectorize(filter, query.getVirtualColumns(), descending)
&& query.getAggregatorSpecs().stream().allMatch(aggregatorFactory -> aggregatorFactory.canVectorize(inspector))
);

final Sequence<Result<TimeseriesResultValue>> result;
@@ -115,7 +115,7 @@ long scanAndAggregate(
void initAggregateStore();

/**
* Closes all on heap {@link Aggregator} associated withe the aggregates processor
* Closes all on-heap {@link Aggregator} instances associated with the aggregates processor
*/
void closeAggregators();
}
@@ -219,13 +219,27 @@ private static ColumnCapabilities computeDimensionSpecCapabilities(
} else if (dimensionSpec.getExtractionFn() != null) {
// DimensionSpec is applying an extractionFn but *not* decorating. We have some insight into how the
// extractionFn will behave, so let's use it.
final boolean dictionaryEncoded;
final boolean unique;
final boolean sorted;
if (columnCapabilities != null) {
dictionaryEncoded = columnCapabilities.isDictionaryEncoded().isTrue();
unique = columnCapabilities.areDictionaryValuesUnique().isTrue();
sorted = columnCapabilities.areDictionaryValuesSorted().isTrue();
} else {
dictionaryEncoded = false;
unique = false;
sorted = false;
}

return new ColumnCapabilitiesImpl()
.setType(ValueType.STRING)
.setDictionaryValuesSorted(dimensionSpec.getExtractionFn().preservesOrdering())
.setDictionaryValuesUnique(dimensionSpec.getExtractionFn().getExtractionType()
== ExtractionFn.ExtractionType.ONE_TO_ONE)
.setHasMultipleValues(dimensionSpec.mustDecorate() || mayBeMultiValue(columnCapabilities));
.setDictionaryEncoded(dictionaryEncoded)
.setDictionaryValuesSorted(sorted && dimensionSpec.getExtractionFn().preservesOrdering())
.setDictionaryValuesUnique(
unique && dimensionSpec.getExtractionFn().getExtractionType() == ExtractionFn.ExtractionType.ONE_TO_ONE
)
.setHasMultipleValues(mayBeMultiValue(columnCapabilities));
} else {
// No transformation. Pass through underlying types.
return columnCapabilities;
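To see why the new code ANDs the underlying dictionary capabilities with the extraction fn's properties rather than trusting either alone, a toy illustration (plain Java; class name and extraction functions are hypothetical):

```java
import java.util.Arrays;

final class ExtractionFnCapabilitiesSketch
{
  public static void main(String[] args)
  {
    // A dictionary that is unique and sorted before any extraction fn runs.
    final String[] dictionary = {"bar", "baz", "foo"};

    // First-character extraction: not one-to-one, so values stop being unique.
    System.out.println(Arrays.toString(
        Arrays.stream(dictionary).map(s -> s.substring(0, 1)).toArray() // [b, b, f]
    ));

    // Last-character extraction: not order-preserving, so values stop being sorted.
    System.out.println(Arrays.toString(
        Arrays.stream(dictionary).map(s -> s.substring(s.length() - 1)).toArray() // [r, z, o]
    ));
  }
}
```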
@@ -318,8 +332,8 @@ private static <T> T makeVectorProcessorInternal(

switch (capabilities.getType()) {
case STRING:
// if column is not uniquely dictionary encoded, use an object selector
if (capabilities.isDictionaryEncoded().isFalse() || capabilities.areDictionaryValuesUnique().isFalse()) {
// let the processor factory decide if it prefers to use an object selector or dictionary encoded selector
if (!processorFactory.useDictionaryEncodedSelector(capabilities)) {
return processorFactory.makeObjectProcessor(
capabilities,
objectSelectorFn.apply(selectorFactory)
Expand Down