Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
import org.apache.druid.query.groupby.orderby.OrderByColumnSpec;
import org.apache.druid.query.groupby.strategy.GroupByStrategyV2;
import org.apache.druid.query.ordering.StringComparator;
import org.apache.druid.segment.ColumnInspector;
import org.apache.druid.segment.ColumnSelectorFactory;
import org.apache.druid.segment.ColumnValueSelector;
import org.apache.druid.segment.Cursor;
Expand All @@ -74,7 +75,6 @@
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.function.Function;
import java.util.stream.Stream;

/**
Expand Down Expand Up @@ -231,7 +231,7 @@ public GroupByEngineIterator make()
processingBuffer,
fudgeTimestamp,
dims,
isAllSingleValueDims(columnSelectorFactory::getColumnCapabilities, query.getDimensions(), false),
isAllSingleValueDims(columnSelectorFactory, query.getDimensions()),
cardinalityForArrayAggregation
);
} else {
Expand All @@ -242,7 +242,7 @@ public GroupByEngineIterator make()
processingBuffer,
fudgeTimestamp,
dims,
isAllSingleValueDims(columnSelectorFactory::getColumnCapabilities, query.getDimensions(), false)
isAllSingleValueDims(columnSelectorFactory, query.getDimensions())
);
}
}
Expand Down Expand Up @@ -319,13 +319,11 @@ public static int getCardinalityForArrayAggregation(
/**
* Checks whether all "dimensions" are either single-valued, or if allowed, nonexistent. Since non-existent column
* selectors will show up as full of nulls they are effectively single valued, however they can also be null during
* broker merge, for example with an 'inline' datasource subquery. 'missingMeansNonExistent' is sort of a hack to let
* the vectorized engine, which only operates on actual segments, to still work in this case for non-existent columns.
* broker merge, for example with an 'inline' datasource subquery.
*/
public static boolean isAllSingleValueDims(
final Function<String, ColumnCapabilities> capabilitiesFunction,
final List<DimensionSpec> dimensions,
final boolean missingMeansNonExistent
final ColumnInspector inspector,
final List<DimensionSpec> dimensions
)
{
return dimensions
Expand All @@ -338,10 +336,9 @@ public static boolean isAllSingleValueDims(
return false;
}

// Now check column capabilities.
final ColumnCapabilities columnCapabilities = capabilitiesFunction.apply(dimension.getDimension());
return (columnCapabilities != null && columnCapabilities.hasMultipleValues().isFalse()) ||
(missingMeansNonExistent && columnCapabilities == null);
// Now check column capabilities, which must be present and explicitly not multi-valued
final ColumnCapabilities columnCapabilities = inspector.getColumnCapabilities(dimension.getDimension());
return columnCapabilities != null && columnCapabilities.hasMultipleValues().isFalse();
});
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@
import org.apache.druid.query.vector.VectorCursorGranularizer;
import org.apache.druid.segment.DimensionHandlerUtils;
import org.apache.druid.segment.StorageAdapter;
import org.apache.druid.segment.column.ColumnCapabilities;
import org.apache.druid.segment.column.ValueType;
import org.apache.druid.segment.filter.Filters;
import org.apache.druid.segment.vector.VectorColumnSelectorFactory;
import org.apache.druid.segment.vector.VectorCursor;
Expand All @@ -55,6 +57,7 @@
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.function.Function;
import java.util.stream.Collectors;

public class VectorGroupByEngine
Expand All @@ -70,24 +73,47 @@ public static boolean canVectorize(
@Nullable final Filter filter
)
{
// Multi-value dimensions are not yet supported.
//
// Two notes here about how we're handling this check:
// 1) After multi-value dimensions are supported, we could alter "GroupByQueryEngineV2.isAllSingleValueDims"
// to accept a ColumnSelectorFactory, which makes more sense than using a StorageAdapter (see #8013).
// 2) Technically using StorageAdapter here is bad since it only looks at real columns, but they might
// be shadowed by virtual columns (again, see #8013). But it's fine for now since adapter.canVectorize
// always returns false if there are any virtual columns.
//
// This situation should sort itself out pretty well once this engine supports multi-valued columns. Then we
// won't have to worry about having this all-single-value-dims check here.

return GroupByQueryEngineV2.isAllSingleValueDims(adapter::getColumnCapabilities, query.getDimensions(), true)
Function<String, ColumnCapabilities> capabilitiesFunction = name ->
query.getVirtualColumns().getColumnCapabilitiesWithFallback(adapter, name);

return canVectorizeDimensions(capabilitiesFunction, query.getDimensions())
&& query.getDimensions().stream().allMatch(DimensionSpec::canVectorize)
&& query.getAggregatorSpecs().stream().allMatch(aggregatorFactory -> aggregatorFactory.canVectorize(adapter))
&& adapter.canVectorize(filter, query.getVirtualColumns(), false);
}

public static boolean canVectorizeDimensions(
final Function<String, ColumnCapabilities> capabilitiesFunction,
final List<DimensionSpec> dimensions
)
{
return dimensions
.stream()
.allMatch(
dimension -> {
if (dimension.mustDecorate()) {
// group by on multi value dimensions are not currently supported
// DimensionSpecs that decorate may turn singly-valued columns into multi-valued selectors.
// To be safe, we must return false here.
return false;
}

// Now check column capabilities.
final ColumnCapabilities columnCapabilities = capabilitiesFunction.apply(dimension.getDimension());
// null here currently means the column does not exist, nil columns can be vectorized
if (columnCapabilities == null) {
return true;
}
// strings must be single valued, dictionary encoded, and have unique dictionary entries
if (ValueType.STRING.equals(columnCapabilities.getType())) {
return columnCapabilities.hasMultipleValues().isFalse() &&
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do you decide between columnCapabilities.hasMultipleValues().isFalse() and !columnCapabilities.hasMultipleValues().isMaybeTrue() - I'm never sure which one to check

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, I guess the example I gave is equivalent. Maybe a better way to ask that question is should UNKNOWN be treated as multi-value or single value?

columnCapabilities.isDictionaryEncoded().isTrue() &&
columnCapabilities.areDictionaryValuesUnique().isTrue();
}
return columnCapabilities.hasMultipleValues().isFalse();
});
}

public static Sequence<ResultRow> process(
final GroupByQuery query,
final StorageAdapter storageAdapter,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,6 @@ public Cursor apply(final Interval inputInterval)
public VectorCursor buildVectorized(final int vectorSize)
{
// Sanity check - matches QueryableIndexStorageAdapter.canVectorize
Preconditions.checkState(virtualColumns.size() == 0, "virtualColumns.size == 0");
Preconditions.checkState(!descending, "!descending");

final Map<String, BaseColumn> columnCache = new HashMap<>();
Expand Down Expand Up @@ -229,17 +228,15 @@ public VectorCursor buildVectorized(final int vectorSize)
? new NoFilterVectorOffset(vectorSize, startOffset, endOffset)
: new BitmapVectorOffset(vectorSize, filterBitmap, startOffset, endOffset);

// baseColumnSelectorFactory using baseOffset is the column selector for filtering.
final VectorColumnSelectorFactory baseColumnSelectorFactory = makeVectorColumnSelectorFactoryForOffset(
columnCache,
baseOffset,
closer
);
if (postFilter == null) {
return new QueryableIndexVectorCursor(index, baseOffset, closer, columnCache, vectorSize);
return new QueryableIndexVectorCursor(baseColumnSelectorFactory, baseOffset, vectorSize, closer);
} else {
// baseColumnSelectorFactory using baseOffset is the column selector for filtering.
final VectorColumnSelectorFactory baseColumnSelectorFactory = new QueryableIndexVectorColumnSelectorFactory(
index,
baseOffset,
closer,
columnCache
);

final VectorOffset filteredOffset = FilteredVectorOffset.create(
baseOffset,
baseColumnSelectorFactory,
Expand All @@ -254,10 +251,31 @@ public VectorCursor buildVectorized(final int vectorSize)
// object will get hit twice for some of the values (anything that matched the filter). This is probably most
// noticeable if it causes thrashing of decompression buffers due to out-of-order reads. I haven't observed
// this directly but it seems possible in principle.
return new QueryableIndexVectorCursor(index, filteredOffset, closer, columnCache, vectorSize);
// baseColumnSelectorFactory using baseOffset is the column selector for filtering.
final VectorColumnSelectorFactory filteredColumnSelectorFactory = makeVectorColumnSelectorFactoryForOffset(
columnCache,
filteredOffset,
closer
);
return new QueryableIndexVectorCursor(filteredColumnSelectorFactory, filteredOffset, vectorSize, closer);
}
}

/**
 * Creates a {@link VectorColumnSelectorFactory} reading at the given {@link VectorOffset}.
 *
 * The factory is backed by this cursor builder's {@code index}, and is passed {@code virtualColumns}
 * so selectors for virtual columns can be created alongside physical ones. The shared
 * {@code columnCache} lets multiple factories (e.g. the filtering factory and the post-filter
 * factory) reuse already-opened {@link BaseColumn} instances; opened resources are registered
 * with {@code closer} for cleanup.
 *
 * @param columnCache shared cache of opened physical columns, keyed by column name
 * @param baseOffset  the offset the returned factory's selectors will read from
 * @param closer      registry for resources that must be released when the cursor closes
 */
VectorColumnSelectorFactory makeVectorColumnSelectorFactoryForOffset(
Map<String, BaseColumn> columnCache,
VectorOffset baseOffset,
Closer closer
)
{
return new QueryableIndexVectorColumnSelectorFactory(
index,
baseOffset,
closer,
columnCache,
virtualColumns
);
}

/**
* Search the time column using binary search. Benchmarks on various other approaches (linear search, binary
* search that switches to linear at various closeness thresholds) indicated that a pure binary search worked best.
Expand Down Expand Up @@ -318,17 +336,16 @@ private static class QueryableIndexVectorCursor implements VectorCursor
private final VectorColumnSelectorFactory columnSelectorFactory;

public QueryableIndexVectorCursor(
final QueryableIndex index,
final VectorColumnSelectorFactory vectorColumnSelectorFactory,
final VectorOffset offset,
final Closer closer,
final Map<String, BaseColumn> columnCache,
final int vectorSize
final int vectorSize,
final Closer closer
)
{
this.columnSelectorFactory = vectorColumnSelectorFactory;
this.vectorSize = vectorSize;
this.offset = offset;
this.closer = closer;
this.vectorSize = vectorSize;
this.columnSelectorFactory = new QueryableIndexVectorColumnSelectorFactory(index, offset, closer, columnCache);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -219,9 +219,8 @@ public boolean canVectorize(
}
}

// 1) Virtual columns can't vectorize yet
// 2) Vector cursors can't iterate backwards yet
return virtualColumns.size() == 0 && !descending;
// vector cursors can't iterate backwards yet
return virtualColumns.canVectorize(this) && !descending;
}

@Override
Expand Down
Loading