From 39d6a32bd5e3bce6fa2e86175ede9eb9dd5329b1 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Wed, 6 Mar 2024 11:53:52 +0530 Subject: [PATCH 01/46] init --- .../epinephelinae/BufferHashGrouper.java | 1 + .../epinephelinae/GroupByQueryEngine.java | 40 +++-- ...yNumericGroupByColumnSelectorStrategy.java | 1 + ...BuildingGroupByColumnSelectorStrategy.java | 5 + ...ngStringGroupByColumnSelectorStrategy.java | 10 ++ ...xedWidthGroupByColumnSelectorStrategy.java | 162 ++++++++++++++++++ ...yMappingGroupByColumnSelectorStrategy.java | 143 ++++++++++++++++ .../column/MultiValueHelper.java | 14 ++ .../column/MultiValueHelpers.java | 14 ++ .../StringGroupByColumnSelectorStrategy.java | 14 +- .../segment/column/NullableTypeStrategy.java | 3 + .../druid/segment/column/TypeStrategy.java | 1 + 12 files changed, 384 insertions(+), 24 deletions(-) create mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java create mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java create mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java create mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelper.java create mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelpers.java diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/BufferHashGrouper.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/BufferHashGrouper.java index 167b322b9d45..2bb97c70ee12 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/BufferHashGrouper.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/BufferHashGrouper.java @@ -205,6 +205,7 @@ public int size() } // Sort offsets in-place. + // TODO(laksh): Perhaps this can utilise the MSQ's way of using byte comparisons Collections.sort( wrappedOffsets, (lhs, rhs) -> { diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java index 9ffc006799ce..aa404760ae63 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java @@ -46,6 +46,7 @@ import org.apache.druid.query.groupby.epinephelinae.column.ArrayStringGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.DictionaryBuildingStringGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.DoubleGroupByColumnSelectorStrategy; +import org.apache.druid.query.groupby.epinephelinae.column.FixedWidthGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.FloatGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.GroupByColumnSelectorPlus; import org.apache.druid.query.groupby.epinephelinae.column.GroupByColumnSelectorStrategy; @@ -250,17 +251,32 @@ public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( switch (capabilities.getType()) { case STRING: DimensionSelector dimSelector = (DimensionSelector) selector; - if (dimSelector.getValueCardinality() >= 0) { + if (dimSelector.getValueCardinality() >= 0 && dimSelector.nameLookupPossibleInAdvance()) { return new StringGroupByColumnSelectorStrategy(dimSelector::lookupName, capabilities); } else { return new DictionaryBuildingStringGroupByColumnSelectorStrategy(); } case LONG: - return makeNullableNumericStrategy(new LongGroupByColumnSelectorStrategy()); + return new FixedWidthGroupByColumnSelectorStrategy( + Byte.BYTES + Long.BYTES, + null, + true, + ColumnType.LONG + ); case FLOAT: - return makeNullableNumericStrategy(new FloatGroupByColumnSelectorStrategy()); + return new FixedWidthGroupByColumnSelectorStrategy( + Byte.BYTES + Float.BYTES, + null, + true, + ColumnType.FLOAT + ); case DOUBLE: - return makeNullableNumericStrategy(new DoubleGroupByColumnSelectorStrategy()); + return new FixedWidthGroupByColumnSelectorStrategy( + Byte.BYTES + Double.BYTES, + null, + true, + ColumnType.DOUBLE + ); case ARRAY: switch (capabilities.getElementType().getType()) { case LONG: @@ -280,14 +296,14 @@ public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( } } - private GroupByColumnSelectorStrategy makeNullableNumericStrategy(GroupByColumnSelectorStrategy delegate) - { - if (NullHandling.sqlCompatible()) { - return new NullableNumericGroupByColumnSelectorStrategy(delegate); - } else { - return delegate; - } - } +// private GroupByColumnSelectorStrategy makeNullableNumericStrategy(GroupByColumnSelectorStrategy delegate) +// { +// if (NullHandling.sqlCompatible()) { +// return new NullableNumericGroupByColumnSelectorStrategy(delegate); +// } else { +// return delegate; +// } +// } } private abstract static class GroupByEngineIterator implements Iterator, Closeable diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayNumericGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayNumericGroupByColumnSelectorStrategy.java index 62c479885021..3818b75a1a05 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayNumericGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayNumericGroupByColumnSelectorStrategy.java @@ -144,6 +144,7 @@ protected int addToIndexedDictionary(Object[] t) @Override public Grouper.BufferComparator bufferComparator(int keyBufferPosition, @Nullable StringComparator stringComparator) { + // TODO(laksh): This can be optimised probably if stringComparator == null StringComparator comparator = stringComparator == null ? StringComparators.NUMERIC : stringComparator; return (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> { Object[] lhs = dictionary.get(lhsBuffer.getInt(lhsPosition + keyBufferPosition)); diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java new file mode 100644 index 000000000000..1f9564ad0a6e --- /dev/null +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java @@ -0,0 +1,5 @@ +package org.apache.druid.query.groupby.epinephelinae.column; + +public class DictionaryBuildingGroupByColumnSelectorStrategy extends GroupByColumnSelectorStrategy +{ +} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingStringGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingStringGroupByColumnSelectorStrategy.java index dfc5149d52a5..63b7262df8cf 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingStringGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingStringGroupByColumnSelectorStrategy.java @@ -103,6 +103,16 @@ public int initColumnValues(ColumnValueSelector selector, int columnIndex, Objec return stateFootprintIncrease; } + /** + * Writes a dictionary ID to the grouping key. + */ + private void writeToKeyBuffer(int keyBufferPosition, int dictId, ByteBuffer keyBuffer) + { + keyBuffer.putInt(keyBufferPosition, dictId); + } + + + @Override public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) { diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java new file mode 100644 index 000000000000..dbf7574c4103 --- /dev/null +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java @@ -0,0 +1,162 @@ +package org.apache.druid.query.groupby.epinephelinae.column; + +import org.apache.druid.query.DimensionComparisonUtils; +import org.apache.druid.query.groupby.ResultRow; +import org.apache.druid.query.groupby.epinephelinae.Grouper; +import org.apache.druid.query.ordering.StringComparator; +import org.apache.druid.segment.ColumnValueSelector; +import org.apache.druid.segment.DimensionHandlerUtils; +import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.segment.column.NullableTypeStrategy; + +import javax.annotation.Nullable; +import javax.annotation.concurrent.NotThreadSafe; +import java.nio.ByteBuffer; + +// Used only by primitives right now, however specialized complex types can reuse this once we have a way to extract +// the required info +// Not thread safe because does weird stuff with buffer's position while reading +@NotThreadSafe +public class FixedWidthGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy +{ + + final int keySize; + @Nullable + final MultiValueHelper multiValueHelper; + final boolean isPrimitive; + final ColumnType columnType; + final NullableTypeStrategy nullableTypeStrategy; + + public FixedWidthGroupByColumnSelectorStrategy( + int keySize, + @Nullable MultiValueHelper multiValueHelper, + boolean isPrimitive, + ColumnType columnType + ) + { + this.keySize = keySize; + this.multiValueHelper = multiValueHelper; + this.isPrimitive = isPrimitive; + this.columnType = columnType; + this.nullableTypeStrategy = columnType.getNullableStrategy(); + } + + @Override + public int getGroupingKeySize() + { + return keySize; + } + + @Override + public void processValueFromGroupingKey( + GroupByColumnSelectorPlus selectorPlus, + ByteBuffer key, + ResultRow resultRow, + int keyBufferPosition + ) + { + resultRow.set( + selectorPlus.getResultRowPosition(), + nullableTypeStrategy.read(key, keyBufferPosition) + ); + } + + @Override + public int initColumnValues(ColumnValueSelector selector, int columnIndex, Object[] valuess) + { + // It is expected of the primitive selectors to be returning default value of the implementation here. In the + // getObject(), if it returns null, it won't +// if (selectorIsNull(selector)) { +// valuess[columnIndex] = null; +// } else { + // Here the primitive selectors should have returned correct values - float shouldn't return longs and vice versa + // Perhaps we'd require a cast as well, which is done implicitly when we call the .getLong/.getFloat/.getDouble +// valuess[columnIndex] = selector.getObject(); +// } + + valuess[columnIndex] = getValue(selector); + return 0; + } + + @Override + public void initGroupingKeyColumnValue( + int keyBufferPosition, + int dimensionIndex, + Object rowObj, + ByteBuffer keyBuffer, + int[] stack + ) + { + if (rowObj == null) { + nullableTypeStrategy.write(keyBuffer, keyBufferPosition, null, keySize); + } else { + nullableTypeStrategy.write(keyBuffer, keyBufferPosition, (T) rowObj, keySize); + stack[dimensionIndex] = 1; + } + } + + @Override + public boolean checkRowIndexAndAddValueToGroupingKey( + int keyBufferPosition, Object rowObj, int rowValIdx, ByteBuffer keyBuffer + ) + { + return false; + } + + @Override + public int writeToKeyBuffer( + int keyBufferPosition, + ColumnValueSelector selector, + ByteBuffer keyBuffer + ) + { + nullableTypeStrategy.write(keyBuffer, keyBufferPosition, getValue(selector), keySize); + return 0; + } + + @Override + public Grouper.BufferComparator bufferComparator( + int keyBufferPosition, + @Nullable StringComparator stringComparator + ) + { + return (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> { + T lhs = nullableTypeStrategy.read(lhsBuffer, lhsPosition + keyBufferPosition); + T rhs = nullableTypeStrategy.read(rhsBuffer, rhsPosition + keyBufferPosition); + if (stringComparator != null + && !DimensionComparisonUtils.isNaturalComparator(columnType.getType(), stringComparator)) { + return stringComparator.compare(String.valueOf(lhs), String.valueOf(rhs)); + } + // Nulls are allowed while comparing + return nullableTypeStrategy.compare(lhs, rhs); + }; + } + + + @Override + public void reset() + { + // Nothing to reset + } + + // unifies the primitive and th + private boolean selectorIsNull(ColumnValueSelector columnValueSelector) + { + if (isPrimitive && columnValueSelector.isNull()) { + return true; + } + return !isPrimitive && (columnValueSelector.getObject() == null); + } + + // Handles primitives as well, also might case + @Nullable + private T getValue(ColumnValueSelector columnValueSelector) + { + if (selectorIsNull(columnValueSelector)) { + return null; + } + // case is safe + return (T) DimensionHandlerUtils.convertObjectToType(columnValueSelector.getObject(), columnType); + } + +} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java new file mode 100644 index 000000000000..19514bdf81b0 --- /dev/null +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java @@ -0,0 +1,143 @@ +package org.apache.druid.query.groupby.epinephelinae.column; + +import com.google.common.base.Preconditions; +import org.apache.druid.java.util.common.Pair; +import org.apache.druid.query.DimensionComparisonUtils; +import org.apache.druid.query.groupby.ResultRow; +import org.apache.druid.query.groupby.epinephelinae.Grouper; +import org.apache.druid.query.ordering.StringComparator; +import org.apache.druid.segment.ColumnValueSelector; +import org.apache.druid.segment.DimensionHandlerUtils; +import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.segment.column.NullableTypeStrategy; + +import javax.annotation.Nullable; +import java.nio.ByteBuffer; + +public class KeyMappingGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy +{ + final int mappedKeySize; + @Nullable + final MultiValueHelper multiValueHelper; + final boolean isPrimitive; + final ColumnType columnType; + final NullableTypeStrategy nullableTypeStrategy; + final Object defaultValue; + final KeyMapper keyMapper; + + @Override + public int getGroupingKeySize() + { + return mappedKeySize; + } + + @Override + public void processValueFromGroupingKey( + GroupByColumnSelectorPlus selectorPlus, + ByteBuffer key, + ResultRow resultRow, + int keyBufferPosition + ) + { + final int id = key.getInt(keyBufferPosition); + if (id != GROUP_BY_MISSING_VALUE) { + resultRow.set(selectorPlus.getResultRowPosition(), keyMapper.idToKey(id)); + } else { + resultRow.set(selectorPlus.getResultRowPosition(), defaultValue); + } + } + + @Override + public int initColumnValues(ColumnValueSelector selector, int columnIndex, Object[] valuess) + { + Pair multiValueHolderAndSizeIncrease = multiValueHelper.getMultiValueHolder(selector, null); + valuess[columnIndex] = multiValueHelper.getMultiValueHolder(selector, null); + return 0; + } + + @Override + public void initGroupingKeyColumnValue( + int keyBufferPosition, + int dimensionIndex, + Object rowObj, + ByteBuffer keyBuffer, + int[] stack + ) + { + int rowSize = multiValueHelper.multiValueSize(rowObj); + if (rowSize == 0) { + keyBuffer.putInt(keyBufferPosition, GROUP_BY_MISSING_VALUE); + } else { + keyBuffer.putInt(keyBufferPosition, multiValueHelper.getIndividualValueDictId(rowObj, 0)); + } + } + + @Override + public boolean checkRowIndexAndAddValueToGroupingKey( int keyBufferPosition, Object rowObj, int rowValIdx, ByteBuffer keyBuffer) + { + int rowSize = multiValueHelper.multiValueSize(rowObj); + if (rowValIdx < rowSize) { + keyBuffer.putInt( + keyBufferPosition, + multiValueHelper.getIndividualValueDictId(rowObj, rowValIdx) + ); + return true; + } else { + return false; + } + } + + @Override + public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) + { + Object multiValueHolder = multiValueHelper.getMultiValueHolder(selector, null); + int multiValueSize = multiValueHelper.multiValueSize(multiValueHolder); + Preconditions.checkState(multiValueSize < 2, "Not supported for multi-value dimensions"); + final int dictId = multiValueSize == 1 + ? multiValueHelper.getIndividualValueDictId(multiValueHolder, 0) + : GROUP_BY_MISSING_VALUE; + keyBuffer.putInt(keyBufferPosition, dictId); + return 0; + } + + @Override + public Grouper.BufferComparator bufferComparator(int keyBufferPosition, @Nullable StringComparator stringComparator) + { + boolean usesNaturalComparator = + stringComparator == null + || DimensionComparisonUtils.isNaturalComparator(columnType.getType(), stringComparator); + if (keyMapper.canCompareIds() && usesNaturalComparator) { + return (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> Integer.compare( + lhsBuffer.getInt(lhsPosition + keyBufferPosition), + rhsBuffer.getInt(rhsPosition + keyBufferPosition) + ); + } else { + return (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> { + Object lhsObject = keyMapper.idToKey(lhsBuffer.getInt(lhsPosition + keyBufferPosition)); + Object rhsObject = keyMapper.idToKey(rhsBuffer.getInt(rhsPosition + keyBufferPosition)); + if (usesNaturalComparator) { + return nullableTypeStrategy.compare( + (DimensionType) DimensionHandlerUtils.convertObjectToType(lhsObject, columnType), + (DimensionType) DimensionHandlerUtils.convertObjectToType(rhsObject, columnType) + ); + } else { + return stringComparator.compare(String.valueOf(lhsObject), String.valueOf(rhsObject)); + } + }; + } + } + + @Override + public void reset() + { + + } + + // Doesn't handle GROUP_BY_MISSING_VALUE, should be done by the callers + public interface KeyMapper + { + KeyType idToKey(int id); + + boolean canCompareIds(); + } +} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelper.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelper.java new file mode 100644 index 000000000000..bcb675ed9a98 --- /dev/null +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelper.java @@ -0,0 +1,14 @@ +package org.apache.druid.query.groupby.epinephelinae.column; + +import org.apache.druid.java.util.common.Pair; +import org.apache.druid.segment.ColumnValueSelector; + +// Don't really use HolderType anywhere for now, we cast stuff everywhere, but perhaps with new selectors, we can +public interface MultiValueHelper +{ + Pair getMultiValueHolder(ColumnValueSelector selector, HolderType reusableValue); + + int multiValueSize(HolderType multiValueHolder); + + Pair getIndividualValueDictId(HolderType multiValueHolder, int index); +} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelpers.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelpers.java new file mode 100644 index 000000000000..c330e4d9130a --- /dev/null +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelpers.java @@ -0,0 +1,14 @@ +package org.apache.druid.query.groupby.epinephelinae.column; + +public class MultiValueHelpers +{ + MultiValueHelper multiValueHelperForDimensionSelectors() + { + return null; + } + + MultiValueHelper multiValueHelperForSingleValueSelectors() + { + return null; + } +} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/StringGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/StringGroupByColumnSelectorStrategy.java index 8c25c775d71d..12b6da11129d 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/StringGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/StringGroupByColumnSelectorStrategy.java @@ -84,16 +84,6 @@ public int initColumnValues(ColumnValueSelector selector, int columnIndex, Objec return 0; } - /** - * Writes a dictionary ID to the grouping key. - * - * Protected so subclasses can access it, like {@link DictionaryBuildingStringGroupByColumnSelectorStrategy}. - */ - protected void writeToKeyBuffer(int keyBufferPosition, int dictId, ByteBuffer keyBuffer) - { - keyBuffer.putInt(keyBufferPosition, dictId); - } - @Override public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) { @@ -117,7 +107,7 @@ public void initGroupingKeyColumnValue( IndexedInts row = (IndexedInts) rowObj; int rowSize = row.size(); - initializeGroupingKeyV2Dimension(row, rowSize, keyBuffer, keyBufferPosition); + initializeGroupingKeyDimension(row, rowSize, keyBuffer, keyBufferPosition); stack[dimensionIndex] = rowSize == 0 ? 0 : 1; } @@ -143,7 +133,7 @@ public boolean checkRowIndexAndAddValueToGroupingKey( } } - private void initializeGroupingKeyV2Dimension( + private void initializeGroupingKeyDimension( final IndexedInts values, final int rowSize, final ByteBuffer keyBuffer, diff --git a/processing/src/main/java/org/apache/druid/segment/column/NullableTypeStrategy.java b/processing/src/main/java/org/apache/druid/segment/column/NullableTypeStrategy.java index f7ed0298cc55..ad5af7089655 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/NullableTypeStrategy.java +++ b/processing/src/main/java/org/apache/druid/segment/column/NullableTypeStrategy.java @@ -21,6 +21,7 @@ import org.apache.druid.common.config.NullHandling; +import javax.annotation.CheckReturnValue; import javax.annotation.Nullable; import java.nio.ByteBuffer; import java.util.Comparator; @@ -66,6 +67,7 @@ public T read(ByteBuffer buffer) return delegate.read(buffer); } + @CheckReturnValue public int write(ByteBuffer buffer, @Nullable T value, int maxSizeBytes) { final int max = Math.min(buffer.limit() - buffer.position(), maxSizeBytes); @@ -112,6 +114,7 @@ public boolean readRetainsBufferReference() return delegate.readRetainsBufferReference(); } + @CheckReturnValue public int write(ByteBuffer buffer, int offset, @Nullable T value, int maxSizeBytes) { final int oldPosition = buffer.position(); diff --git a/processing/src/main/java/org/apache/druid/segment/column/TypeStrategy.java b/processing/src/main/java/org/apache/druid/segment/column/TypeStrategy.java index e4856f889714..888e8203f4c1 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/TypeStrategy.java +++ b/processing/src/main/java/org/apache/druid/segment/column/TypeStrategy.java @@ -143,6 +143,7 @@ default T read(ByteBuffer buffer, int offset) * Callers MUST check that the return value is positive which indicates a successful write, while a negative response * a partial write. * + * // TODO(laksh): Can be optimised for the primitive types * @return number of bytes written */ default int write(ByteBuffer buffer, int offset, T value, int maxSizeBytes) From 731d2732c90c450293a1b450f06af4db1e09a908 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Wed, 6 Mar 2024 17:20:22 +0530 Subject: [PATCH 02/46] draft work --- .../epinephelinae/DictionaryBuilding.java | 9 ++ ...BuildingGroupByColumnSelectorStrategy.java | 97 ++++++++++++++++++- ...xedWidthGroupByColumnSelectorStrategy.java | 6 +- ...yMappingGroupByColumnSelectorStrategy.java | 53 +++++++--- .../{MultiValueHelper.java => KeyToId.java} | 4 +- .../column/MultiValueHelpers.java | 4 +- ...ryStringGroupByColumnSelectorStrategy.java | 61 ++++++++++++ 7 files changed, 210 insertions(+), 24 deletions(-) rename processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/{MultiValueHelper.java => KeyToId.java} (75%) create mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java index a8f16d2b2ec5..2684dd7aae4f 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java @@ -23,6 +23,7 @@ import it.unimi.dsi.fastutil.objects.Object2IntMap; import it.unimi.dsi.fastutil.objects.Object2IntOpenCustomHashMap; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import it.unimi.dsi.fastutil.objects.Object2IntRBTreeMap; import org.apache.druid.error.DruidException; import org.apache.druid.segment.DimensionDictionary; import org.apache.druid.segment.column.TypeSignature; @@ -30,6 +31,7 @@ import java.util.ArrayList; import java.util.Arrays; +import java.util.Comparator; import java.util.List; /** @@ -70,6 +72,13 @@ private static Object2IntMap createReverseDictionary(final Hash.Strategy< return m; } + public static Object2IntRBTreeMap createTreeSortedReverseDictionary(Comparator comparator) + { + final Object2IntRBTreeMap m = new Object2IntRBTreeMap<>(comparator); + m.defaultReturnValue(DimensionDictionary.ABSENT_VALUE_ID); + return m; + } + /** * Creates a reverse dictionary for arrays of primitive types. */ diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java index 1f9564ad0a6e..62bb51f3932e 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java @@ -1,5 +1,100 @@ package org.apache.druid.query.groupby.epinephelinae.column; -public class DictionaryBuildingGroupByColumnSelectorStrategy extends GroupByColumnSelectorStrategy +import it.unimi.dsi.fastutil.objects.Object2IntMap; +import org.apache.druid.java.util.common.Pair; +import org.apache.druid.query.groupby.epinephelinae.DictionaryBuilding; +import org.apache.druid.segment.ColumnValueSelector; +import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.segment.column.NullableTypeStrategy; + +import javax.annotation.Nullable; +import java.util.List; + +public class DictionaryBuildingGroupByColumnSelectorStrategy extends KeyMappingGroupByColumnSelectorStrategy { + + NullableTypeStrategy nullableTypeStrategy; + private final List dictionary = DictionaryBuilding.createDictionary(); + private final Object2IntMap reverseDictionary = DictionaryBuilding.createTreeSortedReverseDictionary( + nullableTypeStrategy); + + public DictionaryBuildingGroupByColumnSelectorStrategy( + @Nullable KeyToId keyToId, + ColumnType columnType, + NullableTypeStrategy nullableTypeStrategy, + Object defaultValue, + KeyMapper keyMapper + ) + { + // For Strings + KeyToId keyToId1 = new KeyToId() + { + @Override + public Pair getMultiValueHolder( + ColumnValueSelector selector, + Object reusableValue + ) + { + return null; + } + + @Override + public int multiValueSize(Object multiValueHolder) + { + return 0; + } + + @Override + public Pair getIndividualValueDictId( + Object multiValueHolder, + int index + ) + { + return null; + } + }; + + // For other types + KeyToId keyToId2 = new KeyToId() + { + @Override + public Pair getMultiValueHolder( + ColumnValueSelector selector, + Object reusableValue + ) + { + return + } + + @Override + public int multiValueSize(Object multiValueHolder) + { + return 0; + } + + @Override + public Pair getIndividualValueDictId(Object multiValueHolder, int index) + { + return null; + } + }; + + KeyMapper keyMapper1 = new KeyMapper() + { + @Override + public Object idToKey(int id) + { + if (id != GROUP_BY_MISSING_VALUE) { + return dictionary.get(id); + } + return defaultValue; + } + + @Override + public boolean canCompareIds() + { + return false; + } + }; + } } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java index dbf7574c4103..5404eb631257 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java @@ -22,20 +22,20 @@ public class FixedWidthGroupByColumnSelectorStrategy implements GroupByColumn final int keySize; @Nullable - final MultiValueHelper multiValueHelper; + final KeyToId keyToId; final boolean isPrimitive; final ColumnType columnType; final NullableTypeStrategy nullableTypeStrategy; public FixedWidthGroupByColumnSelectorStrategy( int keySize, - @Nullable MultiValueHelper multiValueHelper, + @Nullable KeyToId keyToId, boolean isPrimitive, ColumnType columnType ) { this.keySize = keySize; - this.multiValueHelper = multiValueHelper; + this.keyToId = keyToId; this.isPrimitive = isPrimitive; this.columnType = columnType; this.nullableTypeStrategy = columnType.getNullableStrategy(); diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java index 19514bdf81b0..520c12860343 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java @@ -16,19 +16,32 @@ public class KeyMappingGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy { - final int mappedKeySize; @Nullable - final MultiValueHelper multiValueHelper; - final boolean isPrimitive; + final KeyToId keyToId; final ColumnType columnType; final NullableTypeStrategy nullableTypeStrategy; final Object defaultValue; final KeyMapper keyMapper; + public KeyMappingGroupByColumnSelectorStrategy( + @Nullable final KeyToId keyToId, + final ColumnType columnType, + final NullableTypeStrategy nullableTypeStrategy, + final Object defaultValue, + final KeyMapper keyMapper + ) + { + this.keyToId = keyToId; + this.columnType = columnType; + this.nullableTypeStrategy = nullableTypeStrategy; + this.defaultValue = defaultValue; + this.keyMapper = keyMapper; + } + @Override public int getGroupingKeySize() { - return mappedKeySize; + return Integer.BYTES; } @Override @@ -50,9 +63,9 @@ public void processValueFromGroupingKey( @Override public int initColumnValues(ColumnValueSelector selector, int columnIndex, Object[] valuess) { - Pair multiValueHolderAndSizeIncrease = multiValueHelper.getMultiValueHolder(selector, null); - valuess[columnIndex] = multiValueHelper.getMultiValueHolder(selector, null); - return 0; + Pair multiValueHolderAndSizeIncrease = keyToId.getMultiValueHolder(selector, null); + valuess[columnIndex] = multiValueHolderAndSizeIncrease.lhs; + return multiValueHolderAndSizeIncrease.rhs; } @Override @@ -64,22 +77,29 @@ public void initGroupingKeyColumnValue( int[] stack ) { - int rowSize = multiValueHelper.multiValueSize(rowObj); + int rowSize = keyToId.multiValueSize(rowObj); if (rowSize == 0) { keyBuffer.putInt(keyBufferPosition, GROUP_BY_MISSING_VALUE); } else { - keyBuffer.putInt(keyBufferPosition, multiValueHelper.getIndividualValueDictId(rowObj, 0)); + // No need to check here, since we'd have already accounted for it when we call + // initColumnValues + keyBuffer.putInt(keyBufferPosition, keyToId.getIndividualValueDictId(rowObj, 0).lhs); } } @Override - public boolean checkRowIndexAndAddValueToGroupingKey( int keyBufferPosition, Object rowObj, int rowValIdx, ByteBuffer keyBuffer) + public boolean checkRowIndexAndAddValueToGroupingKey( + int keyBufferPosition, + Object rowObj, + int rowValIdx, + ByteBuffer keyBuffer + ) { - int rowSize = multiValueHelper.multiValueSize(rowObj); + int rowSize = keyToId.multiValueSize(rowObj); if (rowValIdx < rowSize) { keyBuffer.putInt( keyBufferPosition, - multiValueHelper.getIndividualValueDictId(rowObj, rowValIdx) + keyToId.getIndividualValueDictId(rowObj, rowValIdx).lhs ); return true; } else { @@ -90,14 +110,15 @@ public boolean checkRowIndexAndAddValueToGroupingKey( int keyBufferPosition, Obj @Override public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) { - Object multiValueHolder = multiValueHelper.getMultiValueHolder(selector, null); - int multiValueSize = multiValueHelper.multiValueSize(multiValueHolder); + Object multiValueHolder = keyToId.getMultiValueHolder(selector, null); + int multiValueSize = keyToId.multiValueSize(multiValueHolder); Preconditions.checkState(multiValueSize < 2, "Not supported for multi-value dimensions"); + Pair dictIdAndSizeIncrease = keyToId.getIndividualValueDictId(multiValueHolder, 0); final int dictId = multiValueSize == 1 - ? multiValueHelper.getIndividualValueDictId(multiValueHolder, 0) + ? dictIdAndSizeIncrease.lhs : GROUP_BY_MISSING_VALUE; keyBuffer.putInt(keyBufferPosition, dictId); - return 0; + return dictIdAndSizeIncrease.rhs; } @Override diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelper.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyToId.java similarity index 75% rename from processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelper.java rename to processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyToId.java index bcb675ed9a98..a099199a264a 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelper.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyToId.java @@ -4,11 +4,11 @@ import org.apache.druid.segment.ColumnValueSelector; // Don't really use HolderType anywhere for now, we cast stuff everywhere, but perhaps with new selectors, we can -public interface MultiValueHelper +public interface KeyToId { Pair getMultiValueHolder(ColumnValueSelector selector, HolderType reusableValue); int multiValueSize(HolderType multiValueHolder); - Pair getIndividualValueDictId(HolderType multiValueHolder, int index); + Pair getIndividualValueDictId(HolderType multiValueHolder, int index); } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelpers.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelpers.java index c330e4d9130a..068ead2ede70 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelpers.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelpers.java @@ -2,12 +2,12 @@ public class MultiValueHelpers { - MultiValueHelper multiValueHelperForDimensionSelectors() + KeyToId multiValueHelperForDimensionSelectors() { return null; } - MultiValueHelper multiValueHelperForSingleValueSelectors() + KeyToId multiValueHelperForSingleValueSelectors() { return null; } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java new file mode 100644 index 000000000000..5808a5c661f2 --- /dev/null +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java @@ -0,0 +1,61 @@ +package org.apache.druid.query.groupby.epinephelinae.column; + +import org.apache.druid.java.util.common.Pair; +import org.apache.druid.segment.ColumnValueSelector; +import org.apache.druid.segment.DimensionSelector; +import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.segment.column.NullableTypeStrategy; +import org.apache.druid.segment.data.IndexedInts; + +import javax.annotation.Nullable; + +public class PrebuiltDictionaryStringGroupByColumnSelectorStrategy extends KeyMappingGroupByColumnSelectorStrategy +{ + public PrebuiltDictionaryStringGroupByColumnSelectorStrategy( + @Nullable KeyToId keyToId, + ColumnType columnType, + NullableTypeStrategy nullableTypeStrategy, + Object defaultValue, + KeyMapper keyMapper + ) + { + DimensionSelector dimS; + KeyToId keyToId1 = new KeyToId() + { + @Override + public Pair getMultiValueHolder(ColumnValueSelector selector, Object reusableValue) + { + return Pair.of(((DimensionSelector) selector).getRow(), 0); + } + + @Override + public int multiValueSize(Object multiValueHolder) + { + return ((IndexedInts) multiValueHolder).size(); + } + + @Override + public Pair getIndividualValueDictId(Object multiValueHolder, int index) + { + return Pair.of(((IndexedInts) multiValueHolder).get(index), 0); + } + }; + + KeyMapper keyMapper1 = new KeyMapper() + { + @Override + public String idToKey(int id) + { + return dimS.lookupName(id); + } + + @Override + public boolean canCompareIds() + { + return false; + } + }; + } + + +} From b01344a103df890b78a305ba4364e9470cabc93f Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Thu, 7 Mar 2024 12:16:57 +0530 Subject: [PATCH 03/46] more --- ...BuildingGroupByColumnSelectorStrategy.java | 86 ++++++++++++++++--- ...yMappingGroupByColumnSelectorStrategy.java | 47 ++++++---- .../groupby/epinephelinae/column/KeyToId.java | 10 +-- ...ryStringGroupByColumnSelectorStrategy.java | 2 - 4 files changed, 106 insertions(+), 39 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java index 62bb51f3932e..5fc550e0fba4 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java @@ -4,18 +4,25 @@ import org.apache.druid.java.util.common.Pair; import org.apache.druid.query.groupby.epinephelinae.DictionaryBuilding; import org.apache.druid.segment.ColumnValueSelector; +import org.apache.druid.segment.DimensionHandlerUtils; +import org.apache.druid.segment.DimensionSelector; import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.column.NullableTypeStrategy; +import org.apache.druid.segment.data.ArrayBasedIndexedInts; +import org.apache.druid.segment.data.IndexedInts; import javax.annotation.Nullable; +import javax.annotation.concurrent.NotThreadSafe; import java.util.List; +import java.util.function.Function; +@NotThreadSafe public class DictionaryBuildingGroupByColumnSelectorStrategy extends KeyMappingGroupByColumnSelectorStrategy { NullableTypeStrategy nullableTypeStrategy; - private final List dictionary = DictionaryBuilding.createDictionary(); - private final Object2IntMap reverseDictionary = DictionaryBuilding.createTreeSortedReverseDictionary( + private final List dictionary = DictionaryBuilding.createDictionary(); + private final Object2IntMap reverseDictionary = DictionaryBuilding.createTreeSortedReverseDictionary( nullableTypeStrategy); public DictionaryBuildingGroupByColumnSelectorStrategy( @@ -29,53 +36,99 @@ public DictionaryBuildingGroupByColumnSelectorStrategy( // For Strings KeyToId keyToId1 = new KeyToId() { + final Function footprintCompute; @Override public Pair getMultiValueHolder( ColumnValueSelector selector, - Object reusableValue + Object reusableValue // Optimisation to not create and allocate something new ) { - return null; + final DimensionSelector dimensionSelector = (DimensionSelector) selector; + final IndexedInts row = dimensionSelector.getRow(); + int footprintIncrease = 0; + ArrayBasedIndexedInts newRow = (ArrayBasedIndexedInts) reusableValue; + if (newRow == null) { + newRow = new ArrayBasedIndexedInts(); + } + int rowSize = row.size(); + newRow.ensureSize(rowSize); + for(int i = 0; i < rowSize; ++i) { + final String value = dimensionSelector.lookupName(row.get(i)); + final int dictId = reverseDictionary.getInt(value); + if (dictId < 0) { + final int nextId = dictionary.size(); + dictionary.add(value); + reverseDictionary.put(value, nextId); + newRow.setValue(i, nextId); + footprintIncrease += DictionaryBuilding.estimateEntryFootprint(footprintCompute.apply(value)); + } else { + newRow.setValue(i, dictId); + } + } + newRow.setSize(rowSize); + return Pair.of(newRow, footprintIncrease); } @Override public int multiValueSize(Object multiValueHolder) { - return 0; + return ((IndexedInts) multiValueHolder).size(); } @Override - public Pair getIndividualValueDictId( - Object multiValueHolder, - int index - ) + public Pair getIndividualValueDictId(Object multiValueHolder, int index) { - return null; + // Already converted it to the dictionary id + return Pair.of(((IndexedInts) multiValueHolder).get(index), 0); } }; // For other types KeyToId keyToId2 = new KeyToId() { + final Function footprintCompute; + + // Assert that Object in the return type will be properly casted @Override public Pair getMultiValueHolder( ColumnValueSelector selector, Object reusableValue ) { - return + final Object value = DimensionHandlerUtils.convertObjectToType(selector.getObject(), columnType); + final int dictId = reverseDictionary.getInt(value); + int footprintIncrease = 0; + if (dictId < 0) { + final int size = dictionary.size(); + dictionary.add(value); + reverseDictionary.put(value, size); + footprintIncrease = DictionaryBuilding.estimateEntryFootprint(footprintCompute.apply(value)); + + } + return Pair.of(value, footprintIncrease); } @Override public int multiValueSize(Object multiValueHolder) { - return 0; + return multiValueHolder == null ? 0 : 1; } @Override public Pair getIndividualValueDictId(Object multiValueHolder, int index) { - return null; + assert index == 0; + int dictId = reverseDictionary.getInt(multiValueHolder); + int footprintIncrease = 0; + if (dictId < 0) { + final int size = dictionary.size(); + dictionary.add(multiValueHolder); + reverseDictionary.put(multiValueHolder, size); + dictId = size; + // TODO(laksh): confirm if this is the same for sorted dictionaries as well + footprintIncrease = DictionaryBuilding.estimateEntryFootprint(footprintCompute.apply(multiValueHolder)); + } + return Pair.of(dictId, footprintIncrease); } }; @@ -97,4 +150,11 @@ public boolean canCompareIds() } }; } + + @Override + public void reset() + { + dictionary.clear(); + reverseDictionary.clear(); + } } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java index 520c12860343..b98f033e647e 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java @@ -14,20 +14,24 @@ import javax.annotation.Nullable; import java.nio.ByteBuffer; -public class KeyMappingGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy +// Only supports int mapping. +// DimensionType is the dimension's type - eg strings +// DimensionHolderType is the multi value holder for the dimension, if it exists, else it will be same as DimensionType +public class KeyMappingGroupByColumnSelectorStrategy + implements GroupByColumnSelectorStrategy { - @Nullable - final KeyToId keyToId; + final KeyToId keyToId; final ColumnType columnType; final NullableTypeStrategy nullableTypeStrategy; - final Object defaultValue; + final DimensionType defaultValue; final KeyMapper keyMapper; - public KeyMappingGroupByColumnSelectorStrategy( - @Nullable final KeyToId keyToId, + // Restricted access, callers should use one of it's subclasses + KeyMappingGroupByColumnSelectorStrategy( + final KeyToId keyToId, final ColumnType columnType, final NullableTypeStrategy nullableTypeStrategy, - final Object defaultValue, + final DimensionType defaultValue, final KeyMapper keyMapper ) { @@ -63,7 +67,7 @@ public void processValueFromGroupingKey( @Override public int initColumnValues(ColumnValueSelector selector, int columnIndex, Object[] valuess) { - Pair multiValueHolderAndSizeIncrease = keyToId.getMultiValueHolder(selector, null); + Pair multiValueHolderAndSizeIncrease = keyToId.getMultiValueHolder(selector, null); valuess[columnIndex] = multiValueHolderAndSizeIncrease.lhs; return multiValueHolderAndSizeIncrease.rhs; } @@ -77,13 +81,16 @@ public void initGroupingKeyColumnValue( int[] stack ) { - int rowSize = keyToId.multiValueSize(rowObj); + // It is always called with the DimensionHolderType, created + //noinspection unchecked + DimensionHolderType rowObjCasted = (DimensionHolderType) rowObj; + int rowSize = keyToId.multiValueSize(rowObjCasted); if (rowSize == 0) { keyBuffer.putInt(keyBufferPosition, GROUP_BY_MISSING_VALUE); } else { // No need to check here, since we'd have already accounted for it when we call // initColumnValues - keyBuffer.putInt(keyBufferPosition, keyToId.getIndividualValueDictId(rowObj, 0).lhs); + keyBuffer.putInt(keyBufferPosition, keyToId.getIndividualValueDictId(rowObjCasted, 0).lhs); } } @@ -95,11 +102,12 @@ public boolean checkRowIndexAndAddValueToGroupingKey( ByteBuffer keyBuffer ) { - int rowSize = keyToId.multiValueSize(rowObj); + DimensionHolderType rowObjCasted = (DimensionHolderType) rowObj; + int rowSize = keyToId.multiValueSize(rowObjCasted); if (rowValIdx < rowSize) { keyBuffer.putInt( keyBufferPosition, - keyToId.getIndividualValueDictId(rowObj, rowValIdx).lhs + keyToId.getIndividualValueDictId(rowObjCasted, rowValIdx).lhs ); return true; } else { @@ -110,15 +118,16 @@ public boolean checkRowIndexAndAddValueToGroupingKey( @Override public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) { - Object multiValueHolder = keyToId.getMultiValueHolder(selector, null); - int multiValueSize = keyToId.multiValueSize(multiValueHolder); + Pair multiValueHolder = keyToId.getMultiValueHolder(selector, null); + int multiValueSize = keyToId.multiValueSize(multiValueHolder.lhs); Preconditions.checkState(multiValueSize < 2, "Not supported for multi-value dimensions"); - Pair dictIdAndSizeIncrease = keyToId.getIndividualValueDictId(multiValueHolder, 0); - final int dictId = multiValueSize == 1 - ? dictIdAndSizeIncrease.lhs - : GROUP_BY_MISSING_VALUE; + Pair dictIdAndSizeIncrease = keyToId.getIndividualValueDictId(multiValueHolder.lhs, 0); + final int dictId = multiValueSize == 1 ? dictIdAndSizeIncrease.lhs : GROUP_BY_MISSING_VALUE; keyBuffer.putInt(keyBufferPosition, dictId); - return dictIdAndSizeIncrease.rhs; + + // The implementations must return a non-nullable and non-negative size increase + //noinspection ConstantConditions + return multiValueHolder.rhs + dictIdAndSizeIncrease.rhs; } @Override diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyToId.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyToId.java index a099199a264a..80ba689ae194 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyToId.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyToId.java @@ -3,12 +3,12 @@ import org.apache.druid.java.util.common.Pair; import org.apache.druid.segment.ColumnValueSelector; -// Don't really use HolderType anywhere for now, we cast stuff everywhere, but perhaps with new selectors, we can -public interface KeyToId +// Don't really use DimensionHolderType anywhere for now, we cast stuff everywhere, but perhaps with new selectors, we can +public interface KeyToId { - Pair getMultiValueHolder(ColumnValueSelector selector, HolderType reusableValue); + Pair getMultiValueHolder(ColumnValueSelector selector, DimensionHolderType reusableValue); - int multiValueSize(HolderType multiValueHolder); + int multiValueSize(DimensionHolderType multiValueHolder); - Pair getIndividualValueDictId(HolderType multiValueHolder, int index); + Pair getIndividualValueDictId(DimensionHolderType multiValueHolder, int index); } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java index 5808a5c661f2..abc88fd4c74f 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java @@ -56,6 +56,4 @@ public boolean canCompareIds() } }; } - - } From 6e1686556c55db6a625187b9f4ed505fb531add8 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Thu, 7 Mar 2024 17:25:58 +0530 Subject: [PATCH 04/46] some stuff working --- .../epinephelinae/GroupByQueryEngine.java | 14 +- ...BuildingGroupByColumnSelectorStrategy.java | 314 +++++++++++------- ...yToId.java => DimensionToIdConverter.java} | 2 +- ...xedWidthGroupByColumnSelectorStrategy.java | 6 +- .../column/IdToDimensionConverter.java | 9 + ...yMappingGroupByColumnSelectorStrategy.java | 45 ++- .../column/MultiValueHelpers.java | 4 +- ...ryStringGroupByColumnSelectorStrategy.java | 127 ++++--- 8 files changed, 324 insertions(+), 197 deletions(-) rename processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/{KeyToId.java => DimensionToIdConverter.java} (90%) create mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java index aa404760ae63..1461d3ef3a00 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java @@ -41,17 +41,11 @@ import org.apache.druid.query.groupby.GroupByQueryMetrics; import org.apache.druid.query.groupby.GroupingEngine; import org.apache.druid.query.groupby.ResultRow; -import org.apache.druid.query.groupby.epinephelinae.column.ArrayDoubleGroupByColumnSelectorStrategy; -import org.apache.druid.query.groupby.epinephelinae.column.ArrayLongGroupByColumnSelectorStrategy; -import org.apache.druid.query.groupby.epinephelinae.column.ArrayStringGroupByColumnSelectorStrategy; +import org.apache.druid.query.groupby.epinephelinae.column.DictionaryBuildingGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.DictionaryBuildingStringGroupByColumnSelectorStrategy; -import org.apache.druid.query.groupby.epinephelinae.column.DoubleGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.FixedWidthGroupByColumnSelectorStrategy; -import org.apache.druid.query.groupby.epinephelinae.column.FloatGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.GroupByColumnSelectorPlus; import org.apache.druid.query.groupby.epinephelinae.column.GroupByColumnSelectorStrategy; -import org.apache.druid.query.groupby.epinephelinae.column.LongGroupByColumnSelectorStrategy; -import org.apache.druid.query.groupby.epinephelinae.column.NullableNumericGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.StringGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.orderby.DefaultLimitSpec; import org.apache.druid.query.groupby.orderby.OrderByColumnSpec; @@ -280,11 +274,11 @@ public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( case ARRAY: switch (capabilities.getElementType().getType()) { case LONG: - return new ArrayLongGroupByColumnSelectorStrategy(); + return DictionaryBuildingGroupByColumnSelectorStrategy.forType(ColumnType.LONG_ARRAY); case STRING: - return new ArrayStringGroupByColumnSelectorStrategy(); + return DictionaryBuildingGroupByColumnSelectorStrategy.forType(ColumnType.STRING_ARRAY); case DOUBLE: - return new ArrayDoubleGroupByColumnSelectorStrategy(); + return DictionaryBuildingGroupByColumnSelectorStrategy.forType(ColumnType.DOUBLE_ARRAY); case FLOAT: // Array not supported in expressions, ingestion default: diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java index 5fc550e0fba4..4ceb0c1906ea 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java @@ -1,6 +1,8 @@ package org.apache.druid.query.groupby.epinephelinae.column; import it.unimi.dsi.fastutil.objects.Object2IntMap; +import org.apache.druid.common.config.NullHandling; +import org.apache.druid.error.DruidException; import org.apache.druid.java.util.common.Pair; import org.apache.druid.query.groupby.epinephelinae.DictionaryBuilding; import org.apache.druid.segment.ColumnValueSelector; @@ -8,147 +10,231 @@ import org.apache.druid.segment.DimensionSelector; import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.column.NullableTypeStrategy; +import org.apache.druid.segment.column.ValueType; import org.apache.druid.segment.data.ArrayBasedIndexedInts; import org.apache.druid.segment.data.IndexedInts; -import javax.annotation.Nullable; -import javax.annotation.concurrent.NotThreadSafe; import java.util.List; -import java.util.function.Function; -@NotThreadSafe -public class DictionaryBuildingGroupByColumnSelectorStrategy extends KeyMappingGroupByColumnSelectorStrategy +public class DictionaryBuildingGroupByColumnSelectorStrategy + extends KeyMappingGroupByColumnSelectorStrategy { - NullableTypeStrategy nullableTypeStrategy; - private final List dictionary = DictionaryBuilding.createDictionary(); - private final Object2IntMap reverseDictionary = DictionaryBuilding.createTreeSortedReverseDictionary( - nullableTypeStrategy); + private final List dictionary; + private final Object2IntMap reverseDictionary; - public DictionaryBuildingGroupByColumnSelectorStrategy( - @Nullable KeyToId keyToId, + private DictionaryBuildingGroupByColumnSelectorStrategy( + DimensionToIdConverter dimensionToIdConverter, ColumnType columnType, - NullableTypeStrategy nullableTypeStrategy, - Object defaultValue, - KeyMapper keyMapper + NullableTypeStrategy nullableTypeStrategy, + DimensionType defaultValue, + IdToDimensionConverter idToDimensionConverter, + List dictionary, + Object2IntMap reverseDictionary ) { - // For Strings - KeyToId keyToId1 = new KeyToId() - { - final Function footprintCompute; - @Override - public Pair getMultiValueHolder( - ColumnValueSelector selector, - Object reusableValue // Optimisation to not create and allocate something new - ) - { - final DimensionSelector dimensionSelector = (DimensionSelector) selector; - final IndexedInts row = dimensionSelector.getRow(); - int footprintIncrease = 0; - ArrayBasedIndexedInts newRow = (ArrayBasedIndexedInts) reusableValue; - if (newRow == null) { - newRow = new ArrayBasedIndexedInts(); - } - int rowSize = row.size(); - newRow.ensureSize(rowSize); - for(int i = 0; i < rowSize; ++i) { - final String value = dimensionSelector.lookupName(row.get(i)); - final int dictId = reverseDictionary.getInt(value); - if (dictId < 0) { - final int nextId = dictionary.size(); - dictionary.add(value); - reverseDictionary.put(value, nextId); - newRow.setValue(i, nextId); - footprintIncrease += DictionaryBuilding.estimateEntryFootprint(footprintCompute.apply(value)); - } else { - newRow.setValue(i, dictId); - } - } - newRow.setSize(rowSize); - return Pair.of(newRow, footprintIncrease); - } + super(dimensionToIdConverter, columnType, nullableTypeStrategy, defaultValue, idToDimensionConverter); + this.dictionary = dictionary; + this.reverseDictionary = reverseDictionary; + } - @Override - public int multiValueSize(Object multiValueHolder) - { - return ((IndexedInts) multiValueHolder).size(); - } + public static GroupByColumnSelectorStrategy forType(final ColumnType columnType) + { + // Any way to use the generics here instead of + if (columnType.equals(ColumnType.STRING)) { + return forString(); + } else if (columnType.equals(ColumnType.DOUBLE) || columnType.equals(ColumnType.FLOAT) || columnType.equals( + ColumnType.LONG)) { + throw DruidException.defensive("Could used a fixed width strategy"); + } - @Override - public Pair getIndividualValueDictId(Object multiValueHolder, int index) - { - // Already converted it to the dictionary id - return Pair.of(((IndexedInts) multiValueHolder).get(index), 0); - } - }; + return forArrayAndComplexTypes(columnType); + } + + private static GroupByColumnSelectorStrategy forString() + { + final List dictionary = DictionaryBuilding.createDictionary(); + final Object2IntMap reverseDictionary = + DictionaryBuilding.createTreeSortedReverseDictionary(ColumnType.STRING.getNullableStrategy()); + return new DictionaryBuildingGroupByColumnSelectorStrategy<>( + new StringDimensionToIdConverter(dictionary, reverseDictionary), + ColumnType.STRING, + ColumnType.STRING.getNullableStrategy(), + NullHandling.defaultStringValue(), + new DictionaryIdToDimensionConverter<>(dictionary), + dictionary, + reverseDictionary + ); + } - // For other types - KeyToId keyToId2 = new KeyToId() + // Nothing different about primitive and non-primitive types, however the primitive types are fixed width, therefore + // don't need to use dictionary building strategy. Also, it simplifies the generics because now everything can be treated + // as Object + private static GroupByColumnSelectorStrategy forArrayAndComplexTypes(final ColumnType columnType) + { + // No concept of multi values, therefore DimensionType == DimensionHolderType == Object. For rogue selectors, which + // can return weird representation of arrays, we cast it using DimensionHandlerUtils, therefore the type might not be strictly + // same, but it would be what the callers expect + final List dictionary = DictionaryBuilding.createDictionary(); + final Object2IntMap reverseDictionary = + DictionaryBuilding.createTreeSortedReverseDictionary(columnType.getNullableStrategy()); + return new DictionaryBuildingGroupByColumnSelectorStrategy<>( + new UniValueDimensionToIdConverter(dictionary, reverseDictionary, columnType, columnType.getNullableStrategy()), + columnType, + columnType.getNullableStrategy(), + null, + new DictionaryIdToDimensionConverter<>(dictionary), + dictionary, + reverseDictionary + ); + } + + private static class StringDimensionToIdConverter implements DimensionToIdConverter + { + + private final List dictionary; + private final Object2IntMap reverseDictionary; + + public StringDimensionToIdConverter( + List dictionary, + Object2IntMap reverseDictionary + ) { - final Function footprintCompute; - - // Assert that Object in the return type will be properly casted - @Override - public Pair getMultiValueHolder( - ColumnValueSelector selector, - Object reusableValue - ) - { - final Object value = DimensionHandlerUtils.convertObjectToType(selector.getObject(), columnType); + this.dictionary = dictionary; + this.reverseDictionary = reverseDictionary; + } + + @Override + public Pair getMultiValueHolder( + final ColumnValueSelector selector, + final IndexedInts reusableValue + ) + { + final DimensionSelector dimensionSelector = (DimensionSelector) selector; + final IndexedInts row = dimensionSelector.getRow(); + int footprintIncrease = 0; + ArrayBasedIndexedInts newRow = (ArrayBasedIndexedInts) reusableValue; + if (newRow == null) { + newRow = new ArrayBasedIndexedInts(); + } + int rowSize = row.size(); + newRow.ensureSize(rowSize); + for (int i = 0; i < rowSize; ++i) { + final String value = dimensionSelector.lookupName(row.get(i)); final int dictId = reverseDictionary.getInt(value); - int footprintIncrease = 0; if (dictId < 0) { - final int size = dictionary.size(); + final int nextId = dictionary.size(); dictionary.add(value); - reverseDictionary.put(value, size); - footprintIncrease = DictionaryBuilding.estimateEntryFootprint(footprintCompute.apply(value)); - + reverseDictionary.put(value, nextId); + newRow.setValue(i, nextId); + footprintIncrease += DictionaryBuilding.estimateEntryFootprint( + (value == null ? 0 : value.length()) * Character.BYTES + ); + } else { + newRow.setValue(i, dictId); } - return Pair.of(value, footprintIncrease); } + newRow.setSize(rowSize); + return Pair.of(newRow, footprintIncrease); + } - @Override - public int multiValueSize(Object multiValueHolder) - { - return multiValueHolder == null ? 0 : 1; - } + @Override + public int multiValueSize(IndexedInts multiValueHolder) + { + return multiValueHolder.size(); + } - @Override - public Pair getIndividualValueDictId(Object multiValueHolder, int index) - { - assert index == 0; - int dictId = reverseDictionary.getInt(multiValueHolder); - int footprintIncrease = 0; - if (dictId < 0) { - final int size = dictionary.size(); - dictionary.add(multiValueHolder); - reverseDictionary.put(multiValueHolder, size); - dictId = size; - // TODO(laksh): confirm if this is the same for sorted dictionaries as well - footprintIncrease = DictionaryBuilding.estimateEntryFootprint(footprintCompute.apply(multiValueHolder)); - } - return Pair.of(dictId, footprintIncrease); - } - }; + @Override + public Pair getIndividualValueDictId(IndexedInts multiValueHolder, int index) + { + // Already converted it to the dictionary id + return Pair.of(multiValueHolder.get(index), 0); + } + } + + private static class UniValueDimensionToIdConverter implements DimensionToIdConverter + { + private final List dictionary; + private final Object2IntMap reverseDictionary; + private final ColumnType columnType; + private final NullableTypeStrategy nullableTypeStrategy; - KeyMapper keyMapper1 = new KeyMapper() + public UniValueDimensionToIdConverter( + final List dictionary, + final Object2IntMap reverseDictionary, + final ColumnType columnType, + final NullableTypeStrategy nullableTypeStrategy + ) { - @Override - public Object idToKey(int id) - { - if (id != GROUP_BY_MISSING_VALUE) { - return dictionary.get(id); - } - return defaultValue; + this.dictionary = dictionary; + this.reverseDictionary = reverseDictionary; + this.columnType = columnType; + this.nullableTypeStrategy = nullableTypeStrategy; + } + + @Override + public Pair getMultiValueHolder(ColumnValueSelector selector, Object reusableValue) + { + final Object value = DimensionHandlerUtils.convertObjectToType(selector.getObject(), columnType); + final int dictId = reverseDictionary.getInt(value); + int footprintIncrease = 0; + if (dictId < 0) { + final int size = dictionary.size(); + dictionary.add(value); + reverseDictionary.put(value, size); + footprintIncrease = DictionaryBuilding.estimateEntryFootprint(nullableTypeStrategy.estimateSizeBytes(value)); + } + return Pair.of(value, footprintIncrease); + } + + @Override + public int multiValueSize(Object multiValueHolder) + { + return multiValueHolder == null ? 0 : 1; + } - @Override - public boolean canCompareIds() - { - return false; + @Override + public Pair getIndividualValueDictId(Object multiValueHolder, int index) + { + assert index == 0; + int dictId = reverseDictionary.getInt(multiValueHolder); + int footprintIncrease = 0; + if (dictId < 0) { + final int size = dictionary.size(); + dictionary.add(multiValueHolder); + reverseDictionary.put(multiValueHolder, size); + dictId = size; + // TODO(laksh): confirm if this is the same for sorted dictionaries as well + footprintIncrease = DictionaryBuilding.estimateEntryFootprint(nullableTypeStrategy.estimateSizeBytes( + multiValueHolder)); } - }; + return Pair.of(dictId, footprintIncrease); + + } + } + + private static class DictionaryIdToDimensionConverter implements IdToDimensionConverter + { + private final List dictionary; + + public DictionaryIdToDimensionConverter(List dictionary) + { + this.dictionary = dictionary; + } + + // Don't need to handle default id value + @Override + public DimensionType idToKey(int id) + { + return dictionary.get(id); + } + + @Override + public boolean canCompareIds() + { + return false; + } } @Override diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyToId.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java similarity index 90% rename from processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyToId.java rename to processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java index 80ba689ae194..872e8e8fd022 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyToId.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java @@ -4,7 +4,7 @@ import org.apache.druid.segment.ColumnValueSelector; // Don't really use DimensionHolderType anywhere for now, we cast stuff everywhere, but perhaps with new selectors, we can -public interface KeyToId +public interface DimensionToIdConverter { Pair getMultiValueHolder(ColumnValueSelector selector, DimensionHolderType reusableValue); diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java index 5404eb631257..34c823db6c64 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java @@ -22,20 +22,20 @@ public class FixedWidthGroupByColumnSelectorStrategy implements GroupByColumn final int keySize; @Nullable - final KeyToId keyToId; + final DimensionToIdConverter dimensionToIdConverter; final boolean isPrimitive; final ColumnType columnType; final NullableTypeStrategy nullableTypeStrategy; public FixedWidthGroupByColumnSelectorStrategy( int keySize, - @Nullable KeyToId keyToId, + @Nullable DimensionToIdConverter dimensionToIdConverter, boolean isPrimitive, ColumnType columnType ) { this.keySize = keySize; - this.keyToId = keyToId; + this.dimensionToIdConverter = dimensionToIdConverter; this.isPrimitive = isPrimitive; this.columnType = columnType; this.nullableTypeStrategy = columnType.getNullableStrategy(); diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java new file mode 100644 index 000000000000..64d9bc96d849 --- /dev/null +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java @@ -0,0 +1,9 @@ +package org.apache.druid.query.groupby.epinephelinae.column; + +// Doesn't handle GROUP_BY_MISSING_VALUE, should be done by the callers +public interface IdToDimensionConverter +{ + DimensionType idToKey(int id); + + boolean canCompareIds(); +} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java index b98f033e647e..5546bdd133e9 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java @@ -12,34 +12,36 @@ import org.apache.druid.segment.column.NullableTypeStrategy; import javax.annotation.Nullable; +import javax.annotation.concurrent.NotThreadSafe; import java.nio.ByteBuffer; // Only supports int mapping. // DimensionType is the dimension's type - eg strings // DimensionHolderType is the multi value holder for the dimension, if it exists, else it will be same as DimensionType +@NotThreadSafe public class KeyMappingGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy { - final KeyToId keyToId; + final DimensionToIdConverter dimensionToIdConverter; final ColumnType columnType; final NullableTypeStrategy nullableTypeStrategy; final DimensionType defaultValue; - final KeyMapper keyMapper; + final IdToDimensionConverter idToDimensionConverter; // Restricted access, callers should use one of it's subclasses KeyMappingGroupByColumnSelectorStrategy( - final KeyToId keyToId, + final DimensionToIdConverter dimensionToIdConverter, final ColumnType columnType, final NullableTypeStrategy nullableTypeStrategy, final DimensionType defaultValue, - final KeyMapper keyMapper + final IdToDimensionConverter idToDimensionConverter ) { - this.keyToId = keyToId; + this.dimensionToIdConverter = dimensionToIdConverter; this.columnType = columnType; this.nullableTypeStrategy = nullableTypeStrategy; this.defaultValue = defaultValue; - this.keyMapper = keyMapper; + this.idToDimensionConverter = idToDimensionConverter; } @Override @@ -58,7 +60,7 @@ public void processValueFromGroupingKey( { final int id = key.getInt(keyBufferPosition); if (id != GROUP_BY_MISSING_VALUE) { - resultRow.set(selectorPlus.getResultRowPosition(), keyMapper.idToKey(id)); + resultRow.set(selectorPlus.getResultRowPosition(), idToDimensionConverter.idToKey(id)); } else { resultRow.set(selectorPlus.getResultRowPosition(), defaultValue); } @@ -67,7 +69,7 @@ public void processValueFromGroupingKey( @Override public int initColumnValues(ColumnValueSelector selector, int columnIndex, Object[] valuess) { - Pair multiValueHolderAndSizeIncrease = keyToId.getMultiValueHolder(selector, null); + Pair multiValueHolderAndSizeIncrease = dimensionToIdConverter.getMultiValueHolder(selector, null); valuess[columnIndex] = multiValueHolderAndSizeIncrease.lhs; return multiValueHolderAndSizeIncrease.rhs; } @@ -84,13 +86,13 @@ public void initGroupingKeyColumnValue( // It is always called with the DimensionHolderType, created //noinspection unchecked DimensionHolderType rowObjCasted = (DimensionHolderType) rowObj; - int rowSize = keyToId.multiValueSize(rowObjCasted); + int rowSize = dimensionToIdConverter.multiValueSize(rowObjCasted); if (rowSize == 0) { keyBuffer.putInt(keyBufferPosition, GROUP_BY_MISSING_VALUE); } else { // No need to check here, since we'd have already accounted for it when we call // initColumnValues - keyBuffer.putInt(keyBufferPosition, keyToId.getIndividualValueDictId(rowObjCasted, 0).lhs); + keyBuffer.putInt(keyBufferPosition, dimensionToIdConverter.getIndividualValueDictId(rowObjCasted, 0).lhs); } } @@ -103,11 +105,11 @@ public boolean checkRowIndexAndAddValueToGroupingKey( ) { DimensionHolderType rowObjCasted = (DimensionHolderType) rowObj; - int rowSize = keyToId.multiValueSize(rowObjCasted); + int rowSize = dimensionToIdConverter.multiValueSize(rowObjCasted); if (rowValIdx < rowSize) { keyBuffer.putInt( keyBufferPosition, - keyToId.getIndividualValueDictId(rowObjCasted, rowValIdx).lhs + dimensionToIdConverter.getIndividualValueDictId(rowObjCasted, rowValIdx).lhs ); return true; } else { @@ -118,10 +120,10 @@ public boolean checkRowIndexAndAddValueToGroupingKey( @Override public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) { - Pair multiValueHolder = keyToId.getMultiValueHolder(selector, null); - int multiValueSize = keyToId.multiValueSize(multiValueHolder.lhs); + Pair multiValueHolder = dimensionToIdConverter.getMultiValueHolder(selector, null); + int multiValueSize = dimensionToIdConverter.multiValueSize(multiValueHolder.lhs); Preconditions.checkState(multiValueSize < 2, "Not supported for multi-value dimensions"); - Pair dictIdAndSizeIncrease = keyToId.getIndividualValueDictId(multiValueHolder.lhs, 0); + Pair dictIdAndSizeIncrease = dimensionToIdConverter.getIndividualValueDictId(multiValueHolder.lhs, 0); final int dictId = multiValueSize == 1 ? dictIdAndSizeIncrease.lhs : GROUP_BY_MISSING_VALUE; keyBuffer.putInt(keyBufferPosition, dictId); @@ -136,15 +138,15 @@ public Grouper.BufferComparator bufferComparator(int keyBufferPosition, @Nullabl boolean usesNaturalComparator = stringComparator == null || DimensionComparisonUtils.isNaturalComparator(columnType.getType(), stringComparator); - if (keyMapper.canCompareIds() && usesNaturalComparator) { + if (idToDimensionConverter.canCompareIds() && usesNaturalComparator) { return (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> Integer.compare( lhsBuffer.getInt(lhsPosition + keyBufferPosition), rhsBuffer.getInt(rhsPosition + keyBufferPosition) ); } else { return (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> { - Object lhsObject = keyMapper.idToKey(lhsBuffer.getInt(lhsPosition + keyBufferPosition)); - Object rhsObject = keyMapper.idToKey(rhsBuffer.getInt(rhsPosition + keyBufferPosition)); + Object lhsObject = idToDimensionConverter.idToKey(lhsBuffer.getInt(lhsPosition + keyBufferPosition)); + Object rhsObject = idToDimensionConverter.idToKey(rhsBuffer.getInt(rhsPosition + keyBufferPosition)); if (usesNaturalComparator) { return nullableTypeStrategy.compare( (DimensionType) DimensionHandlerUtils.convertObjectToType(lhsObject, columnType), @@ -163,11 +165,4 @@ public void reset() } - // Doesn't handle GROUP_BY_MISSING_VALUE, should be done by the callers - public interface KeyMapper - { - KeyType idToKey(int id); - - boolean canCompareIds(); - } } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelpers.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelpers.java index 068ead2ede70..073a16c6be7a 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelpers.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelpers.java @@ -2,12 +2,12 @@ public class MultiValueHelpers { - KeyToId multiValueHelperForDimensionSelectors() + DimensionToIdConverter multiValueHelperForDimensionSelectors() { return null; } - KeyToId multiValueHelperForSingleValueSelectors() + DimensionToIdConverter multiValueHelperForSingleValueSelectors() { return null; } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java index abc88fd4c74f..c27e76f05924 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java @@ -1,59 +1,102 @@ package org.apache.druid.query.groupby.epinephelinae.column; +import org.apache.druid.common.config.NullHandling; +import org.apache.druid.error.DruidException; import org.apache.druid.java.util.common.Pair; import org.apache.druid.segment.ColumnValueSelector; import org.apache.druid.segment.DimensionSelector; +import org.apache.druid.segment.column.ColumnCapabilities; import org.apache.druid.segment.column.ColumnType; -import org.apache.druid.segment.column.NullableTypeStrategy; import org.apache.druid.segment.data.IndexedInts; import javax.annotation.Nullable; -public class PrebuiltDictionaryStringGroupByColumnSelectorStrategy extends KeyMappingGroupByColumnSelectorStrategy +// Note: Avoiding anonymous classes +// This is more of a helper class, as it just creates an instance of the KeyMappingGroupingColumnSelectorStrategy +public class PrebuiltDictionaryStringGroupByColumnSelectorStrategy { - public PrebuiltDictionaryStringGroupByColumnSelectorStrategy( - @Nullable KeyToId keyToId, - ColumnType columnType, - NullableTypeStrategy nullableTypeStrategy, - Object defaultValue, - KeyMapper keyMapper + + public static GroupByColumnSelectorStrategy forType( + final ColumnType columnType, + final ColumnValueSelector columnValueSelector, + final ColumnCapabilities columnCapabilities + ) + { + if (columnType.equals(ColumnType.STRING)) { + return forString(columnValueSelector, columnCapabilities); + } else { + // This can change with array columns + throw DruidException.defensive("Only string columns expose prebuilt dictionaries"); + } + } + + private static GroupByColumnSelectorStrategy forString( + final ColumnValueSelector columnValueSelector, + final ColumnCapabilities columnCapabilities ) { - DimensionSelector dimS; - KeyToId keyToId1 = new KeyToId() + return new KeyMappingGroupByColumnSelectorStrategy<>( + new StringDimensionToIdConverter(), + ColumnType.STRING, + ColumnType.STRING.getNullableStrategy(), + NullHandling.defaultStringValue(), + new StringIdToDimensionConverter((DimensionSelector) columnValueSelector, columnCapabilities) + ); + } + + private static class StringDimensionToIdConverter implements DimensionToIdConverter + { + @Override + public Pair getMultiValueHolder( + final ColumnValueSelector selector, + final IndexedInts reusableValue + ) { - @Override - public Pair getMultiValueHolder(ColumnValueSelector selector, Object reusableValue) - { - return Pair.of(((DimensionSelector) selector).getRow(), 0); - } - - @Override - public int multiValueSize(Object multiValueHolder) - { - return ((IndexedInts) multiValueHolder).size(); - } - - @Override - public Pair getIndividualValueDictId(Object multiValueHolder, int index) - { - return Pair.of(((IndexedInts) multiValueHolder).get(index), 0); - } - }; - - KeyMapper keyMapper1 = new KeyMapper() + return Pair.of(((DimensionSelector) selector).getRow(), 0); + } + + @Override + public int multiValueSize(IndexedInts multiValueHolder) + { + return multiValueHolder.size(); + } + + @Override + public Pair getIndividualValueDictId(IndexedInts multiValueHolder, int index) + { + return Pair.of(multiValueHolder.get(index), 0); + } + } + + private static class StringIdToDimensionConverter implements IdToDimensionConverter + { + + final DimensionSelector dimensionSelector; + @Nullable + final ColumnCapabilities columnCapabilities; + + public StringIdToDimensionConverter( + final DimensionSelector dimensionSelector, + @Nullable final ColumnCapabilities columnCapabilities + ) + { + this.dimensionSelector = dimensionSelector; + this.columnCapabilities = columnCapabilities; + } + + @Override + public String idToKey(int id) + { + return dimensionSelector.lookupName(id); + } + + @Override + public boolean canCompareIds() { - @Override - public String idToKey(int id) - { - return dimS.lookupName(id); - } - - @Override - public boolean canCompareIds() - { - return false; - } - }; + return columnCapabilities != null + && columnCapabilities.hasBitmapIndexes() + && (columnCapabilities.areDictionaryValuesSorted() + .and(columnCapabilities.areDictionaryValuesUnique())).isTrue(); + } } } From 718eef95bfe871ea7efb1489a99950984250923a Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Fri, 8 Mar 2024 01:23:55 +0530 Subject: [PATCH 05/46] group by complex col working --- .../epinephelinae/RowBasedGrouperHelper.java | 79 +++++++++++++++++++ ...BuildingGroupByColumnSelectorStrategy.java | 1 - .../druid/segment/DimensionHandlerUtils.java | 8 +- .../druid/sql/calcite/rel/DruidQuery.java | 16 ++-- .../calcite/CalciteNestedDataQueryTest.java | 16 ++++ .../druid/sql/calcite/QueryTestRunner.java | 2 +- 6 files changed, 110 insertions(+), 12 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java index 22761f5c9d3d..a5feb6b8c650 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java @@ -89,8 +89,10 @@ import java.util.Arrays; import java.util.BitSet; import java.util.Comparator; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.function.Function; import java.util.function.Predicate; @@ -1190,6 +1192,10 @@ private static class RowBasedKeySerde implements Grouper.KeySerde doubleArrayDictionary; private final Object2IntMap reverseDoubleArrayDictionary; + // We can probably use same dictionary for all the complex types, if all of them are done using hash mapping + private final Map> complexTypeDictionaries = new HashMap<>(); + private final Map> complexTypeReverseDictionaries = new HashMap<>(); + // Size limiting for the dictionary, in (roughly estimated) bytes. private final long maxDictionarySize; @@ -1435,6 +1441,12 @@ private RowBasedKeySerdeHelper makeSerdeHelper( ) { switch (valueType.getType()) { + case COMPLEX: + if (stringComparator != null + && !DimensionComparisonUtils.isNaturalComparator(valueType.getType(), stringComparator)) { + throw DruidException.defensive("Unexpected string comparator supplied"); + } + return new ComplexRowBasedKeySerdeHelper(keyBufferPosition, valueType); case ARRAY: switch (valueType.getElementType().getType()) { case STRING: @@ -1523,6 +1535,73 @@ private RowBasedKeySerdeHelper makeNumericSerdeHelper( } } + private class ComplexRowBasedKeySerdeHelper implements RowBasedKeySerdeHelper + { + final int keyBufferPosition; + final BufferComparator bufferComparator; + final ColumnType complexType; + final String complexTypeName; + + final List complexTypeDictionary; + final Object2IntMap complexTypeReverseDictionary; + + public ComplexRowBasedKeySerdeHelper( + int keyBufferPosition, + ColumnType complexType + ) + { + this.keyBufferPosition = keyBufferPosition; + this.complexType = complexType; + this.complexTypeName = Preconditions.checkNotNull(complexType.getComplexTypeName(), "complex type name expected"); + this.complexTypeDictionary = complexTypeDictionaries.computeIfAbsent( + complexTypeName, + ignored -> DictionaryBuilding.createDictionary() + ); + this.complexTypeReverseDictionary = complexTypeReverseDictionaries.computeIfAbsent( + complexTypeName, + ignored -> DictionaryBuilding.createTreeSortedReverseDictionary(complexType.getNullableStrategy()) + ); + this.bufferComparator = (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> + complexType.getNullableStrategy().compare( + complexTypeDictionary.get(lhsBuffer.getInt(lhsPosition + keyBufferPosition)), + complexTypeDictionary.get(rhsBuffer.getInt(rhsPosition + keyBufferPosition)) + ); + + }; + + @Override + public int getKeyBufferValueSize() + { + return Integer.BYTES; + } + + @Override + public boolean putToKeyBuffer(RowBasedKey key, int idx) + { + final Object obj = key.getKey()[idx]; + int id = complexTypeReverseDictionary.getInt(obj); + if (id == DimensionDictionary.ABSENT_VALUE_ID) { + id = complexTypeDictionary.size(); + complexTypeReverseDictionary.put(obj, id); + complexTypeDictionary.add(obj); + } + keyBuffer.putInt(id); + return true; + } + + @Override + public void getFromByteBuffer(ByteBuffer buffer, int initialOffset, int dimValIdx, Object[] dimValues) + { + dimValues[dimValIdx] = complexTypeDictionary.get(buffer.getInt(initialOffset + keyBufferPosition)); + } + + @Override + public BufferComparator getBufferComparator() + { + return bufferComparator; + } + } + private class ArrayNumericRowBasedKeySerdeHelper implements RowBasedKeySerdeHelper { final int keyBufferPosition; diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java index 4ceb0c1906ea..71bd10e6859a 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java @@ -10,7 +10,6 @@ import org.apache.druid.segment.DimensionSelector; import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.column.NullableTypeStrategy; -import org.apache.druid.segment.column.ValueType; import org.apache.druid.segment.data.ArrayBasedIndexedInts; import org.apache.druid.segment.data.IndexedInts; diff --git a/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java b/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java index 1ca911a95448..87fa9243bc79 100644 --- a/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java +++ b/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java @@ -25,6 +25,7 @@ import com.google.common.primitives.Floats; import org.apache.druid.common.guava.GuavaUtils; import org.apache.druid.data.input.impl.DimensionSchema.MultiValueHandling; +import org.apache.druid.error.DruidException; import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.parsers.ParseException; @@ -413,9 +414,12 @@ public static Object convertObjectToType( case DOUBLE: return coerceToObjectArrayWithElementCoercionFunction(obj, DimensionHandlerUtils::convertObjectToDouble); } - + case COMPLEX: + // Can't coerce complex objects, and we shouldn't need to. If in future selectors behave weirdly, or we need to + // cast them (for some unknown reason), we can have that casting knowledge in the type strategy + return obj; default: - throw new IAE("Type[%s] is not supported for dimensions!", type); + throw DruidException.defensive("Type[%s] is not supported for dimensions!", type); } } diff --git a/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java b/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java index 6e0bab212771..d0292fc84fb8 100644 --- a/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java +++ b/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java @@ -486,14 +486,14 @@ private static List computeDimensions( final RelDataType dataType = rexNode.getType(); final ColumnType outputType = Calcites.getColumnTypeForRelDataType(dataType); - if (Types.isNullOr(outputType, ValueType.COMPLEX)) { - // Can't group on unknown or COMPLEX types. - plannerContext.setPlanningError( - "SQL requires a group-by on a column of type %s that is unsupported.", - outputType - ); - throw new CannotBuildQueryException(aggregate, rexNode); - } +// if (Types.isNullOr(outputType, ValueType.COMPLEX)) { +// // Can't group on unknown or COMPLEX types. +// plannerContext.setPlanningError( +// "SQL requires a group-by on a column of type %s that is unsupported.", +// outputType +// ); +// throw new CannotBuildQueryException(aggregate, rexNode); +// } final String dimOutputName = outputNamePrefix + outputNameCounter++; if (!druidExpression.isSimpleExtraction()) { diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java index 6c177e76e7ba..6ee89033b6f4 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java @@ -108,6 +108,12 @@ public class CalciteNestedDataQueryTest extends BaseCalciteQueryTest .put("long", 4L) .put("nester", "hello") .build(), + ImmutableMap.builder() + .put("t", "2000-01-01") + .put("string", "bbb") + .put("long", 4L) + .put("nester", "hello") + .build(), ImmutableMap.builder() .put("t", "2000-01-01") .put("string", "ccc") @@ -538,6 +544,16 @@ public void testTopNPath() ); } + @Test + public void testGroupByNested() + { + testQuery( + "SELECT nester, COUNT(*) FROM druid.nested GROUP BY 1", + ImmutableList.of(), + ImmutableList.of() + ); + } + @Test public void testGroupByRootPath() { diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/QueryTestRunner.java b/sql/src/test/java/org/apache/druid/sql/calcite/QueryTestRunner.java index 1dd1df4eea8c..117ac4de2fe3 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/QueryTestRunner.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/QueryTestRunner.java @@ -381,7 +381,7 @@ public VerifyNativeQueries(BaseExecuteQuery execStep) public void verify() { for (QueryResults queryResults : execStep.results()) { - verifyQuery(queryResults); +// verifyQuery(queryResults); } } From cbb1181ebf6853e4d84ffe0be97118d84820e04d Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Fri, 8 Mar 2024 02:47:54 +0530 Subject: [PATCH 06/46] fixup --- .../query/groupby/epinephelinae/GroupByQueryEngine.java | 9 +++++++-- .../DictionaryBuildingGroupByColumnSelectorStrategy.java | 1 + .../column/KeyMappingGroupByColumnSelectorStrategy.java | 2 ++ .../org/apache/druid/sql/calcite/CalciteQueryTest.java | 2 +- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java index 1461d3ef3a00..d40dce1ab11e 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java @@ -46,6 +46,7 @@ import org.apache.druid.query.groupby.epinephelinae.column.FixedWidthGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.GroupByColumnSelectorPlus; import org.apache.druid.query.groupby.epinephelinae.column.GroupByColumnSelectorStrategy; +import org.apache.druid.query.groupby.epinephelinae.column.PrebuiltDictionaryStringGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.StringGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.orderby.DefaultLimitSpec; import org.apache.druid.query.groupby.orderby.OrderByColumnSpec; @@ -246,9 +247,13 @@ public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( case STRING: DimensionSelector dimSelector = (DimensionSelector) selector; if (dimSelector.getValueCardinality() >= 0 && dimSelector.nameLookupPossibleInAdvance()) { - return new StringGroupByColumnSelectorStrategy(dimSelector::lookupName, capabilities); + return PrebuiltDictionaryStringGroupByColumnSelectorStrategy.forType( + ColumnType.STRING, + selector, + capabilities + ); } else { - return new DictionaryBuildingStringGroupByColumnSelectorStrategy(); + return DictionaryBuildingGroupByColumnSelectorStrategy.forType(ColumnType.STRING); } case LONG: return new FixedWidthGroupByColumnSelectorStrategy( diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java index 71bd10e6859a..4ec30e5b6995 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java @@ -199,6 +199,7 @@ public Pair getIndividualValueDictId(Object multiValueHolder, assert index == 0; int dictId = reverseDictionary.getInt(multiValueHolder); int footprintIncrease = 0; + // Even if called again, then this is no-op if (dictId < 0) { final int size = dictionary.size(); dictionary.add(multiValueHolder); diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java index 5546bdd133e9..73a9ac6096d8 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java @@ -89,10 +89,12 @@ public void initGroupingKeyColumnValue( int rowSize = dimensionToIdConverter.multiValueSize(rowObjCasted); if (rowSize == 0) { keyBuffer.putInt(keyBufferPosition, GROUP_BY_MISSING_VALUE); + stack[dimensionIndex] = 0; } else { // No need to check here, since we'd have already accounted for it when we call // initColumnValues keyBuffer.putInt(keyBufferPosition, dimensionToIdConverter.getIndividualValueDictId(rowObjCasted, 0).lhs); + stack[dimensionIndex] = 1; } } diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteQueryTest.java index 4d9614132d80..16d75b4094c9 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteQueryTest.java @@ -5856,7 +5856,7 @@ public void testUnplannableExactCountDistinctOnSketch() // COUNT DISTINCT on a sketch cannot be exact. assertQueryIsUnplannable( PLANNER_CONFIG_NO_HLL, - "SELECT COUNT(distinct unique_dim1) FROM druid.foo", + "SELECT unique_dim1, COUNT(*) FROM druid.foo GROUP BY 1", "SQL requires a group-by on a column of type COMPLEX that is unsupported." ); } From 33de9a8a1310e68d63959bebe02fa7974103f9bb Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Fri, 8 Mar 2024 02:52:53 +0530 Subject: [PATCH 07/46] fixup --- .../druid/query/groupby/epinephelinae/GroupByQueryEngine.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java index d40dce1ab11e..6e88709e0685 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java @@ -263,14 +263,14 @@ public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( ColumnType.LONG ); case FLOAT: - return new FixedWidthGroupByColumnSelectorStrategy( + return new FixedWidthGroupByColumnSelectorStrategy( Byte.BYTES + Float.BYTES, null, true, ColumnType.FLOAT ); case DOUBLE: - return new FixedWidthGroupByColumnSelectorStrategy( + return new FixedWidthGroupByColumnSelectorStrategy( Byte.BYTES + Double.BYTES, null, true, From f1394f72f27076f9fdeff0450d7a3ee86ee9adc4 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Tue, 12 Mar 2024 16:05:47 +0530 Subject: [PATCH 08/46] remove original strategies --- .../epinephelinae/GroupByQueryEngine.java | 11 - ...ayDoubleGroupByColumnSelectorStrategy.java | 58 ---- ...rrayLongGroupByColumnSelectorStrategy.java | 56 ---- ...yNumericGroupByColumnSelectorStrategy.java | 197 ------------- ...ayStringGroupByColumnSelectorStrategy.java | 272 ------------------ ...ngStringGroupByColumnSelectorStrategy.java | 162 ----------- .../DoubleGroupByColumnSelectorStrategy.java | 112 -------- .../FloatGroupByColumnSelectorStrategy.java | 116 -------- .../column/GroupByColumnSelectorStrategy.java | 2 +- .../LongGroupByColumnSelectorStrategy.java | 116 -------- .../column/MultiValueHelpers.java | 14 - ...eNumericGroupByColumnSelectorStrategy.java | 151 ---------- .../StringGroupByColumnSelectorStrategy.java | 178 ------------ ...alueStringGroupByVectorColumnSelector.java | 3 +- ...ubleGroupByColumnSelectorStrategyTest.java | 159 ---------- ...LongGroupByColumnSelectorStrategyTest.java | 163 ----------- ...ringGroupByColumnSelectorStrategyTest.java | 168 ----------- ...ringGroupByColumnSelectorStrategyTest.java | 109 ------- 18 files changed, 3 insertions(+), 2044 deletions(-) delete mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayDoubleGroupByColumnSelectorStrategy.java delete mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayLongGroupByColumnSelectorStrategy.java delete mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayNumericGroupByColumnSelectorStrategy.java delete mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayStringGroupByColumnSelectorStrategy.java delete mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingStringGroupByColumnSelectorStrategy.java delete mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DoubleGroupByColumnSelectorStrategy.java delete mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FloatGroupByColumnSelectorStrategy.java delete mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/LongGroupByColumnSelectorStrategy.java delete mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelpers.java delete mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/NullableNumericGroupByColumnSelectorStrategy.java delete mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/StringGroupByColumnSelectorStrategy.java delete mode 100644 processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayDoubleGroupByColumnSelectorStrategyTest.java delete mode 100644 processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayLongGroupByColumnSelectorStrategyTest.java delete mode 100644 processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayStringGroupByColumnSelectorStrategyTest.java delete mode 100644 processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/StringGroupByColumnSelectorStrategyTest.java diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java index 6e88709e0685..dfbffd072b63 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java @@ -42,12 +42,10 @@ import org.apache.druid.query.groupby.GroupingEngine; import org.apache.druid.query.groupby.ResultRow; import org.apache.druid.query.groupby.epinephelinae.column.DictionaryBuildingGroupByColumnSelectorStrategy; -import org.apache.druid.query.groupby.epinephelinae.column.DictionaryBuildingStringGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.FixedWidthGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.GroupByColumnSelectorPlus; import org.apache.druid.query.groupby.epinephelinae.column.GroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.PrebuiltDictionaryStringGroupByColumnSelectorStrategy; -import org.apache.druid.query.groupby.epinephelinae.column.StringGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.orderby.DefaultLimitSpec; import org.apache.druid.query.groupby.orderby.OrderByColumnSpec; import org.apache.druid.query.ordering.StringComparator; @@ -294,15 +292,6 @@ public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( throw new IAE("Cannot create query type helper from invalid type [%s]", capabilities.asTypeString()); } } - -// private GroupByColumnSelectorStrategy makeNullableNumericStrategy(GroupByColumnSelectorStrategy delegate) -// { -// if (NullHandling.sqlCompatible()) { -// return new NullableNumericGroupByColumnSelectorStrategy(delegate); -// } else { -// return delegate; -// } -// } } private abstract static class GroupByEngineIterator implements Iterator, Closeable diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayDoubleGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayDoubleGroupByColumnSelectorStrategy.java deleted file mode 100644 index e1a7e940de26..000000000000 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayDoubleGroupByColumnSelectorStrategy.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -import org.apache.druid.java.util.common.ISE; -import org.apache.druid.segment.ColumnValueSelector; -import org.apache.druid.segment.column.ColumnType; -import org.apache.druid.segment.column.ValueType; - -import java.util.Arrays; -import java.util.List; - -public class ArrayDoubleGroupByColumnSelectorStrategy extends ArrayNumericGroupByColumnSelectorStrategy -{ - public ArrayDoubleGroupByColumnSelectorStrategy() - { - super(Double.BYTES, ColumnType.DOUBLE_ARRAY); - } - - @Override - protected int computeDictionaryId(ColumnValueSelector selector) - { - Object object = selector.getObject(); - if (object == null) { - return GROUP_BY_MISSING_VALUE; - } else if (object instanceof Double) { - return addToIndexedDictionary(new Object[]{object}); - } else if (object instanceof List) { - return addToIndexedDictionary(((List) object).toArray()); - } else if (object instanceof Double[]) { - // Defensive check, since we don't usually expect to encounter Double[] objects from selectors - return addToIndexedDictionary(Arrays.stream((Double[]) object).toArray()); - } else if (object instanceof Object[]) { - return addToIndexedDictionary((Object[]) object); - } else { - throw new ISE("Found unexpected object type [%s] in %s array.", object.getClass().getName(), ValueType.DOUBLE); - } - } -} - - diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayLongGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayLongGroupByColumnSelectorStrategy.java deleted file mode 100644 index 49cd91baedf9..000000000000 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayLongGroupByColumnSelectorStrategy.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -import org.apache.druid.java.util.common.ISE; -import org.apache.druid.segment.ColumnValueSelector; -import org.apache.druid.segment.column.ColumnType; -import org.apache.druid.segment.column.ValueType; - -import java.util.Arrays; -import java.util.List; - -public class ArrayLongGroupByColumnSelectorStrategy extends ArrayNumericGroupByColumnSelectorStrategy -{ - public ArrayLongGroupByColumnSelectorStrategy() - { - super(Long.BYTES, ColumnType.LONG_ARRAY); - } - - @Override - protected int computeDictionaryId(ColumnValueSelector selector) - { - Object object = selector.getObject(); - if (object == null) { - return GROUP_BY_MISSING_VALUE; - } else if (object instanceof Long) { - return addToIndexedDictionary(new Object[]{object}); - } else if (object instanceof List) { - return addToIndexedDictionary(((List) object).toArray()); - } else if (object instanceof Long[]) { - // Defensive check, since we don't usually expect to encounter Long[] objects from selectors - return addToIndexedDictionary(Arrays.stream((Long[]) object).toArray()); - } else if (object instanceof Object[]) { - return addToIndexedDictionary((Object[]) object); - } else { - throw new ISE("Found unexpected object type [%s] in %s array.", object.getClass().getName(), ValueType.LONG); - } - } -} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayNumericGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayNumericGroupByColumnSelectorStrategy.java deleted file mode 100644 index 3818b75a1a05..000000000000 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayNumericGroupByColumnSelectorStrategy.java +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -import com.google.common.annotations.VisibleForTesting; -import it.unimi.dsi.fastutil.objects.Object2IntMap; -import org.apache.druid.query.groupby.ResultRow; -import org.apache.druid.query.groupby.epinephelinae.DictionaryBuilding; -import org.apache.druid.query.groupby.epinephelinae.Grouper; -import org.apache.druid.query.ordering.StringComparator; -import org.apache.druid.query.ordering.StringComparators; -import org.apache.druid.segment.ColumnValueSelector; -import org.apache.druid.segment.column.ColumnType; - -import javax.annotation.Nullable; -import java.nio.ByteBuffer; -import java.util.List; - -public abstract class ArrayNumericGroupByColumnSelectorStrategy - implements GroupByColumnSelectorStrategy -{ - protected static final int GROUP_BY_MISSING_VALUE = -1; - - private final List dictionary; - private final Object2IntMap reverseDictionary; - private long estimatedFootprint = 0L; - private final int valueFootprint; - - public ArrayNumericGroupByColumnSelectorStrategy(final int valueFootprint, final ColumnType arrayType) - { - this.dictionary = DictionaryBuilding.createDictionary(); - this.reverseDictionary = DictionaryBuilding.createReverseDictionaryForPrimitiveArray(arrayType); - this.valueFootprint = valueFootprint; - } - - @Override - public int getGroupingKeySize() - { - return Integer.BYTES; - } - - @Override - public void processValueFromGroupingKey( - GroupByColumnSelectorPlus selectorPlus, - ByteBuffer key, - ResultRow resultRow, - int keyBufferPosition - ) - { - final int id = key.getInt(keyBufferPosition); - - // GROUP_BY_MISSING_VALUE is used to indicate empty rows, which are omitted from the result map. - if (id != GROUP_BY_MISSING_VALUE) { - final Object[] value = dictionary.get(id); - resultRow.set(selectorPlus.getResultRowPosition(), value); - } else { - resultRow.set(selectorPlus.getResultRowPosition(), null); - } - } - - @Override - public int initColumnValues(ColumnValueSelector selector, int columnIndex, Object[] valuess) - { - final long priorFootprint = estimatedFootprint; - valuess[columnIndex] = computeDictionaryId(selector); - return (int) (estimatedFootprint - priorFootprint); - } - - @Override - public void initGroupingKeyColumnValue( - int keyBufferPosition, - int dimensionIndex, - Object rowObj, - ByteBuffer keyBuffer, - int[] stack - ) - { - final int groupingKey = (int) rowObj; - writeToKeyBuffer(keyBufferPosition, groupingKey, keyBuffer); - if (groupingKey == GROUP_BY_MISSING_VALUE) { - stack[dimensionIndex] = 0; - } else { - stack[dimensionIndex] = 1; - } - - } - - @Override - public boolean checkRowIndexAndAddValueToGroupingKey( - int keyBufferPosition, - Object rowObj, - int rowValIdx, - ByteBuffer keyBuffer - ) - { - return false; - } - - protected abstract int computeDictionaryId(ColumnValueSelector selector); - - @Override - public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) - { - final long priorFootprint = estimatedFootprint; - - // computeDictionaryId updates estimatedFootprint - keyBuffer.putInt(keyBufferPosition, computeDictionaryId(selector)); - - return (int) (estimatedFootprint - priorFootprint); - } - - protected int addToIndexedDictionary(Object[] t) - { - final int dictId = reverseDictionary.getInt(t); - if (dictId < 0) { - final int size = dictionary.size(); - dictionary.add(t); - reverseDictionary.put(t, size); - - // Footprint estimate: one pointer, one value per list entry. - estimatedFootprint += DictionaryBuilding.estimateEntryFootprint(t.length * (Long.BYTES + valueFootprint)); - return size; - } - return dictId; - } - - @Override - public Grouper.BufferComparator bufferComparator(int keyBufferPosition, @Nullable StringComparator stringComparator) - { - // TODO(laksh): This can be optimised probably if stringComparator == null - StringComparator comparator = stringComparator == null ? StringComparators.NUMERIC : stringComparator; - return (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> { - Object[] lhs = dictionary.get(lhsBuffer.getInt(lhsPosition + keyBufferPosition)); - Object[] rhs = dictionary.get(rhsBuffer.getInt(rhsPosition + keyBufferPosition)); - - int minLength = Math.min(lhs.length, rhs.length); - //noinspection ArrayEquality - if (lhs == rhs) { - return 0; - } else { - for (int i = 0; i < minLength; i++) { - final Object left = lhs[i]; - final Object right = rhs[i]; - final int cmp; - if (left == null && right == null) { - cmp = 0; - } else if (left == null) { - cmp = -1; - } else { - cmp = comparator.compare(String.valueOf(left), String.valueOf(right)); - } - if (cmp == 0) { - continue; - } - return cmp; - } - if (lhs.length == rhs.length) { - return 0; - } else if (lhs.length < rhs.length) { - return -1; - } - return 1; - } - }; - } - - @Override - public void reset() - { - dictionary.clear(); - reverseDictionary.clear(); - estimatedFootprint = 0; - } - - @VisibleForTesting - void writeToKeyBuffer(int keyBufferPosition, int groupingKey, ByteBuffer keyBuffer) - { - keyBuffer.putInt(keyBufferPosition, groupingKey); - } -} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayStringGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayStringGroupByColumnSelectorStrategy.java deleted file mode 100644 index 375496671fa2..000000000000 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayStringGroupByColumnSelectorStrategy.java +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.BiMap; -import com.google.common.collect.HashBiMap; -import org.apache.druid.java.util.common.ISE; -import org.apache.druid.query.groupby.ResultRow; -import org.apache.druid.query.groupby.epinephelinae.DictionaryBuilding; -import org.apache.druid.query.groupby.epinephelinae.Grouper; -import org.apache.druid.query.ordering.StringComparator; -import org.apache.druid.query.ordering.StringComparators; -import org.apache.druid.segment.ColumnValueSelector; -import org.apache.druid.segment.column.ValueType; -import org.apache.druid.segment.data.ComparableIntArray; - -import javax.annotation.Nullable; -import java.nio.ByteBuffer; -import java.util.List; - -public class ArrayStringGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy -{ - private static final int GROUP_BY_MISSING_VALUE = -1; - - - // contains string <-> id for each element of the multi value grouping column - // for eg : [a,b,c] is the col value. dictionaryToInt will contain { a <-> 1, b <-> 2, c <-> 3} - private final BiMap dictionaryToInt; - - // stores each row as an integer array where the int represents the value in dictionaryToInt - // for eg : [a,b,c] would be converted to [1,2,3] and assigned a integer value 1. - // [1,2,3] <-> 1 - private final BiMap intListToInt; - - private long estimatedFootprint = 0L; - - @Override - public int getGroupingKeySize() - { - return Integer.BYTES; - } - - public ArrayStringGroupByColumnSelectorStrategy() - { - dictionaryToInt = HashBiMap.create(); - intListToInt = HashBiMap.create(); - } - - @VisibleForTesting - ArrayStringGroupByColumnSelectorStrategy( - BiMap dictionaryToInt, - BiMap intArrayToInt - ) - { - this.dictionaryToInt = dictionaryToInt; - this.intListToInt = intArrayToInt; - } - - @Override - public void processValueFromGroupingKey( - GroupByColumnSelectorPlus selectorPlus, - ByteBuffer key, - ResultRow resultRow, - int keyBufferPosition - ) - { - final int id = key.getInt(keyBufferPosition); - - // GROUP_BY_MISSING_VALUE is used to indicate empty rows - if (id != GROUP_BY_MISSING_VALUE) { - final int[] intRepresentation = intListToInt.inverse() - .get(id) - .getDelegate(); - final Object[] stringRepresentaion = new Object[intRepresentation.length]; - for (int i = 0; i < intRepresentation.length; i++) { - stringRepresentaion[i] = dictionaryToInt.inverse().get(intRepresentation[i]); - } - resultRow.set(selectorPlus.getResultRowPosition(), stringRepresentaion); - } else { - resultRow.set(selectorPlus.getResultRowPosition(), null); - } - - } - - @Override - public int initColumnValues( - ColumnValueSelector selector, - int columnIndex, - Object[] valuess - ) - { - final long priorFootprint = estimatedFootprint; - final int groupingKey = computeDictionaryId(selector); - valuess[columnIndex] = groupingKey; - return (int) (estimatedFootprint - priorFootprint); - } - - @Override - public void initGroupingKeyColumnValue( - int keyBufferPosition, - int dimensionIndex, - Object rowObj, - ByteBuffer keyBuffer, - int[] stack - ) - { - final int groupingKey = (int) rowObj; - writeToKeyBuffer(keyBufferPosition, groupingKey, keyBuffer); - if (groupingKey == GROUP_BY_MISSING_VALUE) { - stack[dimensionIndex] = 0; - } else { - stack[dimensionIndex] = 1; - } - } - - @Override - public boolean checkRowIndexAndAddValueToGroupingKey( - int keyBufferPosition, - Object rowObj, - int rowValIdx, - ByteBuffer keyBuffer - ) - { - return false; - } - - /** - * Compute dictionary ID for the given selector. Updates {@link #estimatedFootprint} as necessary. - */ - @VisibleForTesting - int computeDictionaryId(ColumnValueSelector selector) - { - final int[] intRepresentation; - Object object = selector.getObject(); - if (object == null) { - return GROUP_BY_MISSING_VALUE; - } else if (object instanceof String) { - intRepresentation = new int[1]; - intRepresentation[0] = addToIndexedDictionary((String) object); - } else if (object instanceof List) { - final int size = ((List) object).size(); - intRepresentation = new int[size]; - for (int i = 0; i < size; i++) { - intRepresentation[i] = addToIndexedDictionary((String) ((List) object).get(i)); - } - } else if (object instanceof String[]) { - final int size = ((String[]) object).length; - intRepresentation = new int[size]; - for (int i = 0; i < size; i++) { - intRepresentation[i] = addToIndexedDictionary(((String[]) object)[i]); - } - } else if (object instanceof Object[]) { - final int size = ((Object[]) object).length; - intRepresentation = new int[size]; - for (int i = 0; i < size; i++) { - intRepresentation[i] = addToIndexedDictionary((String) ((Object[]) object)[i]); - } - } else { - throw new ISE("Found unexpected object type [%s] in %s array.", object.getClass().getName(), ValueType.STRING); - } - - final ComparableIntArray comparableIntArray = ComparableIntArray.of(intRepresentation); - final int dictId = intListToInt.getOrDefault(comparableIntArray, GROUP_BY_MISSING_VALUE); - if (dictId == GROUP_BY_MISSING_VALUE) { - final int nextId = intListToInt.keySet().size(); - intListToInt.put(comparableIntArray, nextId); - - // We're not using the dictionary and reverseDictionary from DictionaryBuilding, but the BiMap is close enough - // that we expect this footprint calculation to still be useful. (It doesn't have to be exact.) - estimatedFootprint += - DictionaryBuilding.estimateEntryFootprint(comparableIntArray.getDelegate().length * Integer.BYTES); - - return nextId; - } else { - return dictId; - } - } - - private int addToIndexedDictionary(String value) - { - final Integer dictId = dictionaryToInt.get(value); - if (dictId == null) { - final int nextId = dictionaryToInt.size(); - dictionaryToInt.put(value, nextId); - - // We're not using the dictionary and reverseDictionary from DictionaryBuilding, but the BiMap is close enough - // that we expect this footprint calculation to still be useful. (It doesn't have to be exact.) - estimatedFootprint += - DictionaryBuilding.estimateEntryFootprint((value == null ? 0 : value.length()) * Character.BYTES); - - return nextId; - } else { - return dictId; - } - } - - @Override - public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) - { - final long priorFootprint = estimatedFootprint; - - // computeDictionaryId updates estimatedFootprint - keyBuffer.putInt(keyBufferPosition, computeDictionaryId(selector)); - - return (int) (estimatedFootprint - priorFootprint); - } - - @Override - public Grouper.BufferComparator bufferComparator(int keyBufferPosition, @Nullable StringComparator stringComparator) - { - final StringComparator comparator = stringComparator == null ? StringComparators.LEXICOGRAPHIC : stringComparator; - return (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> { - int[] lhs = intListToInt.inverse().get(lhsBuffer.getInt(lhsPosition + keyBufferPosition)).getDelegate(); - int[] rhs = intListToInt.inverse().get(rhsBuffer.getInt(rhsPosition + keyBufferPosition)).getDelegate(); - - int minLength = Math.min(lhs.length, rhs.length); - //noinspection ArrayEquality - if (lhs == rhs) { - return 0; - } else { - for (int i = 0; i < minLength; i++) { - final int cmp = comparator.compare( - dictionaryToInt.inverse().get(lhs[i]), - dictionaryToInt.inverse().get(rhs[i]) - ); - if (cmp == 0) { - continue; - } - return cmp; - } - if (lhs.length == rhs.length) { - return 0; - } else if (lhs.length < rhs.length) { - return -1; - } - return 1; - } - }; - } - - @Override - public void reset() - { - dictionaryToInt.clear(); - intListToInt.clear(); - estimatedFootprint = 0; - } - - @VisibleForTesting - void writeToKeyBuffer(int keyBufferPosition, int groupingKey, ByteBuffer keyBuffer) - { - keyBuffer.putInt(keyBufferPosition, groupingKey); - } -} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingStringGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingStringGroupByColumnSelectorStrategy.java deleted file mode 100644 index 63b7262df8cf..000000000000 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingStringGroupByColumnSelectorStrategy.java +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -import com.google.common.base.Preconditions; -import it.unimi.dsi.fastutil.objects.Object2IntMap; -import org.apache.druid.common.config.NullHandling; -import org.apache.druid.query.groupby.ResultRow; -import org.apache.druid.query.groupby.epinephelinae.DictionaryBuilding; -import org.apache.druid.query.groupby.epinephelinae.Grouper; -import org.apache.druid.query.ordering.StringComparator; -import org.apache.druid.query.ordering.StringComparators; -import org.apache.druid.segment.ColumnValueSelector; -import org.apache.druid.segment.DimensionDictionary; -import org.apache.druid.segment.DimensionSelector; -import org.apache.druid.segment.data.ArrayBasedIndexedInts; -import org.apache.druid.segment.data.IndexedInts; - -import javax.annotation.Nullable; -import java.nio.ByteBuffer; -import java.util.List; - -/** - * A String strategy that builds an internal String<->Integer dictionary for - * DimensionSelectors that return false for nameLookupPossibleInAdvance() - */ -public class DictionaryBuildingStringGroupByColumnSelectorStrategy extends StringGroupByColumnSelectorStrategy -{ - private static final int GROUP_BY_MISSING_VALUE = -1; - - private final List dictionary = DictionaryBuilding.createDictionary(); - private final Object2IntMap reverseDictionary = DictionaryBuilding.createReverseDictionary(); - - public DictionaryBuildingStringGroupByColumnSelectorStrategy() - { - super(null, null); - } - - @Override - public void processValueFromGroupingKey( - GroupByColumnSelectorPlus selectorPlus, - ByteBuffer key, - ResultRow resultRow, - int keyBufferPosition - ) - { - final int id = key.getInt(keyBufferPosition); - - // GROUP_BY_MISSING_VALUE is used to indicate empty rows, which are omitted from the result map. - if (id != GROUP_BY_MISSING_VALUE) { - final String value = dictionary.get(id); - resultRow.set(selectorPlus.getResultRowPosition(), value); - } else { - resultRow.set(selectorPlus.getResultRowPosition(), NullHandling.defaultStringValue()); - } - } - - @Override - public int initColumnValues(ColumnValueSelector selector, int columnIndex, Object[] valuess) - { - final DimensionSelector dimSelector = (DimensionSelector) selector; - final IndexedInts row = dimSelector.getRow(); - int stateFootprintIncrease = 0; - ArrayBasedIndexedInts newRow = (ArrayBasedIndexedInts) valuess[columnIndex]; - if (newRow == null) { - newRow = new ArrayBasedIndexedInts(); - valuess[columnIndex] = newRow; - } - int rowSize = row.size(); - newRow.ensureSize(rowSize); - for (int i = 0; i < rowSize; i++) { - final String value = dimSelector.lookupName(row.get(i)); - final int dictId = reverseDictionary.getInt(value); - if (dictId < 0) { - final int nextId = dictionary.size(); - dictionary.add(value); - reverseDictionary.put(value, nextId); - newRow.setValue(i, nextId); - stateFootprintIncrease += - DictionaryBuilding.estimateEntryFootprint((value == null ? 0 : value.length()) * Character.BYTES); - } else { - newRow.setValue(i, dictId); - } - } - newRow.setSize(rowSize); - return stateFootprintIncrease; - } - - /** - * Writes a dictionary ID to the grouping key. - */ - private void writeToKeyBuffer(int keyBufferPosition, int dictId, ByteBuffer keyBuffer) - { - keyBuffer.putInt(keyBufferPosition, dictId); - } - - - - @Override - public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) - { - final DimensionSelector dimSelector = (DimensionSelector) selector; - final IndexedInts row = dimSelector.getRow(); - - Preconditions.checkState(row.size() < 2, "Not supported for multi-value dimensions"); - - if (row.size() == 0) { - writeToKeyBuffer(keyBufferPosition, GROUP_BY_MISSING_VALUE, keyBuffer); - return 0; - } - - final String value = dimSelector.lookupName(row.get(0)); - final int dictId = reverseDictionary.getInt(value); - if (dictId == DimensionDictionary.ABSENT_VALUE_ID) { - final int nextId = dictionary.size(); - dictionary.add(value); - reverseDictionary.put(value, nextId); - writeToKeyBuffer(keyBufferPosition, nextId, keyBuffer); - return DictionaryBuilding.estimateEntryFootprint((value == null ? 0 : value.length()) * Character.BYTES); - } else { - writeToKeyBuffer(keyBufferPosition, dictId, keyBuffer); - return 0; - } - } - - @Override - public Grouper.BufferComparator bufferComparator(int keyBufferPosition, @Nullable StringComparator stringComparator) - { - final StringComparator realComparator = stringComparator == null ? - StringComparators.LEXICOGRAPHIC : - stringComparator; - return (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> { - String lhsStr = dictionary.get(lhsBuffer.getInt(lhsPosition + keyBufferPosition)); - String rhsStr = dictionary.get(rhsBuffer.getInt(rhsPosition + keyBufferPosition)); - return realComparator.compare(lhsStr, rhsStr); - }; - } - - @Override - public void reset() - { - dictionary.clear(); - reverseDictionary.clear(); - } -} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DoubleGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DoubleGroupByColumnSelectorStrategy.java deleted file mode 100644 index 6fcbf9ca9d57..000000000000 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DoubleGroupByColumnSelectorStrategy.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -import org.apache.druid.query.groupby.ResultRow; -import org.apache.druid.query.groupby.epinephelinae.Grouper; -import org.apache.druid.query.groupby.epinephelinae.GrouperBufferComparatorUtils; -import org.apache.druid.query.ordering.StringComparator; -import org.apache.druid.segment.ColumnValueSelector; -import org.apache.druid.segment.DimensionHandlerUtils; - -import javax.annotation.Nullable; -import java.nio.ByteBuffer; - -public class DoubleGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy -{ - @Override - public int getGroupingKeySize() - { - return Double.BYTES; - } - - @Override - public void processValueFromGroupingKey( - GroupByColumnSelectorPlus selectorPlus, - ByteBuffer key, - ResultRow resultRow, - int keyBufferPosition - ) - { - final double val = key.getDouble(keyBufferPosition); - resultRow.set(selectorPlus.getResultRowPosition(), val); - } - - @Override - public int initColumnValues(ColumnValueSelector selector, int columnIndex, Object[] values) - { - values[columnIndex] = selector.getDouble(); - return 0; - } - - @Override - public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) - { - keyBuffer.putDouble(keyBufferPosition, selector.getDouble()); - return 0; - } - - @Override - public void initGroupingKeyColumnValue( - int keyBufferPosition, - int dimensionIndex, - Object rowObj, - ByteBuffer keyBuffer, - int[] stack - ) - { - writeToKeyBuffer(keyBufferPosition, DimensionHandlerUtils.nullToZero((Double) rowObj), keyBuffer); - stack[dimensionIndex] = 1; - } - - @Override - public boolean checkRowIndexAndAddValueToGroupingKey( - int keyBufferPosition, - Object rowObj, - int rowValIdx, - ByteBuffer keyBuffer - ) - { - // rows from a double column always have a single value, multi-value is not currently supported - // this method handles row values after the first in a multivalued row, so just return false - return false; - } - - @Override - public Grouper.BufferComparator bufferComparator(int keyBufferPosition, @Nullable StringComparator stringComparator) - { - return GrouperBufferComparatorUtils.makeBufferComparatorForDouble( - keyBufferPosition, - true, - stringComparator - ); - } - - @Override - public void reset() - { - // Nothing to do. - } - - private void writeToKeyBuffer(int keyBufferPosition, double value, ByteBuffer keyBuffer) - { - keyBuffer.putDouble(keyBufferPosition, value); - } -} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FloatGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FloatGroupByColumnSelectorStrategy.java deleted file mode 100644 index a01c3c3bd1b8..000000000000 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FloatGroupByColumnSelectorStrategy.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -import org.apache.druid.query.groupby.ResultRow; -import org.apache.druid.query.groupby.epinephelinae.Grouper; -import org.apache.druid.query.groupby.epinephelinae.GrouperBufferComparatorUtils; -import org.apache.druid.query.ordering.StringComparator; -import org.apache.druid.segment.ColumnValueSelector; -import org.apache.druid.segment.DimensionHandlerUtils; - -import javax.annotation.Nullable; -import java.nio.ByteBuffer; - -public class FloatGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy -{ - - @Override - public int getGroupingKeySize() - { - return Float.BYTES; - } - - @Override - public void processValueFromGroupingKey( - GroupByColumnSelectorPlus selectorPlus, - ByteBuffer key, - ResultRow resultRow, - int keyBufferPosition - ) - { - final float val = key.getFloat(keyBufferPosition); - resultRow.set(selectorPlus.getResultRowPosition(), val); - } - - @Override - public int initColumnValues(ColumnValueSelector selector, int columnIndex, Object[] valuess) - { - valuess[columnIndex] = selector.getFloat(); - return 0; - } - - @Override - public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) - { - keyBuffer.putFloat(keyBufferPosition, selector.getFloat()); - return 0; - } - - @Override - public Grouper.BufferComparator bufferComparator( - int keyBufferPosition, - @Nullable StringComparator stringComparator - ) - { - return GrouperBufferComparatorUtils.makeBufferComparatorForFloat( - keyBufferPosition, - true, - stringComparator - ); - } - - @Override - public void initGroupingKeyColumnValue( - int keyBufferPosition, - int dimensionIndex, - Object rowObj, - ByteBuffer keyBuffer, - int[] stack - ) - { - writeToKeyBuffer(keyBufferPosition, DimensionHandlerUtils.nullToZero((Float) rowObj), keyBuffer); - stack[dimensionIndex] = 1; - } - - @Override - public boolean checkRowIndexAndAddValueToGroupingKey( - int keyBufferPosition, - Object rowObj, - int rowValIdx, - ByteBuffer keyBuffer - ) - { - // rows from a float column always have a single value, multi-value is not currently supported - // this method handles row values after the first in a multivalued row, so just return false - return false; - } - - @Override - public void reset() - { - // Nothing to do. - } - - private void writeToKeyBuffer(int keyBufferPosition, float value, ByteBuffer keyBuffer) - { - keyBuffer.putFloat(keyBufferPosition, value); - } -} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/GroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/GroupByColumnSelectorStrategy.java index 26095b5a2b29..34a4fd2b21fc 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/GroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/GroupByColumnSelectorStrategy.java @@ -36,7 +36,7 @@ * Each GroupByColumnSelectorStrategy is associated with a single dimension. * * Strategies may have internal state, such as the dictionary maintained by - * {@link DictionaryBuildingStringGroupByColumnSelectorStrategy}. Callers should assume that the internal + * {@link DictionaryBuildingGroupByColumnSelectorStrategy}. Callers should assume that the internal * state footprint starts out empty (zero bytes) and is also reset to zero on each call to {@link #reset()}. Each call * to {@link #initColumnValues} or {@link #writeToKeyBuffer(int, ColumnValueSelector, ByteBuffer)} returns the * incremental increase in internal state footprint that happened as a result of that particular call. diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/LongGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/LongGroupByColumnSelectorStrategy.java deleted file mode 100644 index 95d57e03da15..000000000000 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/LongGroupByColumnSelectorStrategy.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -import org.apache.druid.query.groupby.ResultRow; -import org.apache.druid.query.groupby.epinephelinae.Grouper; -import org.apache.druid.query.groupby.epinephelinae.GrouperBufferComparatorUtils; -import org.apache.druid.query.ordering.StringComparator; -import org.apache.druid.segment.ColumnValueSelector; -import org.apache.druid.segment.DimensionHandlerUtils; - -import javax.annotation.Nullable; -import java.nio.ByteBuffer; - -public class LongGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy -{ - - @Override - public int getGroupingKeySize() - { - return Long.BYTES; - } - - @Override - public void processValueFromGroupingKey( - GroupByColumnSelectorPlus selectorPlus, - ByteBuffer key, - ResultRow resultRow, - int keyBufferPosition - ) - { - final long val = key.getLong(keyBufferPosition); - resultRow.set(selectorPlus.getResultRowPosition(), val); - } - - @Override - public int initColumnValues(ColumnValueSelector selector, int columnIndex, Object[] valuess) - { - valuess[columnIndex] = selector.getLong(); - return 0; - } - - @Override - public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) - { - keyBuffer.putLong(keyBufferPosition, selector.getLong()); - return 0; - } - - @Override - public Grouper.BufferComparator bufferComparator( - int keyBufferPosition, - @Nullable StringComparator stringComparator - ) - { - return GrouperBufferComparatorUtils.makeBufferComparatorForLong( - keyBufferPosition, - true, - stringComparator - ); - } - - @Override - public void initGroupingKeyColumnValue( - int keyBufferPosition, - int dimensionIndex, - Object rowObj, - ByteBuffer keyBuffer, - int[] stack - ) - { - writeToKeyBuffer(keyBufferPosition, DimensionHandlerUtils.nullToZero((Long) rowObj), keyBuffer); - stack[dimensionIndex] = 1; - } - - @Override - public boolean checkRowIndexAndAddValueToGroupingKey( - int keyBufferPosition, - Object rowObj, - int rowValIdx, - ByteBuffer keyBuffer - ) - { - // rows from a long column always have a single value, multi-value is not currently supported - // this method handles row values after the first in a multivalued row, so just return false - return false; - } - - @Override - public void reset() - { - // Nothing to do. - } - - public void writeToKeyBuffer(int keyBufferPosition, long value, ByteBuffer keyBuffer) - { - keyBuffer.putLong(keyBufferPosition, value); - } -} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelpers.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelpers.java deleted file mode 100644 index 073a16c6be7a..000000000000 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MultiValueHelpers.java +++ /dev/null @@ -1,14 +0,0 @@ -package org.apache.druid.query.groupby.epinephelinae.column; - -public class MultiValueHelpers -{ - DimensionToIdConverter multiValueHelperForDimensionSelectors() - { - return null; - } - - DimensionToIdConverter multiValueHelperForSingleValueSelectors() - { - return null; - } -} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/NullableNumericGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/NullableNumericGroupByColumnSelectorStrategy.java deleted file mode 100644 index 34af76211397..000000000000 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/NullableNumericGroupByColumnSelectorStrategy.java +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -import org.apache.druid.common.config.NullHandling; -import org.apache.druid.query.groupby.ResultRow; -import org.apache.druid.query.groupby.epinephelinae.Grouper; -import org.apache.druid.query.groupby.epinephelinae.GrouperBufferComparatorUtils; -import org.apache.druid.query.ordering.StringComparator; -import org.apache.druid.segment.ColumnValueSelector; - -import javax.annotation.Nullable; -import java.nio.ByteBuffer; - -/** - * A wrapper around a numeric {@link GroupByColumnSelectorStrategy} that makes it null-aware. Should only be used - * for numeric strategies, not for string strategies. - * - * @see org.apache.druid.segment.BaseNullableColumnValueSelector#isNull() for why this only works in the numeric case - */ -public class NullableNumericGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy -{ - private final GroupByColumnSelectorStrategy delegate; - private final byte[] nullKeyBytes; - - public NullableNumericGroupByColumnSelectorStrategy(GroupByColumnSelectorStrategy delegate) - { - this.delegate = delegate; - this.nullKeyBytes = new byte[delegate.getGroupingKeySize() + 1]; - this.nullKeyBytes[0] = NullHandling.IS_NULL_BYTE; - } - - @Override - public int getGroupingKeySize() - { - return delegate.getGroupingKeySize() + Byte.BYTES; - } - - @Override - public void processValueFromGroupingKey( - GroupByColumnSelectorPlus selectorPlus, - ByteBuffer key, - ResultRow resultRow, - int keyBufferPosition - ) - { - if (key.get(keyBufferPosition) == NullHandling.IS_NULL_BYTE) { - resultRow.set(selectorPlus.getResultRowPosition(), null); - } else { - delegate.processValueFromGroupingKey(selectorPlus, key, resultRow, keyBufferPosition + Byte.BYTES); - } - } - - @Override - public int initColumnValues(ColumnValueSelector selector, int columnIndex, Object[] values) - { - if (selector.isNull()) { - values[columnIndex] = null; - return 0; - } else { - return delegate.initColumnValues(selector, columnIndex, values); - } - } - - @Override - public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) - { - if (selector.isNull()) { - keyBuffer.position(keyBufferPosition); - keyBuffer.put(nullKeyBytes); - return 0; - } else { - keyBuffer.put(keyBufferPosition, NullHandling.IS_NOT_NULL_BYTE); - return delegate.writeToKeyBuffer(keyBufferPosition + Byte.BYTES, selector, keyBuffer); - } - } - - @Override - public Grouper.BufferComparator bufferComparator( - int keyBufferPosition, - @Nullable StringComparator stringComparator - ) - { - return GrouperBufferComparatorUtils.makeNullHandlingBufferComparatorForNumericData( - keyBufferPosition, - delegate.bufferComparator(keyBufferPosition + Byte.BYTES, stringComparator) - ); - } - - @Override - public void initGroupingKeyColumnValue( - int keyBufferPosition, - int dimensionIndex, - Object rowObj, - ByteBuffer keyBuffer, - int[] stack - ) - { - if (rowObj == null) { - keyBuffer.position(keyBufferPosition); - keyBuffer.put(nullKeyBytes); - } else { - keyBuffer.put(keyBufferPosition, NullHandling.IS_NOT_NULL_BYTE); - - // No need to update stack ourselves; we expect the delegate to do this. - delegate.initGroupingKeyColumnValue( - keyBufferPosition + Byte.BYTES, - dimensionIndex, - rowObj, - keyBuffer, - stack - ); - } - } - - @Override - public boolean checkRowIndexAndAddValueToGroupingKey( - int keyBufferPosition, - Object rowObj, - int rowValIdx, - ByteBuffer keyBuffer - ) - { - // rows from a nullable column always have a single value, multi-value is not currently supported - // this method handles row values after the first in a multivalued row, so just return false - return false; - } - - @Override - public void reset() - { - delegate.reset(); - } -} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/StringGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/StringGroupByColumnSelectorStrategy.java deleted file mode 100644 index 12b6da11129d..000000000000 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/StringGroupByColumnSelectorStrategy.java +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -import com.google.common.base.Preconditions; -import org.apache.druid.common.config.NullHandling; -import org.apache.druid.query.groupby.ResultRow; -import org.apache.druid.query.groupby.epinephelinae.Grouper; -import org.apache.druid.query.ordering.StringComparator; -import org.apache.druid.query.ordering.StringComparators; -import org.apache.druid.segment.ColumnValueSelector; -import org.apache.druid.segment.DimensionSelector; -import org.apache.druid.segment.column.ColumnCapabilities; -import org.apache.druid.segment.data.IndexedInts; - -import javax.annotation.Nullable; -import java.nio.ByteBuffer; -import java.util.function.IntFunction; - -public class StringGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy -{ - @Nullable - private final ColumnCapabilities capabilities; - - @Nullable - private final IntFunction dictionaryLookup; - - public StringGroupByColumnSelectorStrategy(IntFunction dictionaryLookup, ColumnCapabilities capabilities) - { - this.dictionaryLookup = dictionaryLookup; - this.capabilities = capabilities; - } - - @Override - public int getGroupingKeySize() - { - return Integer.BYTES; - } - - @Override - public void processValueFromGroupingKey( - GroupByColumnSelectorPlus selectorPlus, - ByteBuffer key, - ResultRow resultRow, - int keyBufferPosition - ) - { - final int id = key.getInt(keyBufferPosition); - - // GROUP_BY_MISSING_VALUE is used to indicate empty rows, which are omitted from the result map. - if (id != GROUP_BY_MISSING_VALUE) { - resultRow.set( - selectorPlus.getResultRowPosition(), - ((DimensionSelector) selectorPlus.getSelector()).lookupName(id) - ); - } else { - resultRow.set(selectorPlus.getResultRowPosition(), NullHandling.defaultStringValue()); - } - } - - @Override - public int initColumnValues(ColumnValueSelector selector, int columnIndex, Object[] valuess) - { - DimensionSelector dimSelector = (DimensionSelector) selector; - IndexedInts row = dimSelector.getRow(); - valuess[columnIndex] = row; - return 0; - } - - @Override - public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) - { - final DimensionSelector dimSelector = (DimensionSelector) selector; - final IndexedInts row = dimSelector.getRow(); - Preconditions.checkState(row.size() < 2, "Not supported for multi-value dimensions"); - final int dictId = row.size() == 1 ? row.get(0) : GROUP_BY_MISSING_VALUE; - keyBuffer.putInt(keyBufferPosition, dictId); - return 0; - } - - @Override - public void initGroupingKeyColumnValue( - int keyBufferPosition, - int dimensionIndex, - Object rowObj, - ByteBuffer keyBuffer, - int[] stack - ) - { - IndexedInts row = (IndexedInts) rowObj; - int rowSize = row.size(); - - initializeGroupingKeyDimension(row, rowSize, keyBuffer, keyBufferPosition); - stack[dimensionIndex] = rowSize == 0 ? 0 : 1; - } - - @Override - public boolean checkRowIndexAndAddValueToGroupingKey( - int keyBufferPosition, - Object rowObj, - int rowValIdx, - ByteBuffer keyBuffer - ) - { - IndexedInts row = (IndexedInts) rowObj; - int rowSize = row.size(); - - if (rowValIdx < rowSize) { - keyBuffer.putInt( - keyBufferPosition, - row.get(rowValIdx) - ); - return true; - } else { - return false; - } - } - - private void initializeGroupingKeyDimension( - final IndexedInts values, - final int rowSize, - final ByteBuffer keyBuffer, - final int keyBufferPosition - ) - { - if (rowSize == 0) { - keyBuffer.putInt(keyBufferPosition, GROUP_BY_MISSING_VALUE); - } else { - keyBuffer.putInt(keyBufferPosition, values.get(0)); - } - } - - @Override - public Grouper.BufferComparator bufferComparator(int keyBufferPosition, @Nullable StringComparator stringComparator) - { - final boolean canCompareInts = - capabilities != null && - capabilities.hasBitmapIndexes() && - capabilities.areDictionaryValuesSorted().and(capabilities.areDictionaryValuesUnique()).isTrue(); - final StringComparator comparator = stringComparator == null ? StringComparators.LEXICOGRAPHIC : stringComparator; - if (canCompareInts && StringComparators.LEXICOGRAPHIC.equals(comparator)) { - return (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> Integer.compare( - lhsBuffer.getInt(lhsPosition + keyBufferPosition), - rhsBuffer.getInt(rhsPosition + keyBufferPosition) - ); - } else { - Preconditions.checkState(dictionaryLookup != null, "null dictionary lookup"); - return (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> { - String lhsStr = dictionaryLookup.apply(lhsBuffer.getInt(lhsPosition + keyBufferPosition)); - String rhsStr = dictionaryLookup.apply(rhsBuffer.getInt(rhsPosition + keyBufferPosition)); - return comparator.compare(lhsStr, rhsStr); - }; - } - } - - @Override - public void reset() - { - // Nothing to do. - } -} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/vector/DictionaryBuildingSingleValueStringGroupByVectorColumnSelector.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/vector/DictionaryBuildingSingleValueStringGroupByVectorColumnSelector.java index 83f49e1c834e..67025aa855b3 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/vector/DictionaryBuildingSingleValueStringGroupByVectorColumnSelector.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/vector/DictionaryBuildingSingleValueStringGroupByVectorColumnSelector.java @@ -36,7 +36,8 @@ * single-valued STRING columns which are not natively dictionary encoded, e.g. expression virtual columns. * * This is effectively the {@link VectorGroupByEngine} analog of - * {@link org.apache.druid.query.groupby.epinephelinae.column.DictionaryBuildingStringGroupByColumnSelectorStrategy} + * {@link org.apache.druid.query.groupby.epinephelinae.column.DictionaryBuildingGroupByColumnSelectorStrategy} for + * String columns */ public class DictionaryBuildingSingleValueStringGroupByVectorColumnSelector implements GroupByVectorColumnSelector { diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayDoubleGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayDoubleGroupByColumnSelectorStrategyTest.java deleted file mode 100644 index ad0c96c4673f..000000000000 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayDoubleGroupByColumnSelectorStrategyTest.java +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -import com.google.common.collect.ImmutableList; -import org.apache.druid.query.groupby.ResultRow; -import org.apache.druid.query.groupby.epinephelinae.Grouper; -import org.apache.druid.query.ordering.StringComparators; -import org.apache.druid.segment.ColumnValueSelector; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import org.mockito.Mockito; - -import java.nio.ByteBuffer; - -public class ArrayDoubleGroupByColumnSelectorStrategyTest -{ - private final ByteBuffer buffer1 = ByteBuffer.allocate(4); - private final ByteBuffer buffer2 = ByteBuffer.allocate(4); - - private ArrayNumericGroupByColumnSelectorStrategy strategy; - - @Before - public void setup() - { - strategy = new ArrayDoubleGroupByColumnSelectorStrategy(); - addToStrategy(new Object[]{1.0, 2.0}); - addToStrategy(ImmutableList.of(2.0, 3.0)); - addToStrategy(new Double[]{1.0}); - } - - @Test - public void testKeySize() - { - Assert.assertEquals(Integer.BYTES, strategy.getGroupingKeySize()); - } - - @Test - public void testWriteKey() - { - strategy.writeToKeyBuffer(0, 1, buffer1); - Assert.assertEquals(1, buffer1.getInt(0)); - } - - @Test - public void testBufferComparatorsWithNullAndNonNullStringComprators() - { - buffer1.putInt(1); - buffer2.putInt(2); - Grouper.BufferComparator comparator = strategy.bufferComparator(0, null); - Assert.assertTrue(comparator.compare(buffer1, buffer2, 0, 0) > 0); - Assert.assertTrue(comparator.compare(buffer2, buffer1, 0, 0) < 0); - - comparator = strategy.bufferComparator(0, StringComparators.LEXICOGRAPHIC); - Assert.assertTrue(comparator.compare(buffer1, buffer2, 0, 0) > 0); - Assert.assertTrue(comparator.compare(buffer2, buffer1, 0, 0) < 0); - - comparator = strategy.bufferComparator(0, StringComparators.STRLEN); - Assert.assertTrue(comparator.compare(buffer1, buffer2, 0, 0) > 0); - Assert.assertTrue(comparator.compare(buffer2, buffer1, 0, 0) < 0); - } - - @Test - public void testBufferComparator() - { - buffer1.putInt(0); - buffer2.putInt(2); - Grouper.BufferComparator comparator = strategy.bufferComparator(0, null); - Assert.assertTrue(comparator.compare(buffer1, buffer2, 0, 0) > 0); - - } - - @Test - public void testSanity() - { - testSanity(new Object[]{1.0, 2.0}, 0); - testSanity(new Object[]{2.0, 3.0}, 1); - testSanity(new Object[]{1.0}, 2); - } - - private void testSanity(Object[] storedValue, int expectedIndex) - { - ColumnValueSelector columnValueSelector = Mockito.mock(ColumnValueSelector.class); - Mockito.when(columnValueSelector.getObject()).thenReturn(storedValue); - Assert.assertEquals(expectedIndex, strategy.computeDictionaryId(columnValueSelector)); - - GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); - Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); - ResultRow row = ResultRow.create(1); - - buffer1.putInt(0, expectedIndex); - strategy.processValueFromGroupingKey(groupByColumnSelectorPlus, buffer1, row, 0); - Assert.assertArrayEquals(storedValue, (Object[]) row.get(0)); - } - - @Test - public void testAddingInDictionary() - { - ColumnValueSelector columnValueSelector = Mockito.mock(ColumnValueSelector.class); - Mockito.when(columnValueSelector.getObject()).thenReturn(ImmutableList.of(4.0, 2.0)); - Assert.assertEquals(3, strategy.computeDictionaryId(columnValueSelector)); - - GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); - Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); - ResultRow row = ResultRow.create(1); - - buffer1.putInt(3); - strategy.processValueFromGroupingKey(groupByColumnSelectorPlus, buffer1, row, 0); - Assert.assertArrayEquals(new Object[]{4.0, 2.0}, (Object[]) row.get(0)); - } - - @Test - public void testAddingInDictionaryWithObjects() - { - ColumnValueSelector columnValueSelector = Mockito.mock(ColumnValueSelector.class); - Mockito.when(columnValueSelector.getObject()).thenReturn(new Object[]{4.0D, 2.0D}); - Assert.assertEquals(3, strategy.computeDictionaryId(columnValueSelector)); - - GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); - Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); - ResultRow row = ResultRow.create(1); - buffer1.putInt(3); - strategy.processValueFromGroupingKey(groupByColumnSelectorPlus, buffer1, row, 0); - Assert.assertArrayEquals(new Object[]{4.0, 2.0}, (Object[]) row.get(0)); - } - - private void addToStrategy(Object value) - { - ColumnValueSelector columnValueSelector = Mockito.mock(ColumnValueSelector.class); - Mockito.when(columnValueSelector.getObject()).thenReturn(value); - strategy.computeDictionaryId(columnValueSelector); - } - - @After - public void tearDown() - { - buffer1.clear(); - buffer2.clear(); - } -} diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayLongGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayLongGroupByColumnSelectorStrategyTest.java deleted file mode 100644 index d8bc5372cd70..000000000000 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayLongGroupByColumnSelectorStrategyTest.java +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -import com.google.common.collect.ImmutableList; -import org.apache.druid.query.groupby.ResultRow; -import org.apache.druid.query.groupby.epinephelinae.Grouper; -import org.apache.druid.query.ordering.StringComparators; -import org.apache.druid.segment.ColumnValueSelector; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.mockito.Mockito; -import org.mockito.junit.MockitoJUnitRunner; - -import java.nio.ByteBuffer; - -@RunWith(MockitoJUnitRunner.class) -public class ArrayLongGroupByColumnSelectorStrategyTest -{ - private final ByteBuffer buffer1 = ByteBuffer.allocate(4); - private final ByteBuffer buffer2 = ByteBuffer.allocate(4); - - private ArrayNumericGroupByColumnSelectorStrategy strategy; - - @Before - public void setup() - { - strategy = new ArrayLongGroupByColumnSelectorStrategy(); - addToStrategy(new Object[]{1L, 2L}); - addToStrategy(ImmutableList.of(2L, 3L)); - addToStrategy(new Long[]{1L}); - } - - @Test - public void testKeySize() - { - Assert.assertEquals(Integer.BYTES, strategy.getGroupingKeySize()); - } - - @Test - public void testWriteKey() - { - strategy.writeToKeyBuffer(0, 1, buffer1); - Assert.assertEquals(1, buffer1.getInt(0)); - } - - @Test - public void testBufferComparatorsWithNullAndNonNullStringComprators() - { - buffer1.putInt(1); - buffer2.putInt(2); - Grouper.BufferComparator comparator = strategy.bufferComparator(0, null); - Assert.assertTrue(comparator.compare(buffer1, buffer2, 0, 0) > 0); - Assert.assertTrue(comparator.compare(buffer2, buffer1, 0, 0) < 0); - - comparator = strategy.bufferComparator(0, StringComparators.LEXICOGRAPHIC); - Assert.assertTrue(comparator.compare(buffer1, buffer2, 0, 0) > 0); - Assert.assertTrue(comparator.compare(buffer2, buffer1, 0, 0) < 0); - - comparator = strategy.bufferComparator(0, StringComparators.STRLEN); - Assert.assertTrue(comparator.compare(buffer1, buffer2, 0, 0) > 0); - Assert.assertTrue(comparator.compare(buffer2, buffer1, 0, 0) < 0); - } - - @Test - public void testBufferComparator() - { - buffer1.putInt(0); - buffer2.putInt(2); - Grouper.BufferComparator comparator = strategy.bufferComparator(0, null); - Assert.assertTrue(comparator.compare(buffer1, buffer2, 0, 0) > 0); - - } - - @Test - public void testSanity() - { - testSanity(new Object[]{1L, 2L}, 0); - testSanity(new Object[]{2L, 3L}, 1); - testSanity(new Object[]{1L}, 2); - } - - private void testSanity(Object[] storedValue, int expectedIndex) - { - ColumnValueSelector columnValueSelector = Mockito.mock(ColumnValueSelector.class); - Mockito.when(columnValueSelector.getObject()).thenReturn(storedValue); - Assert.assertEquals(expectedIndex, strategy.computeDictionaryId(columnValueSelector)); - - GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); - Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); - ResultRow row = ResultRow.create(1); - - buffer1.putInt(0, expectedIndex); - strategy.processValueFromGroupingKey(groupByColumnSelectorPlus, buffer1, row, 0); - Assert.assertArrayEquals(storedValue, (Object[]) row.get(0)); - } - - @Test - public void testAddingInDictionary() - { - ColumnValueSelector columnValueSelector = Mockito.mock(ColumnValueSelector.class); - Mockito.when(columnValueSelector.getObject()).thenReturn(ImmutableList.of(4L, 2L)); - Assert.assertEquals(3, strategy.computeDictionaryId(columnValueSelector)); - - GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); - Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); - ResultRow row = ResultRow.create(1); - - buffer1.putInt(3); - strategy.processValueFromGroupingKey(groupByColumnSelectorPlus, buffer1, row, 0); - Assert.assertArrayEquals(new Object[]{4L, 2L}, (Object[]) row.get(0)); - } - - @Test - public void testAddingInDictionaryWithObjects() - { - ColumnValueSelector columnValueSelector = Mockito.mock(ColumnValueSelector.class); - Mockito.when(columnValueSelector.getObject()).thenReturn(new Object[]{4L, 2L}); - Assert.assertEquals(3, strategy.computeDictionaryId(columnValueSelector)); - - GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); - Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); - ResultRow row = ResultRow.create(1); - - buffer1.putInt(3); - strategy.processValueFromGroupingKey(groupByColumnSelectorPlus, buffer1, row, 0); - Assert.assertArrayEquals(new Object[]{4L, 2L}, (Object[]) row.get(0)); - } - - private void addToStrategy(Object value) - { - ColumnValueSelector columnValueSelector = Mockito.mock(ColumnValueSelector.class); - Mockito.when(columnValueSelector.getObject()).thenReturn(value); - strategy.computeDictionaryId(columnValueSelector); - } - - @After - public void tearDown() - { - buffer1.clear(); - buffer2.clear(); - } -} diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayStringGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayStringGroupByColumnSelectorStrategyTest.java deleted file mode 100644 index b3ed04605fda..000000000000 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/ArrayStringGroupByColumnSelectorStrategyTest.java +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -import com.google.common.collect.BiMap; -import com.google.common.collect.HashBiMap; -import com.google.common.collect.ImmutableList; -import org.apache.druid.query.groupby.ResultRow; -import org.apache.druid.query.groupby.epinephelinae.Grouper; -import org.apache.druid.query.ordering.StringComparators; -import org.apache.druid.segment.ColumnValueSelector; -import org.apache.druid.segment.data.ComparableIntArray; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.mockito.Mockito; -import org.mockito.junit.MockitoJUnitRunner; - -import java.nio.ByteBuffer; - -@RunWith(MockitoJUnitRunner.class) -public class ArrayStringGroupByColumnSelectorStrategyTest -{ - private final BiMap dictionaryInt = HashBiMap.create(); - - // The dictionary has been constructed such that the values are not sorted lexicographically - // so we can tell when the comparator uses a lexicographic comparison and when it uses the indexes. - private final BiMap indexedIntArrays = HashBiMap.create(); - - private final ByteBuffer buffer1 = ByteBuffer.allocate(4); - private final ByteBuffer buffer2 = ByteBuffer.allocate(4); - - private ArrayStringGroupByColumnSelectorStrategy strategy; - - @Before - public void setup() - { - strategy = new ArrayStringGroupByColumnSelectorStrategy(dictionaryInt, indexedIntArrays); - - dictionaryInt.put("a", 0); - dictionaryInt.put("b", 1); - dictionaryInt.put("bd", 2); - dictionaryInt.put("d", 3); - dictionaryInt.put("e", 4); - - indexedIntArrays.put(ComparableIntArray.of(0, 1), 0); - indexedIntArrays.put(ComparableIntArray.of(2, 4), 1); - indexedIntArrays.put(ComparableIntArray.of(0, 2), 2); - } - - @Test - public void testKeySize() - { - Assert.assertEquals(Integer.BYTES, strategy.getGroupingKeySize()); - } - - @Test - public void testWriteKey() - { - strategy.writeToKeyBuffer(0, 1, buffer1); - Assert.assertEquals(1, buffer1.getInt(0)); - } - - @Test - public void testBufferComparatorCanCompareIntsAndNullStringComparatorShouldUseLexicographicComparator() - { - buffer1.putInt(1); - buffer2.putInt(2); - Grouper.BufferComparator comparator = strategy.bufferComparator(0, null); - Assert.assertTrue(comparator.compare(buffer1, buffer2, 0, 0) > 0); - Assert.assertTrue(comparator.compare(buffer2, buffer1, 0, 0) < 0); - } - - @Test - public void testBufferComparatorCanCompareIntsAndLexicographicStringComparatorShouldUseLexicographicComparator() - { - buffer1.putInt(1); - buffer2.putInt(2); - Grouper.BufferComparator comparator = strategy.bufferComparator(0, StringComparators.LEXICOGRAPHIC); - Assert.assertTrue(comparator.compare(buffer1, buffer2, 0, 0) > 0); - Assert.assertTrue(comparator.compare(buffer2, buffer1, 0, 0) < 0); - } - - @Test - public void testBufferComparatorCanCompareIntsAndStrLenStringComparatorShouldUseLexicographicComparator() - { - buffer1.putInt(1); - buffer2.putInt(2); - Grouper.BufferComparator comparator = strategy.bufferComparator(0, StringComparators.STRLEN); - Assert.assertTrue(comparator.compare(buffer1, buffer2, 0, 0) > 0); - Assert.assertTrue(comparator.compare(buffer2, buffer1, 0, 0) < 0); - } - - @Test - public void testSanity() - { - ColumnValueSelector columnValueSelector = Mockito.mock(ColumnValueSelector.class); - Mockito.when(columnValueSelector.getObject()).thenReturn(ImmutableList.of("a", "b")); - Assert.assertEquals(0, strategy.computeDictionaryId(columnValueSelector)); - - GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); - Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); - ResultRow row = ResultRow.create(1); - - buffer1.putInt(0); - strategy.processValueFromGroupingKey(groupByColumnSelectorPlus, buffer1, row, 0); - Assert.assertArrayEquals(new Object[]{"a", "b"}, (Object[]) row.get(0)); - } - - - @Test - public void testAddingInDictionary() - { - ColumnValueSelector columnValueSelector = Mockito.mock(ColumnValueSelector.class); - Mockito.when(columnValueSelector.getObject()).thenReturn(ImmutableList.of("f", "a")); - Assert.assertEquals(3, strategy.computeDictionaryId(columnValueSelector)); - - GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); - Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); - ResultRow row = ResultRow.create(1); - - buffer1.putInt(3); - strategy.processValueFromGroupingKey(groupByColumnSelectorPlus, buffer1, row, 0); - Assert.assertArrayEquals(new Object[]{"f", "a"}, (Object[]) row.get(0)); - } - - @Test - public void testAddingInDictionaryWithObjects() - { - ColumnValueSelector columnValueSelector = Mockito.mock(ColumnValueSelector.class); - Mockito.when(columnValueSelector.getObject()).thenReturn(new Object[]{"f", "a"}); - Assert.assertEquals(3, strategy.computeDictionaryId(columnValueSelector)); - - GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); - Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); - ResultRow row = ResultRow.create(1); - - buffer1.putInt(3); - strategy.processValueFromGroupingKey(groupByColumnSelectorPlus, buffer1, row, 0); - Assert.assertArrayEquals(new Object[]{"f", "a"}, (Object[]) row.get(0)); - } - - @After - public void tearDown() - { - buffer1.clear(); - buffer2.clear(); - } -} diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/StringGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/StringGroupByColumnSelectorStrategyTest.java deleted file mode 100644 index 86524bb91cb4..000000000000 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/StringGroupByColumnSelectorStrategyTest.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap; -import it.unimi.dsi.fastutil.ints.Int2ObjectMap; -import org.apache.druid.query.groupby.epinephelinae.Grouper; -import org.apache.druid.query.ordering.StringComparators; -import org.apache.druid.segment.column.ColumnCapabilities; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.mockito.Mock; -import org.mockito.Mockito; -import org.mockito.junit.MockitoJUnitRunner; - -import java.nio.ByteBuffer; -import java.util.function.IntFunction; - -@RunWith(MockitoJUnitRunner.class) -public class StringGroupByColumnSelectorStrategyTest -{ - // The dictionary has been constructed such that the values are not sorted lexicographically - // so we can tell when the comparator uses a lexicographic comparison and when it uses the indexes. - private static final Int2ObjectMap DICTIONARY = new Int2ObjectArrayMap<>( - new int[] {0, 1, 2}, - new String[] {"A", "F1", "D"} - ); - - private final ByteBuffer lhsBuffer = ByteBuffer.allocate(4); - private final ByteBuffer rhsBuffer = ByteBuffer.allocate(4); - - @Mock - private ColumnCapabilities capabilities; - private final IntFunction dictionaryLookup = DICTIONARY::get; - - private StringGroupByColumnSelectorStrategy target; - - @Before - public void setUp() - { - lhsBuffer.putInt(1); - rhsBuffer.putInt(2); - Mockito.doReturn(true).when(capabilities).hasBitmapIndexes(); - Mockito.doReturn(ColumnCapabilities.Capable.TRUE).when(capabilities).areDictionaryValuesSorted(); - Mockito.doReturn(ColumnCapabilities.Capable.TRUE).when(capabilities).areDictionaryValuesUnique(); - target = new StringGroupByColumnSelectorStrategy(dictionaryLookup, capabilities); - } - - @Test - public void testBufferComparatorCannotCompareIntsAndNullStringComparatorShouldUseLexicographicComparator() - { - Mockito.when(capabilities.areDictionaryValuesSorted()).thenReturn(ColumnCapabilities.Capable.FALSE); - // The comparator is not using the short circuit so it isn't comparing indexes. - Grouper.BufferComparator comparator = target.bufferComparator(0, null); - Assert.assertTrue(comparator.compare(lhsBuffer, rhsBuffer, 0, 0) > 0); - Assert.assertTrue(comparator.compare(rhsBuffer, lhsBuffer, 0, 0) < 0); - } - - @Test - public void testBufferComparatorCanCompareIntsAndNullStringComparatorShouldUseLexicographicComparator() - { - Grouper.BufferComparator comparator = target.bufferComparator(0, null); - Assert.assertTrue(comparator.compare(lhsBuffer, rhsBuffer, 0, 0) < 0); - Assert.assertTrue(comparator.compare(rhsBuffer, lhsBuffer, 0, 0) > 0); - } - - @Test - public void testBufferComparatorCanCompareIntsAndLexicographicStringComparatorShouldUseLexicographicComparator() - { - Grouper.BufferComparator comparator = target.bufferComparator(0, StringComparators.LEXICOGRAPHIC); - Assert.assertTrue(comparator.compare(lhsBuffer, rhsBuffer, 0, 0) < 0); - Assert.assertTrue(comparator.compare(rhsBuffer, lhsBuffer, 0, 0) > 0); - } - - @Test - public void testBufferComparatorCanCompareIntsAndStrLenStringComparatorShouldUseLexicographicComparator() - { - Grouper.BufferComparator comparator = target.bufferComparator(0, StringComparators.STRLEN); - Assert.assertTrue(comparator.compare(lhsBuffer, rhsBuffer, 0, 0) > 0); - Assert.assertTrue(comparator.compare(rhsBuffer, lhsBuffer, 0, 0) < 0); - } - - @After - public void tearDown() - { - lhsBuffer.clear(); - rhsBuffer.clear(); - } -} From bd8264999de63176cd7bfe4131905f72b531e2fb Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Wed, 13 Mar 2024 09:57:50 +0530 Subject: [PATCH 09/46] checkstyle --- .../epinephelinae/DictionaryBuilding.java | 2 +- .../epinephelinae/GroupByQueryEngine.java | 3 -- .../epinephelinae/RowBasedGrouperHelper.java | 3 +- ...BuildingGroupByColumnSelectorStrategy.java | 19 ++++++++++ .../column/DimensionToIdConverter.java | 19 ++++++++++ ...xedWidthGroupByColumnSelectorStrategy.java | 38 +++++++++++++------ .../column/IdToDimensionConverter.java | 19 ++++++++++ ...yMappingGroupByColumnSelectorStrategy.java | 19 ++++++++++ ...ryStringGroupByColumnSelectorStrategy.java | 19 ++++++++++ .../druid/sql/calcite/rel/DruidQuery.java | 19 +++++----- .../druid/sql/calcite/QueryTestRunner.java | 2 +- 11 files changed, 133 insertions(+), 29 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java index 2684dd7aae4f..23e0d23dcde8 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java @@ -72,7 +72,7 @@ private static Object2IntMap createReverseDictionary(final Hash.Strategy< return m; } - public static Object2IntRBTreeMap createTreeSortedReverseDictionary(Comparator comparator) + public static Object2IntRBTreeMap createTreeSortedReverseDictionary(Comparator comparator) { final Object2IntRBTreeMap m = new Object2IntRBTreeMap<>(comparator); m.defaultReturnValue(DimensionDictionary.ABSENT_VALUE_ID); diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java index dfbffd072b63..aef3732e0ab5 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java @@ -256,21 +256,18 @@ public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( case LONG: return new FixedWidthGroupByColumnSelectorStrategy( Byte.BYTES + Long.BYTES, - null, true, ColumnType.LONG ); case FLOAT: return new FixedWidthGroupByColumnSelectorStrategy( Byte.BYTES + Float.BYTES, - null, true, ColumnType.FLOAT ); case DOUBLE: return new FixedWidthGroupByColumnSelectorStrategy( Byte.BYTES + Double.BYTES, - null, true, ColumnType.DOUBLE ); diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java index a5feb6b8c650..870152974af0 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java @@ -1566,8 +1566,7 @@ public ComplexRowBasedKeySerdeHelper( complexTypeDictionary.get(lhsBuffer.getInt(lhsPosition + keyBufferPosition)), complexTypeDictionary.get(rhsBuffer.getInt(rhsPosition + keyBufferPosition)) ); - - }; + } @Override public int getKeyBufferValueSize() diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java index 4ec30e5b6995..81d22bacae3a 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.druid.query.groupby.epinephelinae.column; import it.unimi.dsi.fastutil.objects.Object2IntMap; diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java index 872e8e8fd022..8541d6a5b165 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.druid.query.groupby.epinephelinae.column; import org.apache.druid.java.util.common.Pair; diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java index 34c823db6c64..d4a91d7bee85 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.druid.query.groupby.epinephelinae.column; import org.apache.druid.query.DimensionComparisonUtils; @@ -15,27 +34,24 @@ // Used only by primitives right now, however specialized complex types can reuse this once we have a way to extract // the required info +// Doesn't work with multi value dimensions, as only strings are multi-valued which are handled elsewhere. // Not thread safe because does weird stuff with buffer's position while reading @NotThreadSafe public class FixedWidthGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy { final int keySize; - @Nullable - final DimensionToIdConverter dimensionToIdConverter; final boolean isPrimitive; final ColumnType columnType; final NullableTypeStrategy nullableTypeStrategy; public FixedWidthGroupByColumnSelectorStrategy( int keySize, - @Nullable DimensionToIdConverter dimensionToIdConverter, boolean isPrimitive, ColumnType columnType ) { this.keySize = keySize; - this.dimensionToIdConverter = dimensionToIdConverter; this.isPrimitive = isPrimitive; this.columnType = columnType; this.nullableTypeStrategy = columnType.getNullableStrategy(); @@ -66,13 +82,8 @@ public int initColumnValues(ColumnValueSelector selector, int columnIndex, Objec { // It is expected of the primitive selectors to be returning default value of the implementation here. In the // getObject(), if it returns null, it won't -// if (selectorIsNull(selector)) { -// valuess[columnIndex] = null; -// } else { // Here the primitive selectors should have returned correct values - float shouldn't return longs and vice versa // Perhaps we'd require a cast as well, which is done implicitly when we call the .getLong/.getFloat/.getDouble -// valuess[columnIndex] = selector.getObject(); -// } valuess[columnIndex] = getValue(selector); return 0; @@ -97,7 +108,10 @@ public void initGroupingKeyColumnValue( @Override public boolean checkRowIndexAndAddValueToGroupingKey( - int keyBufferPosition, Object rowObj, int rowValIdx, ByteBuffer keyBuffer + int keyBufferPosition, + Object rowObj, + int rowValIdx, + ByteBuffer keyBuffer ) { return false; @@ -148,14 +162,14 @@ private boolean selectorIsNull(ColumnValueSelector columnValueSelector) return !isPrimitive && (columnValueSelector.getObject() == null); } - // Handles primitives as well, also might case + // Handles primitives as well, also objercts case @Nullable private T getValue(ColumnValueSelector columnValueSelector) { if (selectorIsNull(columnValueSelector)) { return null; } - // case is safe + // cast is safe return (T) DimensionHandlerUtils.convertObjectToType(columnValueSelector.getObject(), columnType); } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java index 64d9bc96d849..34545072567d 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.druid.query.groupby.epinephelinae.column; // Doesn't handle GROUP_BY_MISSING_VALUE, should be done by the callers diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java index 73a9ac6096d8..2e22405df196 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.druid.query.groupby.epinephelinae.column; import com.google.common.base.Preconditions; diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java index c27e76f05924..430c50542803 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.druid.query.groupby.epinephelinae.column; import org.apache.druid.common.config.NullHandling; diff --git a/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java b/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java index d0292fc84fb8..152aa9105e0a 100644 --- a/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java +++ b/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java @@ -90,8 +90,6 @@ import org.apache.druid.segment.column.ColumnHolder; import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.column.RowSignature; -import org.apache.druid.segment.column.Types; -import org.apache.druid.segment.column.ValueType; import org.apache.druid.segment.join.JoinableFactoryWrapper; import org.apache.druid.sql.calcite.aggregation.Aggregation; import org.apache.druid.sql.calcite.aggregation.DimensionExpression; @@ -486,14 +484,15 @@ private static List computeDimensions( final RelDataType dataType = rexNode.getType(); final ColumnType outputType = Calcites.getColumnTypeForRelDataType(dataType); -// if (Types.isNullOr(outputType, ValueType.COMPLEX)) { -// // Can't group on unknown or COMPLEX types. -// plannerContext.setPlanningError( -// "SQL requires a group-by on a column of type %s that is unsupported.", -// outputType -// ); -// throw new CannotBuildQueryException(aggregate, rexNode); -// } + // TODO(laksh): This might change if we disallow certain complex types from grouping + if (outputType == null) { + // Can't group on unknown or COMPLEX types. + plannerContext.setPlanningError( + "SQL requires a group-by on a column with unknown type that is unsupported.", + outputType + ); + throw new CannotBuildQueryException(aggregate, rexNode); + } final String dimOutputName = outputNamePrefix + outputNameCounter++; if (!druidExpression.isSimpleExtraction()) { diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/QueryTestRunner.java b/sql/src/test/java/org/apache/druid/sql/calcite/QueryTestRunner.java index 117ac4de2fe3..1dd1df4eea8c 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/QueryTestRunner.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/QueryTestRunner.java @@ -381,7 +381,7 @@ public VerifyNativeQueries(BaseExecuteQuery execStep) public void verify() { for (QueryResults queryResults : execStep.results()) { -// verifyQuery(queryResults); + verifyQuery(queryResults); } } From 3017e4dba76f10720bf9332d821e2a4252b795f9 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Thu, 14 Mar 2024 16:07:09 +0530 Subject: [PATCH 10/46] comments --- .../epinephelinae/BufferHashGrouper.java | 1 - .../epinephelinae/DictionaryBuilding.java | 6 + .../epinephelinae/GroupByQueryEngine.java | 4 +- .../epinephelinae/RowBasedGrouperHelper.java | 167 +++++++++--------- ...BuildingGroupByColumnSelectorStrategy.java | 115 +++++++++--- .../column/DimensionToIdConverter.java | 52 +++++- ...xedWidthGroupByColumnSelectorStrategy.java | 82 ++++++--- .../column/GroupByColumnSelectorStrategy.java | 9 +- .../column/IdToDimensionConverter.java | 30 +++- ...yMappingGroupByColumnSelectorStrategy.java | 86 ++++++--- .../epinephelinae/column/MemoryEstimate.java | 46 +++++ ...ryStringGroupByColumnSelectorStrategy.java | 33 +++- 12 files changed, 464 insertions(+), 167 deletions(-) create mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MemoryEstimate.java diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/BufferHashGrouper.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/BufferHashGrouper.java index 2bb97c70ee12..167b322b9d45 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/BufferHashGrouper.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/BufferHashGrouper.java @@ -205,7 +205,6 @@ public int size() } // Sort offsets in-place. - // TODO(laksh): Perhaps this can utilise the MSQ's way of using byte comparisons Collections.sort( wrappedOffsets, (lhs, rhs) -> { diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java index 23e0d23dcde8..c5e65dd37cee 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java @@ -72,6 +72,12 @@ private static Object2IntMap createReverseDictionary(final Hash.Strategy< return m; } + /** + * Creates a reverse dictionary which stores the keys in a sorted map. The sorting is decided based on the given + * comparator + * + * TODO(laksh): This function might be removed, if we decide ot go with hash based dictionaries. Also RB v/s AVL tree + */ public static Object2IntRBTreeMap createTreeSortedReverseDictionary(Comparator comparator) { final Object2IntRBTreeMap m = new Object2IntRBTreeMap<>(comparator); diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java index aef3732e0ab5..13ef67e34d36 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java @@ -136,7 +136,7 @@ public GroupByEngineIterator make() curPos, query.getResultRowDimensionStart() + i ); - curPos += dims[i].getColumnSelectorStrategy().getGroupingKeySize(); + curPos += dims[i].getColumnSelectorStrategy().getGroupingKeySizeBytes(); } final int cardinalityForArrayAggregation = GroupingEngine.getCardinalityForArrayAggregation( @@ -844,7 +844,7 @@ private GroupByEngineKeySerde(final GroupByColumnSelectorPlus[] dims, GroupByQue this.dims = dims; int keySize = 0; for (GroupByColumnSelectorPlus selectorPlus : dims) { - keySize += selectorPlus.getColumnSelectorStrategy().getGroupingKeySize(); + keySize += selectorPlus.getColumnSelectorStrategy().getGroupingKeySizeBytes(); } this.keySize = keySize; diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java index 870152974af0..4b761880e4b9 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java @@ -1174,7 +1174,6 @@ private static class RowBasedKeySerde implements Grouper.KeySerde valueTypes; - private final boolean enableRuntimeDictionaryGeneration; private final List dictionary; @@ -1192,11 +1191,9 @@ private static class RowBasedKeySerde implements Grouper.KeySerde doubleArrayDictionary; private final Object2IntMap reverseDoubleArrayDictionary; - // We can probably use same dictionary for all the complex types, if all of them are done using hash mapping private final Map> complexTypeDictionaries = new HashMap<>(); private final Map> complexTypeReverseDictionaries = new HashMap<>(); - // Size limiting for the dictionary, in (roughly estimated) bytes. private final long maxDictionarySize; @@ -1535,7 +1532,55 @@ private RowBasedKeySerdeHelper makeNumericSerdeHelper( } } - private class ComplexRowBasedKeySerdeHelper implements RowBasedKeySerdeHelper + private abstract class DictionaryBuildingSingleValuedRowBasedKeySerdeHelper implements RowBasedKeySerdeHelper + { + private final int keyBufferPosition; + + public DictionaryBuildingSingleValuedRowBasedKeySerdeHelper(final int keyBufferPosition) + { + this.keyBufferPosition = keyBufferPosition; + } + + @Override + public int getKeyBufferValueSize() + { + return Integer.BYTES; + } + + @Override + public boolean putToKeyBuffer(RowBasedKey key, int idx) + { + final Object obj = key.getKey()[idx]; + int id = getReverseDictionary().getInt(obj); + if (id == DimensionDictionary.ABSENT_VALUE_ID) { + id = getDictionary().size(); + getReverseDictionary().put(obj, id); + getDictionary().add(obj); + } + keyBuffer.putInt(id); + return true; + } + + @Override + public void getFromByteBuffer(ByteBuffer buffer, int initialOffset, int dimValIdx, Object[] dimValues) + { + dimValues[dimValIdx] = getDictionary().get(buffer.getInt(initialOffset + keyBufferPosition)); + } + + /** + * Raw type used because arrays and object dictionaries differ + */ + @SuppressWarnings("rawtypes") + public abstract List getDictionary(); + + /** + * Raw types used because arrays and object dictionaries differ + */ + @SuppressWarnings("rawtypes") + public abstract Object2IntMap getReverseDictionary(); + } + + private class ComplexRowBasedKeySerdeHelper extends DictionaryBuildingSingleValuedRowBasedKeySerdeHelper { final int keyBufferPosition; final BufferComparator bufferComparator; @@ -1550,6 +1595,7 @@ public ComplexRowBasedKeySerdeHelper( ColumnType complexType ) { + super(keyBufferPosition); this.keyBufferPosition = keyBufferPosition; this.complexType = complexType; this.complexTypeName = Preconditions.checkNotNull(complexType.getComplexTypeName(), "complex type name expected"); @@ -1569,43 +1615,30 @@ public ComplexRowBasedKeySerdeHelper( } @Override - public int getKeyBufferValueSize() - { - return Integer.BYTES; - } - - @Override - public boolean putToKeyBuffer(RowBasedKey key, int idx) + public BufferComparator getBufferComparator() { - final Object obj = key.getKey()[idx]; - int id = complexTypeReverseDictionary.getInt(obj); - if (id == DimensionDictionary.ABSENT_VALUE_ID) { - id = complexTypeDictionary.size(); - complexTypeReverseDictionary.put(obj, id); - complexTypeDictionary.add(obj); - } - keyBuffer.putInt(id); - return true; + return bufferComparator; } @Override - public void getFromByteBuffer(ByteBuffer buffer, int initialOffset, int dimValIdx, Object[] dimValues) + public List getDictionary() { - dimValues[dimValIdx] = complexTypeDictionary.get(buffer.getInt(initialOffset + keyBufferPosition)); + return complexTypeDictionary; } @Override - public BufferComparator getBufferComparator() + public Object2IntMap getReverseDictionary() { - return bufferComparator; + return complexTypeReverseDictionary; } } - private class ArrayNumericRowBasedKeySerdeHelper implements RowBasedKeySerdeHelper + + private class ArrayNumericRowBasedKeySerdeHelper extends DictionaryBuildingSingleValuedRowBasedKeySerdeHelper { - final int keyBufferPosition; - final BufferComparator bufferComparator; - final TypeSignature elementType; + private final BufferComparator bufferComparator; + private final List dictionary; + private final Object2IntMap reverseDictionary; public ArrayNumericRowBasedKeySerdeHelper( int keyBufferPosition, @@ -1613,20 +1646,22 @@ public ArrayNumericRowBasedKeySerdeHelper( ColumnType arrayType ) { - this.keyBufferPosition = keyBufferPosition; - this.elementType = arrayType.getElementType(); + super(keyBufferPosition); + final TypeSignature elementType = arrayType.getElementType(); + this.dictionary = getDictionaryForType(elementType); + this.reverseDictionary = getReverseDictionaryForType(elementType); this.bufferComparator = (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> { if (stringComparator == null || StringComparators.NUMERIC.equals(stringComparator) || StringComparators.NATURAL.equals(stringComparator)) { return arrayType.getNullableStrategy().compare( - getDictionaryForType(elementType).get(lhsBuffer.getInt(lhsPosition + keyBufferPosition)), - getDictionaryForType(elementType).get(rhsBuffer.getInt(rhsPosition + keyBufferPosition)) + this.dictionary.get(lhsBuffer.getInt(lhsPosition + keyBufferPosition)), + this.dictionary.get(rhsBuffer.getInt(rhsPosition + keyBufferPosition)) ); } else { return new DimensionComparisonUtils.ArrayComparatorForUnnaturalStringComparator(stringComparator).compare( - getDictionaryForType(elementType).get(lhsBuffer.getInt(lhsPosition + keyBufferPosition)), - getDictionaryForType(elementType).get(rhsBuffer.getInt(rhsPosition + keyBufferPosition)) + this.dictionary.get(lhsBuffer.getInt(lhsPosition + keyBufferPosition)), + this.dictionary.get(rhsBuffer.getInt(rhsPosition + keyBufferPosition)) ); } }; @@ -1660,43 +1695,27 @@ private Object2IntMap getReverseDictionaryForType(TypeSignature getDictionary() { - dimValues[dimValIdx] = getDictionaryForType(elementType).get(buffer.getInt(initialOffset + keyBufferPosition)); + return dictionary; } @Override - public BufferComparator getBufferComparator() + public Object2IntMap getReverseDictionary() { - return bufferComparator; + return reverseDictionary; } } - private class ArrayStringRowBasedKeySerdeHelper implements RowBasedKeySerdeHelper + private class ArrayStringRowBasedKeySerdeHelper extends DictionaryBuildingSingleValuedRowBasedKeySerdeHelper { - final int keyBufferPosition; final BufferComparator bufferComparator; ArrayStringRowBasedKeySerdeHelper( @@ -1704,9 +1723,10 @@ private class ArrayStringRowBasedKeySerdeHelper implements RowBasedKeySerdeHelpe @Nullable StringComparator stringComparator ) { - this.keyBufferPosition = keyBufferPosition; + super(keyBufferPosition); bufferComparator = (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> - new DimensionComparisonUtils.ArrayComparator(stringComparator == null ? StringComparators.LEXICOGRAPHIC : stringComparator) + new DimensionComparisonUtils.ArrayComparator<>( + stringComparator == null ? StringComparators.LEXICOGRAPHIC : stringComparator) .compare( stringArrayDictionary.get(lhsBuffer.getInt(lhsPosition + keyBufferPosition)), stringArrayDictionary.get(rhsBuffer.getInt(rhsPosition + keyBufferPosition)) @@ -1720,38 +1740,21 @@ public int getKeyBufferValueSize() } @Override - public boolean putToKeyBuffer(RowBasedKey key, int idx) + public BufferComparator getBufferComparator() { - Object[] stringArray = (Object[]) key.getKey()[idx]; - final int id = addToArrayDictionary(stringArray); - if (id < 0) { - return false; - } - keyBuffer.putInt(id); - return true; + return bufferComparator; } @Override - public void getFromByteBuffer(ByteBuffer buffer, int initialOffset, int dimValIdx, Object[] dimValues) + public List getDictionary() { - dimValues[dimValIdx] = stringArrayDictionary.get(buffer.getInt(initialOffset + keyBufferPosition)); + return stringArrayDictionary; } @Override - public BufferComparator getBufferComparator() + public Object2IntMap getReverseDictionary() { - return bufferComparator; - } - - private int addToArrayDictionary(final Object[] s) - { - int idx = reverseStringArrayDictionary.getInt(s); - if (idx == DimensionDictionary.ABSENT_VALUE_ID) { - idx = stringArrayDictionary.size(); - reverseStringArrayDictionary.put(s, idx); - stringArrayDictionary.add(s); - } - return idx; + return reverseStringArrayDictionary; } } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java index 81d22bacae3a..0d9f7fc07163 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java @@ -22,7 +22,6 @@ import it.unimi.dsi.fastutil.objects.Object2IntMap; import org.apache.druid.common.config.NullHandling; import org.apache.druid.error.DruidException; -import org.apache.druid.java.util.common.Pair; import org.apache.druid.query.groupby.epinephelinae.DictionaryBuilding; import org.apache.druid.segment.ColumnValueSelector; import org.apache.druid.segment.DimensionHandlerUtils; @@ -32,13 +31,40 @@ import org.apache.druid.segment.data.ArrayBasedIndexedInts; import org.apache.druid.segment.data.IndexedInts; +import javax.annotation.concurrent.NotThreadSafe; import java.util.List; +/** + * Strategy for grouping dimensions which can have variable-width objects, and aren't backed by prebuilt dictionaries. It + * encapsulates the dictionary building logic, along with providing the implementations for dimension to dictionary id + * encoding-decoding. + *

+ * This strategy can handle any dimension that can be addressed on a reverse-dictionary. Reverse dictionary uses + * a sorted map, rather than a hashmap. + * TODO(laksh): Benchmark results + *

+ * This is the most expensive of all the strategies, and hence must be used only when other strategies aren't valid. + */ +@NotThreadSafe public class DictionaryBuildingGroupByColumnSelectorStrategy extends KeyMappingGroupByColumnSelectorStrategy { + /** + * Dictionary for mapping the dimension value to an index. i-th position in the dictionary holds the value represented + * by the dictionaryId "i". + * Therefore, if a value has a dictionary id "i", dictionary.get(i) = value + */ private final List dictionary; + + /** + * Reverse dictionary for faster lookup into the dictionary, and reusing pre-existing dictionary ids. + *

+ * An entry of form (value, i) in the reverse dictionary represents that "value" is present at the i-th location in the + * {@link #dictionary}. + * Absence of mapping of a "value" (denoted by returning {@link GroupByColumnSelectorStrategy#GROUP_BY_MISSING_VALUE}) + * represents that the value is absent in the dictionary + */ private final Object2IntMap reverseDictionary; private DictionaryBuildingGroupByColumnSelectorStrategy( @@ -56,24 +82,34 @@ private DictionaryBuildingGroupByColumnSelectorStrategy( this.reverseDictionary = reverseDictionary; } + /** + * Creates an implementation of the strategy for the given type + */ public static GroupByColumnSelectorStrategy forType(final ColumnType columnType) { - // Any way to use the generics here instead of if (columnType.equals(ColumnType.STRING)) { + // String types are handled specially because they can have multi-value dimensions return forString(); - } else if (columnType.equals(ColumnType.DOUBLE) || columnType.equals(ColumnType.FLOAT) || columnType.equals( - ColumnType.LONG)) { + } else if ( + // Defensive check, primitives should be using a faster fixed-width strategy + columnType.equals(ColumnType.DOUBLE) + || columnType.equals(ColumnType.FLOAT) + || columnType.equals(ColumnType.LONG)) { throw DruidException.defensive("Could used a fixed width strategy"); } + // Catch-all for all other types, that can only have single-valued dimensions return forArrayAndComplexTypes(columnType); } + /** + * Implementation of the dictionary building strategy for string types. + */ private static GroupByColumnSelectorStrategy forString() { final List dictionary = DictionaryBuilding.createDictionary(); final Object2IntMap reverseDictionary = - DictionaryBuilding.createTreeSortedReverseDictionary(ColumnType.STRING.getNullableStrategy()); + DictionaryBuilding.createReverseDictionary(); return new DictionaryBuildingGroupByColumnSelectorStrategy<>( new StringDimensionToIdConverter(dictionary, reverseDictionary), ColumnType.STRING, @@ -85,14 +121,18 @@ private static GroupByColumnSelectorStrategy forString() ); } - // Nothing different about primitive and non-primitive types, however the primitive types are fixed width, therefore - // don't need to use dictionary building strategy. Also, it simplifies the generics because now everything can be treated - // as Object + /** + * Implemenatation of dictionary building strategy for types other than strings (since they can be multi-valued and need + * to be handled separately) and numeric primitives (since they can be handled by fixed-width strategy). + * This also means that we handle array and complex types here, which simplifies the generics a lot, as everything can be + * treated as Object in this class. + *

+ * Also, there isn't any concept of multi-values here, therefore Dimension == DimensionHolderType == Object. We still + * homogenize rogue selectors which can return non-standard implementation of arrays (like Long[] for long arrays instead of + * Object[]) to what the callers would expect (i.e. Object[] in this case). + */ private static GroupByColumnSelectorStrategy forArrayAndComplexTypes(final ColumnType columnType) { - // No concept of multi values, therefore DimensionType == DimensionHolderType == Object. For rogue selectors, which - // can return weird representation of arrays, we cast it using DimensionHandlerUtils, therefore the type might not be strictly - // same, but it would be what the callers expect final List dictionary = DictionaryBuilding.createDictionary(); final Object2IntMap reverseDictionary = DictionaryBuilding.createTreeSortedReverseDictionary(columnType.getNullableStrategy()); @@ -107,9 +147,28 @@ private static GroupByColumnSelectorStrategy forArrayAndComplexTypes(final Colum ); } + /** + * Encodes the multi-valued string dimension to the ids. It replaces the original IndexedInts, with the one containing + * the global dictionary ids, This removes an extra redirection involved while looking up the value. + * + * Therefore, if the input dimension column has two rows, with dimensions like: + * + * (Input) + * Column1 - [1, 2] - lookupName(1) = foo, lookupName(2) = bar + * Column2 - [1, 2, 2] - lookupName(1) = baz, lookupName(2) = foo + * + * The multi-value holders for the column, after conversion would look like: + * Column1 - [1, 2] + * Column2 - [3, 1] + * + * And the dictionary-reverse dictionary would look like: + * Dictionary: [foo, bar, baz] + * Reverse dictionary: (foo, 1), (bar, 2), (baz, 3) + * + * Converting a value from the returned row to the dictId is as simple as fetching the int present at the given location. + */ private static class StringDimensionToIdConverter implements DimensionToIdConverter { - private final List dictionary; private final Object2IntMap reverseDictionary; @@ -123,7 +182,7 @@ public StringDimensionToIdConverter( } @Override - public Pair getMultiValueHolder( + public MemoryEstimate getMultiValueHolder( final ColumnValueSelector selector, final IndexedInts reusableValue ) @@ -153,7 +212,7 @@ public Pair getMultiValueHolder( } } newRow.setSize(rowSize); - return Pair.of(newRow, footprintIncrease); + return new MemoryEstimate<>(newRow, footprintIncrease); } @Override @@ -163,10 +222,10 @@ public int multiValueSize(IndexedInts multiValueHolder) } @Override - public Pair getIndividualValueDictId(IndexedInts multiValueHolder, int index) + public MemoryEstimate getIndividualValueDictId(IndexedInts multiValueHolder, int index) { // Already converted it to the dictionary id - return Pair.of(multiValueHolder.get(index), 0); + return new MemoryEstimate<>(multiValueHolder.get(index), 0); } } @@ -175,6 +234,7 @@ private static class UniValueDimensionToIdConverter implements DimensionToIdConv private final List dictionary; private final Object2IntMap reverseDictionary; private final ColumnType columnType; + @SuppressWarnings("rawtypes") private final NullableTypeStrategy nullableTypeStrategy; public UniValueDimensionToIdConverter( @@ -191,7 +251,7 @@ public UniValueDimensionToIdConverter( } @Override - public Pair getMultiValueHolder(ColumnValueSelector selector, Object reusableValue) + public MemoryEstimate getMultiValueHolder(ColumnValueSelector selector, Object reusableValue) { final Object value = DimensionHandlerUtils.convertObjectToType(selector.getObject(), columnType); final int dictId = reverseDictionary.getInt(value); @@ -203,17 +263,18 @@ public Pair getMultiValueHolder(ColumnValueSelector selector, O footprintIncrease = DictionaryBuilding.estimateEntryFootprint(nullableTypeStrategy.estimateSizeBytes(value)); } - return Pair.of(value, footprintIncrease); + return new MemoryEstimate<>(value, footprintIncrease); } @Override public int multiValueSize(Object multiValueHolder) { + //noinspection VariableNotUsedInsideIf return multiValueHolder == null ? 0 : 1; } @Override - public Pair getIndividualValueDictId(Object multiValueHolder, int index) + public MemoryEstimate getIndividualValueDictId(Object multiValueHolder, int index) { assert index == 0; int dictId = reverseDictionary.getInt(multiValueHolder); @@ -225,14 +286,18 @@ public Pair getIndividualValueDictId(Object multiValueHolder, reverseDictionary.put(multiValueHolder, size); dictId = size; // TODO(laksh): confirm if this is the same for sorted dictionaries as well - footprintIncrease = DictionaryBuilding.estimateEntryFootprint(nullableTypeStrategy.estimateSizeBytes( - multiValueHolder)); + footprintIncrease = DictionaryBuilding.estimateEntryFootprint( + nullableTypeStrategy.estimateSizeBytes(multiValueHolder) + ); } - return Pair.of(dictId, footprintIncrease); + return new MemoryEstimate<>(dictId, footprintIncrease); } } + /** + * Defers to the dictionary we have built to decode the dictionary id + */ private static class DictionaryIdToDimensionConverter implements IdToDimensionConverter { private final List dictionary; @@ -242,16 +307,18 @@ public DictionaryIdToDimensionConverter(List dictionary) this.dictionary = dictionary; } - // Don't need to handle default id value @Override public DimensionType idToKey(int id) { + // No need to handle GROUP_BY_MISSING_VALUE, by contract return dictionary.get(id); } @Override public boolean canCompareIds() { + // Dictionaries are built on the fly, and ids are assigned in the order in which the value is added to the + // dictionary. return false; } } @@ -259,6 +326,8 @@ public boolean canCompareIds() @Override public void reset() { + super.reset(); + // Clean up the dictionaries dictionary.clear(); reverseDictionary.clear(); } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java index 8541d6a5b165..d989a73b18ec 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java @@ -19,15 +19,59 @@ package org.apache.druid.query.groupby.epinephelinae.column; -import org.apache.druid.java.util.common.Pair; import org.apache.druid.segment.ColumnValueSelector; -// Don't really use DimensionHolderType anywhere for now, we cast stuff everywhere, but perhaps with new selectors, we can +import javax.annotation.Nullable; + +/** + * Interface for converters of dimension to dictionary id. + * + * This is a slightly convoluted interface because it also encapsulates the additional logic for handling multi-value + * dimensions. It has an additional step that converts the given dimensions to "dimension holders", which represent the + * multi-value holders for a given dimension. + * Therefore, the conversion goes from ColumnValueSelector -> DimensionHolder -> DictionaryID (for each dimension in the holder) + * + * The dimension holder is only applicable for multi-value strings. + * For other dimensions that cannot have multi-values the dimension holder is identical to the dimension. They can be + * defensively cast or homogenised, for example doubles to floats for float selectors or Long[] to Object[] for array + * selectors, so that the upstream callers can assume the class of the dimensions. The size of these dimensions is always 1, + * and only contain a value at index 0. + * + * Converting a value to its dictionary id might require building dictionaries on the fly while computing the id. The + * return type of the methods, except {@link #multiValueSize}, takes that into account. + * + * The implementations can pre-convert the value to the dictionaryId while extracting the dimensionHolder. Extracting + * dictionary id for a specific value from the (potentially multi-value dimension holder) can be done by calling + * {@link #getIndividualValueDictId} and passing the index to the multi-value. + * + * @see IdToDimensionConverter for converting the dictionary values back to dimensions + * + * @param Type of the dimension holder + */ public interface DimensionToIdConverter { - Pair getMultiValueHolder(ColumnValueSelector selector, DimensionHolderType reusableValue); + /** + * @param selector Column value selector to extract the dimension holder from + * @param reusableValue Dimension holder can be reused throughout multiple calls to prevent reallocation of memory + * or arrays. The older value can be disregarded and the object can be reused for freely by this call. + * @return DimensionHolder associated with the selector, and the internal dictionary increase associated with it + */ + MemoryEstimate getMultiValueHolder( + ColumnValueSelector selector, + // TODO(laksh): This is always null. Find a way to use this or remove this parameter + @Nullable DimensionHolderType reusableValue + ); + /** + * @param multiValueHolder Multi value holder obtained from call to {@link #getMultiValueHolder} + * @return Size of the multi-value dimension + */ int multiValueSize(DimensionHolderType multiValueHolder); - Pair getIndividualValueDictId(DimensionHolderType multiValueHolder, int index); + /** + * @param multiValueHolder Multi value holder obtained from call to {@link #getMultiValueHolder} + * @param index Index of the value inside the multi-value holder to obtain + * @return DictionaryId of the object at the given index + */ + MemoryEstimate getIndividualValueDictId(DimensionHolderType multiValueHolder, int index); } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java index d4a91d7bee85..91972ab42140 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java @@ -19,6 +19,7 @@ package org.apache.druid.query.groupby.epinephelinae.column; +import org.apache.druid.error.DruidException; import org.apache.druid.query.DimensionComparisonUtils; import org.apache.druid.query.groupby.ResultRow; import org.apache.druid.query.groupby.epinephelinae.Grouper; @@ -32,35 +33,52 @@ import javax.annotation.concurrent.NotThreadSafe; import java.nio.ByteBuffer; -// Used only by primitives right now, however specialized complex types can reuse this once we have a way to extract -// the required info -// Doesn't work with multi value dimensions, as only strings are multi-valued which are handled elsewhere. -// Not thread safe because does weird stuff with buffer's position while reading +/** + * Strategy for grouping dimensions which have fixed-width objects. It is only used for numeric primitive types, + * however complex types can reuse this strategy if they can hint the engine that they are always fixed width + * (for e.g. IP types). Such types donot need to be backed by a dictionary, and hence are faster to group by. + * + * @param Class of the dimension + */ @NotThreadSafe public class FixedWidthGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy { - - final int keySize; + /** + * Size of the key when materialized as bytes + */ + final int keySizeBytes; + + /** + * Indicates whether the type is primitive or not + */ final boolean isPrimitive; + + /** + * Type of the dimension on which the grouping strategy is being used + */ final ColumnType columnType; + + /** + * Nullable type strategy of the dimension + */ final NullableTypeStrategy nullableTypeStrategy; public FixedWidthGroupByColumnSelectorStrategy( - int keySize, + int keySizeBytes, boolean isPrimitive, ColumnType columnType ) { - this.keySize = keySize; + this.keySizeBytes = keySizeBytes; this.isPrimitive = isPrimitive; this.columnType = columnType; this.nullableTypeStrategy = columnType.getNullableStrategy(); } @Override - public int getGroupingKeySize() + public int getGroupingKeySizeBytes() { - return keySize; + return keySizeBytes; } @Override @@ -80,11 +98,6 @@ public void processValueFromGroupingKey( @Override public int initColumnValues(ColumnValueSelector selector, int columnIndex, Object[] valuess) { - // It is expected of the primitive selectors to be returning default value of the implementation here. In the - // getObject(), if it returns null, it won't - // Here the primitive selectors should have returned correct values - float shouldn't return longs and vice versa - // Perhaps we'd require a cast as well, which is done implicitly when we call the .getLong/.getFloat/.getDouble - valuess[columnIndex] = getValue(selector); return 0; } @@ -98,14 +111,25 @@ public void initGroupingKeyColumnValue( int[] stack ) { + int written; if (rowObj == null) { - nullableTypeStrategy.write(keyBuffer, keyBufferPosition, null, keySize); + written = nullableTypeStrategy.write(keyBuffer, keyBufferPosition, null, keySizeBytes); + stack[dimensionIndex] = 0; } else { - nullableTypeStrategy.write(keyBuffer, keyBufferPosition, (T) rowObj, keySize); + written = nullableTypeStrategy.write(keyBuffer, keyBufferPosition, (T) rowObj, keySizeBytes); stack[dimensionIndex] = 1; } + // Since this is a fixed width strategy, the caller should already have allocated enough space to materialize the + // key object, and the type strategy should always be able to write to the buffer + if (written < 0) { + throw DruidException.defensive("Unable to serialize the value [%s] to buffer", rowObj); + } } + /** + * This is used for multi-valued dimensions, for values after the first one. None of the current types supported by + * this strategy handle multi-valued dimensions, therefore this short circuits and returns false + */ @Override public boolean checkRowIndexAndAddValueToGroupingKey( int keyBufferPosition, @@ -124,7 +148,12 @@ public int writeToKeyBuffer( ByteBuffer keyBuffer ) { - nullableTypeStrategy.write(keyBuffer, keyBufferPosition, getValue(selector), keySize); + T value = getValue(selector); + int written = nullableTypeStrategy.write(keyBuffer, keyBufferPosition, value, keySizeBytes); + if (written < 0) { + throw DruidException.defensive("Unable to serialize the value [%s] to buffer", value); + } + // This strategy doesn't use dictionary building and doesn't hold any internal state, therefore size increase is nil. return 0; } @@ -142,6 +171,7 @@ public Grouper.BufferComparator bufferComparator( return stringComparator.compare(String.valueOf(lhs), String.valueOf(rhs)); } // Nulls are allowed while comparing + //noinspection ConstantConditions return nullableTypeStrategy.compare(lhs, rhs); }; } @@ -153,7 +183,10 @@ public void reset() // Nothing to reset } - // unifies the primitive and th + /** + * Returns true if the value at the selector is null. It unifies the null handling of primitive numeric types and the + * other types + */ private boolean selectorIsNull(ColumnValueSelector columnValueSelector) { if (isPrimitive && columnValueSelector.isNull()) { @@ -162,15 +195,20 @@ private boolean selectorIsNull(ColumnValueSelector columnValueSelector) return !isPrimitive && (columnValueSelector.getObject() == null); } - // Handles primitives as well, also objercts case + /** + * Returns the value of the selector. It handles nullity of the value and casts it to the proper type so that the + * upstream callers donot need to worry about handling incorrect types (for example, if a double column value selector + * returns a long) + */ @Nullable private T getValue(ColumnValueSelector columnValueSelector) { if (selectorIsNull(columnValueSelector)) { return null; } - // cast is safe + // TODO(laksh): Check if calling .getObject() on primitive selectors be problematic?? + // Convert the object to the desired type + //noinspection unchecked return (T) DimensionHandlerUtils.convertObjectToType(columnValueSelector.getObject(), columnType); } - } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/GroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/GroupByColumnSelectorStrategy.java index 34a4fd2b21fc..bb7ac01aafc1 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/GroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/GroupByColumnSelectorStrategy.java @@ -45,6 +45,9 @@ */ public interface GroupByColumnSelectorStrategy extends ColumnSelectorStrategy { + /** + * Index to indicate the absence of a key in the dictionary + */ int GROUP_BY_MISSING_VALUE = -1; /** @@ -54,7 +57,7 @@ public interface GroupByColumnSelectorStrategy extends ColumnSelectorStrategy * * @return size, in bytes, of this dimension's values in the grouping key. */ - int getGroupingKeySize(); + int getGroupingKeySizeBytes(); /** * Read a value from a grouping key and add it to the group by query result row, using the output name specified @@ -117,7 +120,9 @@ void initGroupingKeyColumnValue( /** * If rowValIdx is less than the size of rowObj (haven't handled all of the row values): * First, read the value at rowValIdx from a rowObj and write that value to the keyBuffer at keyBufferPosition. - * Then return true + * Then return true. + * This method assumes that the size increase associated with the dictionary building has occurred already when calling + * {@link #initColumnValues} * * Otherwise, return false. * diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java index 34545072567d..979f8f7b0829 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java @@ -19,10 +19,38 @@ package org.apache.druid.query.groupby.epinephelinae.column; -// Doesn't handle GROUP_BY_MISSING_VALUE, should be done by the callers +/** + * Converts back the dictionaryId to the dimension value. The implementations might or might not handle + * {@link GroupByColumnSelectorStrategy#GROUP_BY_MISSING_VALUE}. The callers should handle those values appropriately on + * their own, and filter those out before trying to convert the dictionary id back to value. + * + * The encoding - decoding workflow looks like: + * + * Encoding + * 1. {@link DimensionToIdConverter} extracts the multi-value holder for the given row, which get's stored somewhere + * 2. For each entry in the multi-value object, the value gets encoded into a dictionaryId, using {@link DimensionToIdConverter#getIndividualValueDictId} + * 3. The callers can use this integer dictionaryID to materialize the results somewhere + * + * Decoding + * 1. The materialized dictionary id is deserialized back to an int, and then decoded into value using {@link #idToKey} + * + * @see DimensionToIdConverter for converting the dimensions to dictionary ids + * + * @param Type of the dimension's values + */ public interface IdToDimensionConverter { + /** + * Decodes the dictionaryId back to the dimensionKey + */ DimensionType idToKey(int id); + /** + * Returns if the object comparison can be optimised directly by comparing the dictionaryIds, instead of decoding the + * objects and comparing those. Therefore, it returns true iff the "dict" function defined by dict(id) = value is + * monotonically increasing. + * + * Ids backed by dictionaries built on the fly can never be compared, therefore those should always return false. + */ boolean canCompareIds(); } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java index 2e22405df196..bea2bacb62fe 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java @@ -20,7 +20,6 @@ package org.apache.druid.query.groupby.epinephelinae.column; import com.google.common.base.Preconditions; -import org.apache.druid.java.util.common.Pair; import org.apache.druid.query.DimensionComparisonUtils; import org.apache.druid.query.groupby.ResultRow; import org.apache.druid.query.groupby.epinephelinae.Grouper; @@ -34,20 +33,51 @@ import javax.annotation.concurrent.NotThreadSafe; import java.nio.ByteBuffer; -// Only supports int mapping. -// DimensionType is the dimension's type - eg strings -// DimensionHolderType is the multi value holder for the dimension, if it exists, else it will be same as DimensionType +/** + * Strategy for grouping dimensions which can have variable-width objects. Materializing such objects on the buffer + * require an additional step of mapping them to an integer index. The integer index can be materialized on the buffer within + * a fixed width, and is often backed by a dictionary representing the actual dimension object. It is used for arrays, + * strings, and complex types. + * + * The visibility of the class is limited, and the callers must use one of the two variants of the mapping strategy: + * 1. {@link PrebuiltDictionaryStringGroupByColumnSelectorStrategy} + * 2. {@link DictionaryBuildingGroupByColumnSelectorStrategy} + * + * @param > Class of the dimension + * @param Class of the "dimension holder". For single-value dimensions, the holder's type and the + * holder's object are equivalent to the dimension. For multi-value dimensions (only strings), + * the holder's type and the object are different, where the type would be {@link org.apache.druid.segment.data.IndexedInts} + * representing all the values in the multi-valued string, while the dimension type would be + * String + * + * @see DimensionToIdConverter encoding logic for converting value to dictionary + * @see IdToDimensionConverter decoding logic for converting back dictionary to value + */ @NotThreadSafe -public class KeyMappingGroupByColumnSelectorStrategy +class KeyMappingGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy { + /** + * Converts the dimension to equivalent dictionaryId. + */ final DimensionToIdConverter dimensionToIdConverter; + + /** + * Type of the dimension on which the grouping strategy is used + */ final ColumnType columnType; + + /** + * Nullable type strategy of the dimension + */ final NullableTypeStrategy nullableTypeStrategy; + + /** + * Default value of the dimension + */ final DimensionType defaultValue; final IdToDimensionConverter idToDimensionConverter; - // Restricted access, callers should use one of it's subclasses KeyMappingGroupByColumnSelectorStrategy( final DimensionToIdConverter dimensionToIdConverter, final ColumnType columnType, @@ -63,8 +93,11 @@ public class KeyMappingGroupByColumnSelectorStrategy multiValueHolderAndSizeIncrease = dimensionToIdConverter.getMultiValueHolder(selector, null); - valuess[columnIndex] = multiValueHolderAndSizeIncrease.lhs; - return multiValueHolderAndSizeIncrease.rhs; + MemoryEstimate multiValueHolder = dimensionToIdConverter.getMultiValueHolder(selector, null); + valuess[columnIndex] = multiValueHolder.value(); + return multiValueHolder.memoryIncrease(); } @Override @@ -110,9 +143,12 @@ public void initGroupingKeyColumnValue( keyBuffer.putInt(keyBufferPosition, GROUP_BY_MISSING_VALUE); stack[dimensionIndex] = 0; } else { - // No need to check here, since we'd have already accounted for it when we call - // initColumnValues - keyBuffer.putInt(keyBufferPosition, dimensionToIdConverter.getIndividualValueDictId(rowObjCasted, 0).lhs); + MemoryEstimate dictionaryIdAndMemoryIncrease = + dimensionToIdConverter.getIndividualValueDictId(rowObjCasted, 0); + // We should have already accounted for the memory increase when we call initColumnValues(). Dictionary building for + // all the values in the dimension (potentially multi-valued) should have happened there + assert dictionaryIdAndMemoryIncrease.memoryIncrease() == 0; + keyBuffer.putInt(keyBufferPosition, dictionaryIdAndMemoryIncrease.value()); stack[dimensionIndex] = 1; } } @@ -125,12 +161,20 @@ public boolean checkRowIndexAndAddValueToGroupingKey( ByteBuffer keyBuffer ) { + // Casting is fine, because while extracting the multiValueHolder, the implementations must ensure that the returned "multi-value" + // type is what the callers here expect + //noinspection unchecked DimensionHolderType rowObjCasted = (DimensionHolderType) rowObj; int rowSize = dimensionToIdConverter.multiValueSize(rowObjCasted); if (rowValIdx < rowSize) { + MemoryEstimate dictionaryIdAndMemoryIncrease = + dimensionToIdConverter.getIndividualValueDictId(rowObjCasted, rowValIdx); + // We should have already accounted for the memory increase when we call initColumnValues(). Dictionary building for + // all the values in the dimension (potentially multi-valued) should have happened there + assert dictionaryIdAndMemoryIncrease.memoryIncrease() == 0; keyBuffer.putInt( keyBufferPosition, - dimensionToIdConverter.getIndividualValueDictId(rowObjCasted, rowValIdx).lhs + dictionaryIdAndMemoryIncrease.value() ); return true; } else { @@ -141,16 +185,15 @@ public boolean checkRowIndexAndAddValueToGroupingKey( @Override public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) { - Pair multiValueHolder = dimensionToIdConverter.getMultiValueHolder(selector, null); - int multiValueSize = dimensionToIdConverter.multiValueSize(multiValueHolder.lhs); + MemoryEstimate multiValueHolder = dimensionToIdConverter.getMultiValueHolder(selector, null); + int multiValueSize = dimensionToIdConverter.multiValueSize(multiValueHolder.value()); Preconditions.checkState(multiValueSize < 2, "Not supported for multi-value dimensions"); - Pair dictIdAndSizeIncrease = dimensionToIdConverter.getIndividualValueDictId(multiValueHolder.lhs, 0); - final int dictId = multiValueSize == 1 ? dictIdAndSizeIncrease.lhs : GROUP_BY_MISSING_VALUE; + MemoryEstimate dictIdAndSizeIncrease = dimensionToIdConverter.getIndividualValueDictId(multiValueHolder.value(), 0); + final int dictId = multiValueSize == 1 ? dictIdAndSizeIncrease.value() : GROUP_BY_MISSING_VALUE; keyBuffer.putInt(keyBufferPosition, dictId); // The implementations must return a non-nullable and non-negative size increase - //noinspection ConstantConditions - return multiValueHolder.rhs + dictIdAndSizeIncrease.rhs; + return multiValueHolder.memoryIncrease() + dictIdAndSizeIncrease.memoryIncrease(); } @Override @@ -183,7 +226,6 @@ public Grouper.BufferComparator bufferComparator(int keyBufferPosition, @Nullabl @Override public void reset() { - + // Nothing to do here. Implementations which build dictionaries should clean them in the reset method. } - } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MemoryEstimate.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MemoryEstimate.java new file mode 100644 index 000000000000..da02ca143f9d --- /dev/null +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MemoryEstimate.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.groupby.epinephelinae.column; + +/** + * Holder for a value and the memory increase in the internal dictionary associated with the increase + */ +public class MemoryEstimate +{ + private final T value; + private final int memoryIncrease; + + // Reduced visibility + MemoryEstimate(T value, int memoryIncrease) + { + this.value = value; + this.memoryIncrease = memoryIncrease; + } + + public T value() + { + return value; + } + + public int memoryIncrease() + { + return memoryIncrease; + } +} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java index 430c50542803..f7306849f400 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java @@ -21,7 +21,6 @@ import org.apache.druid.common.config.NullHandling; import org.apache.druid.error.DruidException; -import org.apache.druid.java.util.common.Pair; import org.apache.druid.segment.ColumnValueSelector; import org.apache.druid.segment.DimensionSelector; import org.apache.druid.segment.column.ColumnCapabilities; @@ -30,11 +29,18 @@ import javax.annotation.Nullable; -// Note: Avoiding anonymous classes -// This is more of a helper class, as it just creates an instance of the KeyMappingGroupingColumnSelectorStrategy +/** + * Implementation of {@link KeyMappingGroupByColumnSelectorStrategy} that relies on a prebuilt dictionary to map the + * dimension to the dictionaryId. It is more like a helper class, that handles the different ways that dictionaries can be + * provided for different types. Currently, it only handles String dimensions. Array dimensions are also backed by dictionaries, + * but not exposed via the ColumnValueSelector interface, hence this strategy cannot handle array dimensions. + */ public class PrebuiltDictionaryStringGroupByColumnSelectorStrategy { + /** + * Create the strategy for the provided column type + */ public static GroupByColumnSelectorStrategy forType( final ColumnType columnType, final ColumnValueSelector columnValueSelector, @@ -44,7 +50,7 @@ public static GroupByColumnSelectorStrategy forType( if (columnType.equals(ColumnType.STRING)) { return forString(columnValueSelector, columnCapabilities); } else { - // This can change with array columns + // This will change with array columns throw DruidException.defensive("Only string columns expose prebuilt dictionaries"); } } @@ -63,15 +69,20 @@ private static GroupByColumnSelectorStrategy forString( ); } + /** + * Dimension to id converter for string dimensions and {@link DimensionSelector}, where the dictionaries are prebuilt. + * The callers must ensure that's the case by checking that {@link DimensionSelector#getValueCardinality()} is known + * and {@link DimensionSelector#nameLookupPossibleInAdvance()} is true. + */ private static class StringDimensionToIdConverter implements DimensionToIdConverter { @Override - public Pair getMultiValueHolder( + public MemoryEstimate getMultiValueHolder( final ColumnValueSelector selector, final IndexedInts reusableValue ) { - return Pair.of(((DimensionSelector) selector).getRow(), 0); + return new MemoryEstimate<>(((DimensionSelector) selector).getRow(), 0); } @Override @@ -81,16 +92,21 @@ public int multiValueSize(IndexedInts multiValueHolder) } @Override - public Pair getIndividualValueDictId(IndexedInts multiValueHolder, int index) + public MemoryEstimate getIndividualValueDictId(IndexedInts multiValueHolder, int index) { - return Pair.of(multiValueHolder.get(index), 0); + // dictId is already encoded in the indexedInt supplied by the column value selector + return new MemoryEstimate<>(multiValueHolder.get(index), 0); } } + /** + * ID to dimension converter for {@link DimensionSelector} with prebuilt dictionary + */ private static class StringIdToDimensionConverter implements IdToDimensionConverter { final DimensionSelector dimensionSelector; + @Nullable final ColumnCapabilities columnCapabilities; @@ -106,6 +122,7 @@ public StringIdToDimensionConverter( @Override public String idToKey(int id) { + // Converting back to the value is as simple as looking up the value in the prebuilt dictionary return dimensionSelector.lookupName(id); } From f159f67889fb6b6889309bfbd17910c327525538 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Fri, 15 Mar 2024 04:00:21 +0530 Subject: [PATCH 11/46] add benchmarks --- .../benchmark/query/SqlGroupByBenchmark.java | 368 ++++++++++++++++++ .../generator/GeneratorBasicSchemas.java | 45 +++ 2 files changed, 413 insertions(+) create mode 100644 benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java new file mode 100644 index 000000000000..f1de72233709 --- /dev/null +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java @@ -0,0 +1,368 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.benchmark.query; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import org.apache.druid.common.config.NullHandling; +import org.apache.druid.data.input.impl.DimensionSchema; +import org.apache.druid.data.input.impl.DimensionsSpec; +import org.apache.druid.java.util.common.StringUtils; +import org.apache.druid.java.util.common.granularity.Granularities; +import org.apache.druid.java.util.common.guava.Sequence; +import org.apache.druid.java.util.common.io.Closer; +import org.apache.druid.java.util.common.logger.Logger; +import org.apache.druid.math.expr.ExpressionProcessing; +import org.apache.druid.query.DruidProcessingConfig; +import org.apache.druid.query.QueryRunnerFactoryConglomerate; +import org.apache.druid.query.expression.TestExprMacroTable; +import org.apache.druid.segment.AutoTypeColumnSchema; +import org.apache.druid.segment.IndexSpec; +import org.apache.druid.segment.QueryableIndex; +import org.apache.druid.segment.column.StringEncodingStrategy; +import org.apache.druid.segment.generator.GeneratorBasicSchemas; +import org.apache.druid.segment.generator.GeneratorSchemaInfo; +import org.apache.druid.segment.generator.SegmentGenerator; +import org.apache.druid.segment.transform.ExpressionTransform; +import org.apache.druid.segment.transform.TransformSpec; +import org.apache.druid.server.QueryStackTests; +import org.apache.druid.server.SpecificSegmentsQuerySegmentWalker; +import org.apache.druid.server.security.AuthConfig; +import org.apache.druid.server.security.AuthTestUtils; +import org.apache.druid.sql.calcite.SqlVectorizedExpressionSanityTest; +import org.apache.druid.sql.calcite.planner.CalciteRulesManager; +import org.apache.druid.sql.calcite.planner.CatalogResolver; +import org.apache.druid.sql.calcite.planner.DruidPlanner; +import org.apache.druid.sql.calcite.planner.PlannerConfig; +import org.apache.druid.sql.calcite.planner.PlannerFactory; +import org.apache.druid.sql.calcite.planner.PlannerResult; +import org.apache.druid.sql.calcite.run.SqlEngine; +import org.apache.druid.sql.calcite.schema.DruidSchemaCatalog; +import org.apache.druid.sql.calcite.util.CalciteTests; +import org.apache.druid.timeline.DataSegment; +import org.apache.druid.timeline.partition.LinearShardSpec; +import org.apache.logging.log4j.core.util.Integers; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +import javax.annotation.Nullable; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.TimeUnit; + +@State(Scope.Benchmark) +@Fork(value = 1) +@Warmup(iterations = 3) +@Measurement(iterations = 5) +public class SqlGroupByBenchmark +{ + private static final Logger log = new Logger(SqlGroupByBenchmark.class); + + static { + NullHandling.initializeForTests(); + ExpressionProcessing.initializeForTests(); + } + + private static final DruidProcessingConfig PROCESSING_CONFIG = new DruidProcessingConfig() + { + @Override + public int intermediateComputeSizeBytes() + { + return 512 * 1024 * 1024; + } + + @Override + public int getNumMergeBuffers() + { + return 3; + } + + @Override + public int getNumThreads() + { + return 1; + } + + @Override + public String getFormatString() + { + return "benchmarks-processing-%s"; + } + }; + + @Param({ + "string-Sequential-100_000", + "string-Sequential-10_000_000", + // "string-Sequential-1_000_000_000", + "string-ZipF-1_000_000", + "string-Uniform-1_000_000", + + "multi-string-Sequential-100_000", + "multi-string-Sequential-10_000_000", + // "multi-string-Sequential-1_000_000_000", + "multi-string-ZipF-1_000_000", + "multi-string-Uniform-1_000_000", + + "long-Sequential-100_000", + "long-Sequential-10_000_000", + // "long-Sequential-1_000_000_000", + "long-ZipF-1_000_000", + "long-Uniform-1_000_000", + + "double-ZipF-1_000_000", + "double-Uniform-1_000_000", + + "float-ZipF-1_000_000", + "float-Uniform-1_000_000", + + "stringArray-Sequential-100_000", + "stringArray-Sequential-10_000_000", + // "stringArray-Sequential-1_000_000_000", + "stringArray-ZipF-1_000_000", + "stringArray-Uniform-1_000_000", + + "longArray-Sequential-100_000", + "longArray-Sequential-10_000_000", + // "longArray-Sequential-1_000_000_000", + "longArray-ZipF-1_000_000", + "longArray-Uniform-1_000_000", + + "nested-Sequential-100_000", + "nested-Sequential-10_000_000", + // "nested-Sequential-1_000_000_000", + "nested-ZipF-1_000_000", + "nested-Uniform-1_000_000", + }) + private String groupingDimension; + + private SqlEngine engine; + @Nullable + private PlannerFactory plannerFactory; + private Closer closer = Closer.create(); + + @Setup(Level.Trial) + public void setup() + { + final GeneratorSchemaInfo schemaInfo = GeneratorBasicSchemas.SCHEMA_MAP.get("groupBy-testbench"); + + final DataSegment dataSegment = DataSegment.builder() + .dataSource("foo") + .interval(schemaInfo.getDataInterval()) + .version("1") + .shardSpec(new LinearShardSpec(0)) + .size(0) + .build(); + final DataSegment dataSegment2 = DataSegment.builder() + .dataSource("foo") + .interval(schemaInfo.getDataInterval()) + .version("1") + .shardSpec(new LinearShardSpec(1)) + .size(0) + .build(); + + + final PlannerConfig plannerConfig = new PlannerConfig(); + + String columnCardinalityWithUnderscores = groupingDimension.substring(groupingDimension.lastIndexOf('-') + 1); + int rowsPerSegment = Integers.parseInt(columnCardinalityWithUnderscores.replaceAll("_", "")); + + final SegmentGenerator segmentGenerator = closer.register(new SegmentGenerator()); + log.info("Starting benchmark setup using cacheDir[%s], rows[%,d].", segmentGenerator.getCacheDir(), rowsPerSegment); + + TransformSpec transformSpec = new TransformSpec( + null, + ImmutableList.of( + // string array dims + new ExpressionTransform( + "stringArray-Sequential-100_000", + "array(\"string-Sequential-100_000\")", + TestExprMacroTable.INSTANCE + ), + new ExpressionTransform( + "stringArray-Sequential-10_000_000", + "array(\"string-Sequential-10_000_000\")", + TestExprMacroTable.INSTANCE + ), + /* + new ExpressionTransform( + "stringArray-Sequential-1_000_000_000", + "array(\"string-Sequential-1_000_000_000\")", + TestExprMacroTable.INSTANCE + ),*/ + new ExpressionTransform( + "stringArray-ZipF-1_000_000", + "array(\"string-ZipF-1_000_000\")", + TestExprMacroTable.INSTANCE + ), + new ExpressionTransform( + "stringArray-Uniform-1_000_000", + "array(\"string-Uniform-1_000_000\")", + TestExprMacroTable.INSTANCE + ), + + // long array dims + new ExpressionTransform( + "longArray-Sequential-100_000", + "array(\"long-Sequential-100_000\")", + TestExprMacroTable.INSTANCE + ), + new ExpressionTransform( + "longArray-Sequential-10_000_000", + "array(\"long-Sequential-10_000_000\")", + TestExprMacroTable.INSTANCE + ), + /* + new ExpressionTransform( + "longArray-Sequential-1_000_000_000", + "array(\"long-Sequential-1_000_000_000\")", + TestExprMacroTable.INSTANCE + ),*/ + new ExpressionTransform( + "longArray-ZipF-1_000_000", + "array(\"long-ZipF-1_000_000\")", + TestExprMacroTable.INSTANCE + ), + new ExpressionTransform( + "longArray-Uniform-1_000_000", + "array(\"long-Uniform-1_000_000\")", + TestExprMacroTable.INSTANCE + ), + + // nested complex json dim + new ExpressionTransform( + "nested-Sequential-100_000", + "json_object('long1', \"long-Sequential-100_000\", 'nesteder', json_object('long1', \"long-Sequential-100_000\"))", + TestExprMacroTable.INSTANCE + ), + new ExpressionTransform( + "nested-Sequential-10_000_000", + "json_object('long1', \"long-Sequential-10_000_000\", 'nesteder', json_object('long1', \"long-Sequential-10_000_000\"))", + TestExprMacroTable.INSTANCE + ), + /*new ExpressionTransform( + "nested-Sequential-1_000_000_000", + "json_object('long1', \"long-Sequential-1_000_000_000\", 'nesteder', json_object('long1', \"long-Sequential-1_000_000_000\"))", + TestExprMacroTable.INSTANCE + ),*/ + new ExpressionTransform( + "nested-ZipF-1_000_000", + "json_object('long1', \"long-ZipF-1_000_000\", 'nesteder', json_object('long1', \"long-ZipF-1_000_000\"))", + TestExprMacroTable.INSTANCE + ), + new ExpressionTransform( + "nested-Uniform-1_000_000", + "json_object('long1', \"long-Uniform-1_000_000\", 'nesteder', json_object('long1', \"long-Uniform-1_000_000\"))", + TestExprMacroTable.INSTANCE + ) + ) + ); + List dims = ImmutableList.builder() + .addAll(schemaInfo.getDimensionsSpec().getDimensions()) + .add(new AutoTypeColumnSchema("nested", null)) + .build(); + DimensionsSpec dimsSpec = new DimensionsSpec(dims); + + + + final QueryableIndex index = segmentGenerator.generate( + dataSegment, + schemaInfo, + dimsSpec, + transformSpec, + IndexSpec.builder().withStringDictionaryEncoding(new StringEncodingStrategy.Utf8()).build(), + Granularities.NONE, + rowsPerSegment + ); + + final QueryRunnerFactoryConglomerate conglomerate = QueryStackTests.createQueryRunnerFactoryConglomerate( + closer, + PROCESSING_CONFIG + ); + + final SpecificSegmentsQuerySegmentWalker walker = SpecificSegmentsQuerySegmentWalker.createWalker(conglomerate) + .add(dataSegment, index) + .add(dataSegment2, index); + closer.register(walker); + + final DruidSchemaCatalog rootSchema = + CalciteTests.createMockRootSchema(conglomerate, walker, plannerConfig, AuthTestUtils.TEST_AUTHORIZER_MAPPER); + engine = CalciteTests.createMockSqlEngine(walker, conglomerate); + plannerFactory = new PlannerFactory( + rootSchema, + CalciteTests.createOperatorTable(), + CalciteTests.createExprMacroTable(), + plannerConfig, + AuthTestUtils.TEST_AUTHORIZER_MAPPER, + CalciteTests.getJsonMapper(), + CalciteTests.DRUID_SCHEMA_NAME, + new CalciteRulesManager(ImmutableSet.of()), + CalciteTests.createJoinableFactoryWrapper(), + CatalogResolver.NULL_RESOLVER, + new AuthConfig() + ); + + try { + SqlVectorizedExpressionSanityTest.sanityTestVectorizedSqlQueries( + plannerFactory, + sqlQuery(groupingDimension) + ); + } + catch (Throwable ignored) { + // the show must go on + } + } + + @TearDown(Level.Trial) + public void tearDown() throws Exception + { + closer.close(); + } + + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.MILLISECONDS) + public void querySql(Blackhole blackhole) + { + final String sql = sqlQuery(groupingDimension); + try (final DruidPlanner planner = plannerFactory.createPlannerForTesting(engine, sql, Collections.emptyMap())) { + final PlannerResult plannerResult = planner.plan(); + final Sequence resultSequence = plannerResult.run().getResults(); + final Object[] lastRow = resultSequence.accumulate(null, (accumulated, in) -> in); + blackhole.consume(lastRow); + } + } + + private static String sqlQuery(String groupingDimension) + { + return StringUtils.format("SELECT \"%s\", COUNT(*) FROM foo GROUP BY 1", groupingDimension); + } +} diff --git a/processing/src/main/java/org/apache/druid/segment/generator/GeneratorBasicSchemas.java b/processing/src/main/java/org/apache/druid/segment/generator/GeneratorBasicSchemas.java index c6b05a4dda7d..0727309654b5 100644 --- a/processing/src/main/java/org/apache/druid/segment/generator/GeneratorBasicSchemas.java +++ b/processing/src/main/java/org/apache/druid/segment/generator/GeneratorBasicSchemas.java @@ -35,6 +35,7 @@ import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Map; @@ -364,6 +365,50 @@ public class GeneratorBasicSchemas SCHEMA_INFO_BUILDER.put("expression-testbench", expressionsTestsSchema); } + static { + // schema for benchmarking group-by + List expressionsTestsSchemaColumns = ImmutableList.of( + // string dims + GeneratorColumnSchema.makeSequential("string-Sequential-100_000", ValueType.STRING, false, 1, null, 0, 100_000), + GeneratorColumnSchema.makeSequential("string-Sequential-10_000_000", ValueType.STRING, false, 1, null, 0, 10_000_000), + // GeneratorColumnSchema.makeSequential("string-Sequential-1_000_000_000", ValueType.STRING, false, 1, null, 0, 1_000_000_000), + GeneratorColumnSchema.makeLazyZipf("string-ZipF-1_000_000", ValueType.STRING, false, 1, 0.1, 0, 1_000_000, 2.0), + GeneratorColumnSchema.makeLazyDiscreteUniform("string-Uniform-1_000_000", ValueType.STRING, false, 1, 0.3, 0, 1_000_000), + + // multi string dims + GeneratorColumnSchema.makeSequential("multi-string-Sequential-100_000", ValueType.STRING, false, 8, null, 0, 100_000), + GeneratorColumnSchema.makeSequential("multi-string-Sequential-10_000_000", ValueType.STRING, false, 8, null, 0, 10_000_000), + // GeneratorColumnSchema.makeSequential("multi-string-Sequential-1_000_000_000", ValueType.STRING, false, 8, null, 0, 1_000_000_000), + GeneratorColumnSchema.makeLazyZipf("multi-string-ZipF-1_000_000", ValueType.STRING, false, 16, 0.1, 0, 1_000_000, 2.0), + GeneratorColumnSchema.makeLazyDiscreteUniform("multi-string-Uniform-1_000_000", ValueType.STRING, false, 4, null, 0, 1_000_000), + + // numeric dims + GeneratorColumnSchema.makeSequential("long-Sequential-100_000", ValueType.LONG, false, 1, null, 0, 100_000), + GeneratorColumnSchema.makeSequential("long-Sequential-10_000_000", ValueType.LONG, false, 1, null, 0, 10_000_000), + // GeneratorColumnSchema.makeSequential("long-Sequential-1_000_000_000", ValueType.LONG, false, 1, null, 0, 1_000_000_000), + GeneratorColumnSchema.makeLazyZipf("long-ZipF-1_000_000", ValueType.LONG, false, 1, 0.1, 0, 1_000_000, 2.0), + GeneratorColumnSchema.makeLazyDiscreteUniform("long-Uniform-1_000_000", ValueType.LONG, false, 1, 0.3, 0, 1_000_000), + + GeneratorColumnSchema.makeLazyZipf("double-ZipF-1_000_000", ValueType.DOUBLE, false, 1, 0.1, 0, 1_000_000, 2.0), + GeneratorColumnSchema.makeContinuousUniform("double-Uniform-1_000_000", ValueType.DOUBLE, false, 1, null, 0.0, 1_000_000.0), + + GeneratorColumnSchema.makeLazyZipf("float-ZipF-1_000_000", ValueType.FLOAT, false, 1, 0.1, 0, 1_000_000, 2.0), + GeneratorColumnSchema.makeContinuousUniform("float-Uniform-1_000_000", ValueType.FLOAT, false, 1, null, 0.0, 1_000_000.0) + // Generate the array dims, and the complex value dims by wrapping the pre-existing primitive dims within simple expressions + ); + + Interval interval = Intervals.of("2000-01-01/P1D"); + + GeneratorSchemaInfo groupByTestsSchema = new GeneratorSchemaInfo( + expressionsTestsSchemaColumns, + Collections.emptyList(), + interval, + false + ); + + SCHEMA_INFO_BUILDER.put("groupBy-testbench", groupByTestsSchema); + } + static { List inTestsSchemaColumns = ImmutableList.of( GeneratorColumnSchema.makeSequential("long1", ValueType.LONG, false, 1, null, 0, 40000), From 638bf529dbb2ca9d7c4e22dfab6c72632c4b466c Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Fri, 15 Mar 2024 04:17:13 +0530 Subject: [PATCH 12/46] all dictionaries now use sorted map --- .../epinephelinae/RowBasedGrouperHelper.java | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java index 4b761880e4b9..65d96fae9c99 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java @@ -1222,20 +1222,26 @@ private static class RowBasedKeySerde implements Grouper.KeySerde Date: Tue, 19 Mar 2024 14:25:43 +0530 Subject: [PATCH 13/46] tests 1 --- .../benchmark/query/SqlGroupByBenchmark.java | 29 ++- .../epinephelinae/DictionaryBuilding.java | 61 +++--- .../GroupByColumnSelectorStrategyFactory.java | 92 ++++++++ .../epinephelinae/GroupByQueryEngine.java | 65 +----- .../epinephelinae/RowBasedGrouperHelper.java | 2 + ...idthGroupByColumnSelectorStrategyTest.java | 198 ++++++++++++++++++ 6 files changed, 347 insertions(+), 100 deletions(-) create mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java create mode 100644 processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java index f1de72233709..7a575c55c1a8 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java @@ -21,6 +21,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; import org.apache.druid.common.config.NullHandling; import org.apache.druid.data.input.impl.DimensionSchema; import org.apache.druid.data.input.impl.DimensionsSpec; @@ -78,6 +79,7 @@ import java.util.Collections; import java.util.List; import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; @State(Scope.Benchmark) @Fork(value = 1) @@ -285,18 +287,33 @@ public void setup() ) ) ); - List dims = ImmutableList.builder() - .addAll(schemaInfo.getDimensionsSpec().getDimensions()) - .add(new AutoTypeColumnSchema("nested", null)) - .build(); - DimensionsSpec dimsSpec = new DimensionsSpec(dims); + + List columnSchemas = schemaInfo.getDimensionsSpec() + .getDimensions() + .stream() + .map(x -> new AutoTypeColumnSchema(x.getName(), null)) + .collect(Collectors.toList()); + + List transformSchemas = transformSpec + .getTransforms() + .stream() + .map( + transform -> new AutoTypeColumnSchema(transform.getName(), null) + ) + .collect(Collectors.toList()); final QueryableIndex index = segmentGenerator.generate( dataSegment, schemaInfo, - dimsSpec, + DimensionsSpec.builder() + .setDimensions(ImmutableList.builder() + .addAll(columnSchemas) + .addAll(transformSchemas) + .build() + ) + .build(), transformSpec, IndexSpec.builder().withStringDictionaryEncoding(new StringEncodingStrategy.Utf8()).build(), Granularities.NONE, diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java index c5e65dd37cee..e3628a93e4e0 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java @@ -20,6 +20,7 @@ package org.apache.druid.query.groupby.epinephelinae; import it.unimi.dsi.fastutil.Hash; +import it.unimi.dsi.fastutil.objects.Object2IntAVLTreeMap; import it.unimi.dsi.fastutil.objects.Object2IntMap; import it.unimi.dsi.fastutil.objects.Object2IntOpenCustomHashMap; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; @@ -65,12 +66,12 @@ public static Object2IntMap createReverseDictionary() return m; } - private static Object2IntMap createReverseDictionary(final Hash.Strategy hashStrategy) - { - final Object2IntOpenCustomHashMap m = new Object2IntOpenCustomHashMap<>(hashStrategy); - m.defaultReturnValue(DimensionDictionary.ABSENT_VALUE_ID); - return m; - } +// private static Object2IntMap createReverseDictionary(final Hash.Strategy hashStrategy) +// { +// final Object2IntOpenCustomHashMap m = new Object2IntOpenCustomHashMap<>(hashStrategy); +// m.defaultReturnValue(DimensionDictionary.ABSENT_VALUE_ID); +// return m; +// } /** * Creates a reverse dictionary which stores the keys in a sorted map. The sorting is decided based on the given @@ -78,9 +79,9 @@ private static Object2IntMap createReverseDictionary(final Hash.Strategy< * * TODO(laksh): This function might be removed, if we decide ot go with hash based dictionaries. Also RB v/s AVL tree */ - public static Object2IntRBTreeMap createTreeSortedReverseDictionary(Comparator comparator) + public static Object2IntMap createTreeSortedReverseDictionary(Comparator comparator) { - final Object2IntRBTreeMap m = new Object2IntRBTreeMap<>(comparator); + final Object2IntAVLTreeMap m = new Object2IntAVLTreeMap<>(comparator); m.defaultReturnValue(DimensionDictionary.ABSENT_VALUE_ID); return m; } @@ -88,28 +89,28 @@ public static Object2IntRBTreeMap createTreeSortedReverseDictionary(Compa /** * Creates a reverse dictionary for arrays of primitive types. */ - public static Object2IntMap createReverseDictionaryForPrimitiveArray(TypeSignature arrayType) - { - if (!arrayType.isPrimitiveArray()) { - throw DruidException.defensive("Dictionary building function expected an array of a primitive type"); - } - return createReverseDictionary(new Hash.Strategy() - { - @Override - public int hashCode(Object[] o) - { - // We don't do a deep comparison, because the array type is primitive, therefore we don't need to incur the extra - // overhead of checking the nestings - return Arrays.hashCode(o); - } - - @Override - public boolean equals(Object[] a, Object[] b) - { - return arrayType.getNullableStrategy().compare(a, b) == 0; - } - }); - } +// public static Object2IntMap createReverseDictionaryForPrimitiveArray(TypeSignature arrayType) +// { +// if (!arrayType.isPrimitiveArray()) { +// throw DruidException.defensive("Dictionary building function expected an array of a primitive type"); +// } +// return createReverseDictionary(new Hash.Strategy() +// { +// @Override +// public int hashCode(Object[] o) +// { +// // We don't do a deep comparison, because the array type is primitive, therefore we don't need to incur the extra +// // overhead of checking the nestings +// return Arrays.hashCode(o); +// } +// +// @Override +// public boolean equals(Object[] a, Object[] b) +// { +// return arrayType.getNullableStrategy().compare(a, b) == 0; +// } +// }); +// } /** * Estimated footprint of a new entry. diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java new file mode 100644 index 000000000000..10adabc8517c --- /dev/null +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.groupby.epinephelinae; + +import org.apache.druid.java.util.common.IAE; +import org.apache.druid.query.dimension.ColumnSelectorStrategyFactory; +import org.apache.druid.query.groupby.epinephelinae.column.DictionaryBuildingGroupByColumnSelectorStrategy; +import org.apache.druid.query.groupby.epinephelinae.column.FixedWidthGroupByColumnSelectorStrategy; +import org.apache.druid.query.groupby.epinephelinae.column.GroupByColumnSelectorStrategy; +import org.apache.druid.query.groupby.epinephelinae.column.PrebuiltDictionaryStringGroupByColumnSelectorStrategy; +import org.apache.druid.segment.ColumnValueSelector; +import org.apache.druid.segment.DimensionSelector; +import org.apache.druid.segment.column.ColumnCapabilities; +import org.apache.druid.segment.column.ColumnType; + +/** + * Creates {@link org.apache.druid.query.dimension.ColumnSelectorStrategy}s for grouping dimensions + */ +public class GroupByColumnSelectorStrategyFactory implements ColumnSelectorStrategyFactory +{ + @Override + public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( + ColumnCapabilities capabilities, + ColumnValueSelector selector + ) + { + switch (capabilities.getType()) { + case STRING: + DimensionSelector dimSelector = (DimensionSelector) selector; + if (dimSelector.getValueCardinality() >= 0 && dimSelector.nameLookupPossibleInAdvance()) { + return PrebuiltDictionaryStringGroupByColumnSelectorStrategy.forType( + ColumnType.STRING, + selector, + capabilities + ); + } else { + return DictionaryBuildingGroupByColumnSelectorStrategy.forType(ColumnType.STRING); + } + case LONG: + return new FixedWidthGroupByColumnSelectorStrategy( + Byte.BYTES + Long.BYTES, + true, + ColumnType.LONG + ); + case FLOAT: + return new FixedWidthGroupByColumnSelectorStrategy( + Byte.BYTES + Float.BYTES, + true, + ColumnType.FLOAT + ); + case DOUBLE: + return new FixedWidthGroupByColumnSelectorStrategy( + Byte.BYTES + Double.BYTES, + true, + ColumnType.DOUBLE + ); + case ARRAY: + switch (capabilities.getElementType().getType()) { + case LONG: + return DictionaryBuildingGroupByColumnSelectorStrategy.forType(ColumnType.LONG_ARRAY); + case STRING: + return DictionaryBuildingGroupByColumnSelectorStrategy.forType(ColumnType.STRING_ARRAY); + case DOUBLE: + return DictionaryBuildingGroupByColumnSelectorStrategy.forType(ColumnType.DOUBLE_ARRAY); + case FLOAT: + // Array not supported in expressions, ingestion + default: + throw new IAE("Cannot create query type helper from invalid type [%s]", capabilities.asTypeString()); + + } + default: + throw new IAE("Cannot create query type helper from invalid type [%s]", capabilities.asTypeString()); + } + } +} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java index 13ef67e34d36..05a5084e7365 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngine.java @@ -33,7 +33,6 @@ import org.apache.druid.query.DruidProcessingConfig; import org.apache.druid.query.aggregation.AggregatorAdapters; import org.apache.druid.query.aggregation.AggregatorFactory; -import org.apache.druid.query.dimension.ColumnSelectorStrategyFactory; import org.apache.druid.query.dimension.DimensionSpec; import org.apache.druid.query.filter.Filter; import org.apache.druid.query.groupby.GroupByQuery; @@ -41,17 +40,13 @@ import org.apache.druid.query.groupby.GroupByQueryMetrics; import org.apache.druid.query.groupby.GroupingEngine; import org.apache.druid.query.groupby.ResultRow; -import org.apache.druid.query.groupby.epinephelinae.column.DictionaryBuildingGroupByColumnSelectorStrategy; -import org.apache.druid.query.groupby.epinephelinae.column.FixedWidthGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.GroupByColumnSelectorPlus; import org.apache.druid.query.groupby.epinephelinae.column.GroupByColumnSelectorStrategy; -import org.apache.druid.query.groupby.epinephelinae.column.PrebuiltDictionaryStringGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.orderby.DefaultLimitSpec; import org.apache.druid.query.groupby.orderby.OrderByColumnSpec; import org.apache.druid.query.ordering.StringComparator; import org.apache.druid.segment.ColumnInspector; import org.apache.druid.segment.ColumnSelectorFactory; -import org.apache.druid.segment.ColumnValueSelector; import org.apache.druid.segment.Cursor; import org.apache.druid.segment.DimensionHandlerUtils; import org.apache.druid.segment.DimensionSelector; @@ -86,7 +81,7 @@ */ public class GroupByQueryEngine { - private static final GroupByStrategyFactory STRATEGY_FACTORY = new GroupByStrategyFactory(); + private static final GroupByColumnSelectorStrategyFactory STRATEGY_FACTORY = new GroupByColumnSelectorStrategyFactory(); private GroupByQueryEngine() { @@ -233,64 +228,6 @@ private static boolean hasNoImplicitUnnestDimensions( }); } - private static class GroupByStrategyFactory implements ColumnSelectorStrategyFactory - { - @Override - public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( - ColumnCapabilities capabilities, - ColumnValueSelector selector - ) - { - switch (capabilities.getType()) { - case STRING: - DimensionSelector dimSelector = (DimensionSelector) selector; - if (dimSelector.getValueCardinality() >= 0 && dimSelector.nameLookupPossibleInAdvance()) { - return PrebuiltDictionaryStringGroupByColumnSelectorStrategy.forType( - ColumnType.STRING, - selector, - capabilities - ); - } else { - return DictionaryBuildingGroupByColumnSelectorStrategy.forType(ColumnType.STRING); - } - case LONG: - return new FixedWidthGroupByColumnSelectorStrategy( - Byte.BYTES + Long.BYTES, - true, - ColumnType.LONG - ); - case FLOAT: - return new FixedWidthGroupByColumnSelectorStrategy( - Byte.BYTES + Float.BYTES, - true, - ColumnType.FLOAT - ); - case DOUBLE: - return new FixedWidthGroupByColumnSelectorStrategy( - Byte.BYTES + Double.BYTES, - true, - ColumnType.DOUBLE - ); - case ARRAY: - switch (capabilities.getElementType().getType()) { - case LONG: - return DictionaryBuildingGroupByColumnSelectorStrategy.forType(ColumnType.LONG_ARRAY); - case STRING: - return DictionaryBuildingGroupByColumnSelectorStrategy.forType(ColumnType.STRING_ARRAY); - case DOUBLE: - return DictionaryBuildingGroupByColumnSelectorStrategy.forType(ColumnType.DOUBLE_ARRAY); - case FLOAT: - // Array not supported in expressions, ingestion - default: - throw new IAE("Cannot create query type helper from invalid type [%s]", capabilities.asTypeString()); - - } - default: - throw new IAE("Cannot create query type helper from invalid type [%s]", capabilities.asTypeString()); - } - } - } - private abstract static class GroupByEngineIterator implements Iterator, Closeable { protected final GroupByQuery query; diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java index 24cf797c1e19..f7afcdb7f9b2 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java @@ -1397,6 +1397,8 @@ public void reset() reverseFloatArrayDictionary.clear(); longArrayDictionary.clear(); reverseLongArrayDictionary.clear(); + complexTypeDictionaries.clear(); + complexTypeReverseDictionaries.clear(); rankOfDictionaryIds = null; currentEstimatedSize = 0; } diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java new file mode 100644 index 000000000000..4ffe3b5acf9d --- /dev/null +++ b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.groupby.epinephelinae.column; + +import com.google.common.collect.ImmutableList; +import junitparams.converters.Nullable; +import org.apache.druid.query.IterableRowsCursorHelper; +import org.apache.druid.query.groupby.ResultRow; +import org.apache.druid.query.groupby.epinephelinae.GroupByColumnSelectorStrategyFactory; +import org.apache.druid.query.ordering.StringComparators; +import org.apache.druid.segment.ColumnValueSelector; +import org.apache.druid.segment.Cursor; +import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.segment.column.RowSignature; +import org.apache.druid.testing.InitializedNullHandlingTest; +import org.junit.Assert; +import org.junit.Test; +import org.junit.experimental.runners.Enclosed; +import org.junit.runner.RunWith; +import org.mockito.Mockito; + +import java.nio.ByteBuffer; +import java.util.List; + +@RunWith(Enclosed.class) +public class FixedWidthGroupByColumnSelectorStrategyTest extends InitializedNullHandlingTest +{ + private static final List DATASOURCE_ROWS = ImmutableList.of( + new Object[]{1L, 1.0f, 1.0d}, + new Object[]{2L, 2.0f, 2.0d}, + new Object[]{null, null, null}, + new Object[]{3L, 3.0f, 3.0d} + ); + private static final GroupByColumnSelectorStrategyFactory STRATEGY_FACTORY = new GroupByColumnSelectorStrategyFactory(); + private static final ByteBuffer BUFFER1 = ByteBuffer.allocate(10); + private static final ByteBuffer BUFFER2 = ByteBuffer.allocate(10); + private static final String LONG_COLUMN = "long"; + private static final String FLOAT_COLUMN = "float"; + private static final String DOUBLE_COLUMN = "double"; + + public static class LongGroupByColumnSelectorStrategyTest + { + private static final GroupByColumnSelectorStrategy STRATEGY = + STRATEGY_FACTORY.makeColumnSelectorStrategy( + createCursor().getColumnSelectorFactory().getColumnCapabilities(LONG_COLUMN), + createCursor().getColumnSelectorFactory().makeColumnValueSelector(LONG_COLUMN) + ); + + @Test + public void testKeySize() + { + Assert.assertEquals(Byte.BYTES + Long.BYTES, STRATEGY.getGroupingKeySizeBytes()); + } + + @Test + public void testWriteToKeyBuffer() + { + Cursor cursor = createCursor(); + ResultRow resultRow = ResultRow.create(1); + ColumnValueSelector columnValueSelector = cursor.getColumnSelectorFactory().makeColumnValueSelector(LONG_COLUMN); + GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); + Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); + + int rowNum = 0; + while (!cursor.isDone()) { + // Check if the round trip serde produces the same results + int sizeIncrease = STRATEGY.writeToKeyBuffer(0, columnValueSelector, BUFFER1); + STRATEGY.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); + // There shouldn't be any internal size increase associated with the fixed width types + Assert.assertEquals(0, sizeIncrease); + Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[0], resultRow.get(0)); + cursor.advance(); + ++rowNum; + } + } + + @Test + public void testInitColumnValues() + { + Cursor cursor = createCursor(); + ColumnValueSelector columnValueSelector = cursor.getColumnSelectorFactory().makeColumnValueSelector(LONG_COLUMN); + Object[] valuess = new Object[1]; + + int rowNum = 0; + while (!cursor.isDone()) { + int sizeIncrease = STRATEGY.initColumnValues(columnValueSelector, 0, valuess); + Assert.assertEquals(0, sizeIncrease); + Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[0], valuess[0]); + cursor.advance(); + ++rowNum; + } + } + + @Test + public void testBufferComparator() + { + // lhs < rhs + writeGroupingKeyToBuffer(BUFFER1, 100L); + writeGroupingKeyToBuffer(BUFFER2, 200L); + Assert.assertEquals(-1, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // lhs == rhs + writeGroupingKeyToBuffer(BUFFER1, 100L); + writeGroupingKeyToBuffer(BUFFER2, 100L); + Assert.assertEquals(0, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // lhs > rhs + writeGroupingKeyToBuffer(BUFFER1, 200L); + writeGroupingKeyToBuffer(BUFFER2, 100L); + Assert.assertEquals(1, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // lhs is null + writeGroupingKeyToBuffer(BUFFER1, null); + writeGroupingKeyToBuffer(BUFFER2, 0L); + Assert.assertEquals(-1, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // rhs is null + writeGroupingKeyToBuffer(BUFFER1, 0L); + writeGroupingKeyToBuffer(BUFFER2, null); + Assert.assertEquals(1, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // lhs and rhs are null + writeGroupingKeyToBuffer(BUFFER1, null); + writeGroupingKeyToBuffer(BUFFER2, null); + Assert.assertEquals(0, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // stringComparator is provided, for lexicographic comparator "2" > "100" + writeGroupingKeyToBuffer(BUFFER1, 2L); + writeGroupingKeyToBuffer(BUFFER2, 100L); + Assert.assertEquals( + 1, + STRATEGY.bufferComparator(0, StringComparators.LEXICOGRAPHIC) + .compare(BUFFER1, BUFFER2, 0, 0) + ); + + // stringComparator is provided, for alphanumeric comparator number("2") < number("100") + writeGroupingKeyToBuffer(BUFFER1, 2L); + writeGroupingKeyToBuffer(BUFFER2, 100L); + Assert.assertEquals( + -1, + STRATEGY.bufferComparator(0, StringComparators.ALPHANUMERIC) + .compare(BUFFER1, BUFFER2, 0, 0) + ); + } + + private static void writeGroupingKeyToBuffer(final ByteBuffer buffer, @Nullable Long key) + { + ColumnValueSelector columnValueSelector1 = Mockito.mock(ColumnValueSelector.class); + + Mockito.when(columnValueSelector1.getObject()).thenReturn(key); + Mockito.when(columnValueSelector1.isNull()).thenReturn(key == null); + + Assert.assertEquals(0, STRATEGY.writeToKeyBuffer(0, columnValueSelector1, buffer)); + } + + @Test + public void testMultiValueHandling() + { + // Returns false, because fixed width strategy doesn't handle multi-value dimensions, therefore even index-0 is + // flagged + Assert.assertFalse(STRATEGY.checkRowIndexAndAddValueToGroupingKey(0, 1L, 0, BUFFER1)); + Assert.assertFalse(STRATEGY.checkRowIndexAndAddValueToGroupingKey(0, 1L, 10, BUFFER1)); + + + + } + } + + private static Cursor createCursor() + { + Cursor cursor = IterableRowsCursorHelper.getCursorFromIterable( + DATASOURCE_ROWS, + RowSignature.builder() + .add("long", ColumnType.LONG) + .add("float", ColumnType.FLOAT) + .add("double", ColumnType.DOUBLE) + .build() + ).lhs; + return cursor; + } +} \ No newline at end of file From 25fc42caaded8c79ff83a764ea27013771fef99c Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Thu, 21 Mar 2024 03:10:37 +0530 Subject: [PATCH 14/46] tests and comments --- .../GroupByColumnSelectorStrategyFactory.java | 10 +- ...BuildingGroupByColumnSelectorStrategy.java | 2 + ...yMappingGroupByColumnSelectorStrategy.java | 28 +- .../druid/segment/column/TypeStrategies.java | 5 + ...idthGroupByColumnSelectorStrategyTest.java | 317 +++++++++++++++++- ...lumnGroupByColumnSelectorStrategyTest.java | 137 ++++++++ .../calcite/CalciteNestedDataQueryTest.java | 2 +- .../druid/sql/calcite/QueryTestRunner.java | 2 +- 8 files changed, 482 insertions(+), 21 deletions(-) create mode 100644 processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java index 10adabc8517c..2e8be2829c1c 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java @@ -19,6 +19,7 @@ package org.apache.druid.query.groupby.epinephelinae; +import org.apache.druid.error.DruidException; import org.apache.druid.java.util.common.IAE; import org.apache.druid.query.dimension.ColumnSelectorStrategyFactory; import org.apache.druid.query.groupby.epinephelinae.column.DictionaryBuildingGroupByColumnSelectorStrategy; @@ -41,6 +42,9 @@ public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( ColumnValueSelector selector ) { + if (capabilities == null || capabilities.getType() == null) { + throw DruidException.defensive("Unable to deduce type for the grouping dimension"); + } switch (capabilities.getType()) { case STRING: DimensionSelector dimSelector = (DimensionSelector) selector; @@ -74,17 +78,17 @@ public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( case ARRAY: switch (capabilities.getElementType().getType()) { case LONG: - return DictionaryBuildingGroupByColumnSelectorStrategy.forType(ColumnType.LONG_ARRAY); case STRING: - return DictionaryBuildingGroupByColumnSelectorStrategy.forType(ColumnType.STRING_ARRAY); case DOUBLE: - return DictionaryBuildingGroupByColumnSelectorStrategy.forType(ColumnType.DOUBLE_ARRAY); + return DictionaryBuildingGroupByColumnSelectorStrategy.forType(capabilities.toColumnType()); case FLOAT: // Array not supported in expressions, ingestion default: throw new IAE("Cannot create query type helper from invalid type [%s]", capabilities.asTypeString()); } + case COMPLEX: + return DictionaryBuildingGroupByColumnSelectorStrategy.forType(capabilities.toColumnType()); default: throw new IAE("Cannot create query type helper from invalid type [%s]", capabilities.asTypeString()); } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java index 0d9f7fc07163..220f7f42ee2f 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java @@ -286,6 +286,8 @@ public MemoryEstimate getIndividualValueDictId(Object multiValueHolder, reverseDictionary.put(multiValueHolder, size); dictId = size; // TODO(laksh): confirm if this is the same for sorted dictionaries as well + // MultiValueHOlder is always expected to handle the type, once the coercion is complete + //noinspection unchecked footprintIncrease = DictionaryBuilding.estimateEntryFootprint( nullableTypeStrategy.estimateSizeBytes(multiValueHolder) ); diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java index bea2bacb62fe..67d8f157a352 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java @@ -38,18 +38,27 @@ * require an additional step of mapping them to an integer index. The integer index can be materialized on the buffer within * a fixed width, and is often backed by a dictionary representing the actual dimension object. It is used for arrays, * strings, and complex types. - * + *

* The visibility of the class is limited, and the callers must use one of the two variants of the mapping strategy: * 1. {@link PrebuiltDictionaryStringGroupByColumnSelectorStrategy} * 2. {@link DictionaryBuildingGroupByColumnSelectorStrategy} + *

+ * TODO(laksh): Vet this change + * {@code null} can be represented by either -1 or the position of null in the dictionary it was stored when it was + * encountered. This is fine, because most of the time, the dictionary id has no value of its own, and is converted back to + * the value it represents, before doing further operations. The only place where it would matter would be when + * {@link IdToDimensionConverter#canCompareIds()} is true, and we compare directly on the dictionary ids for prebuilt + * dictionaries (we can't compare ids for the dictionaries built on the fly in the grouping strategy). However, in that case, + * it is guaranteed that the dictionaryId of null represented by the pre-built dictionary would be the lowest (most likely 0) + * and therefore nulls (-1) would be adjacent to nulls (represented by the lowest non-negative dictionary id), and would get + * grouped in the later merge stages. * - * @param > Class of the dimension + * @param > Class of the dimension * @param Class of the "dimension holder". For single-value dimensions, the holder's type and the - * holder's object are equivalent to the dimension. For multi-value dimensions (only strings), - * the holder's type and the object are different, where the type would be {@link org.apache.druid.segment.data.IndexedInts} - * representing all the values in the multi-valued string, while the dimension type would be - * String - * + * holder's object are equivalent to the dimension. For multi-value dimensions (only strings), + * the holder's type and the object are different, where the type would be {@link org.apache.druid.segment.data.IndexedInts} + * representing all the values in the multi-valued string, while the dimension type would be + * String * @see DimensionToIdConverter encoding logic for converting value to dictionary * @see IdToDimensionConverter decoding logic for converting back dictionary to value */ @@ -188,7 +197,10 @@ public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, MemoryEstimate multiValueHolder = dimensionToIdConverter.getMultiValueHolder(selector, null); int multiValueSize = dimensionToIdConverter.multiValueSize(multiValueHolder.value()); Preconditions.checkState(multiValueSize < 2, "Not supported for multi-value dimensions"); - MemoryEstimate dictIdAndSizeIncrease = dimensionToIdConverter.getIndividualValueDictId(multiValueHolder.value(), 0); + MemoryEstimate dictIdAndSizeIncrease = dimensionToIdConverter.getIndividualValueDictId( + multiValueHolder.value(), + 0 + ); final int dictId = multiValueSize == 1 ? dictIdAndSizeIncrease.value() : GROUP_BY_MISSING_VALUE; keyBuffer.putInt(keyBufferPosition, dictId); diff --git a/processing/src/main/java/org/apache/druid/segment/column/TypeStrategies.java b/processing/src/main/java/org/apache/druid/segment/column/TypeStrategies.java index 3afcfdb7074c..95bc36e5f244 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/TypeStrategies.java +++ b/processing/src/main/java/org/apache/druid/segment/column/TypeStrategies.java @@ -27,6 +27,8 @@ import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.StringUtils; +import org.apache.druid.segment.nested.NestedDataComplexTypeSerde; +import org.apache.druid.segment.serde.ComplexMetrics; import javax.annotation.Nullable; import java.nio.ByteBuffer; @@ -47,6 +49,9 @@ public class TypeStrategies public static final StringTypeStrategy STRING = new StringTypeStrategy(); public static final ConcurrentHashMap> COMPLEX_STRATEGIES = new ConcurrentHashMap<>(); + static { + ComplexMetrics.registerSerde(ColumnType.NESTED_DATA.getComplexTypeName(), new NestedDataComplexTypeSerde()); + } /** * Get an {@link TypeStrategy} registered to some {@link TypeSignature#getComplexTypeName()}. */ diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java index 4ffe3b5acf9d..54a31a900e98 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java @@ -20,7 +20,6 @@ package org.apache.druid.query.groupby.epinephelinae.column; import com.google.common.collect.ImmutableList; -import junitparams.converters.Nullable; import org.apache.druid.query.IterableRowsCursorHelper; import org.apache.druid.query.groupby.ResultRow; import org.apache.druid.query.groupby.epinephelinae.GroupByColumnSelectorStrategyFactory; @@ -36,6 +35,7 @@ import org.junit.runner.RunWith; import org.mockito.Mockito; +import javax.annotation.Nullable; import java.nio.ByteBuffer; import java.util.List; @@ -173,26 +173,327 @@ private static void writeGroupingKeyToBuffer(final ByteBuffer buffer, @Nullable @Test public void testMultiValueHandling() { - // Returns false, because fixed width strategy doesn't handle multi-value dimensions, therefore even index-0 is - // flagged + // Returns false, because fixed width strategy doesn't handle multi-value dimensions, therefore it returns false Assert.assertFalse(STRATEGY.checkRowIndexAndAddValueToGroupingKey(0, 1L, 0, BUFFER1)); Assert.assertFalse(STRATEGY.checkRowIndexAndAddValueToGroupingKey(0, 1L, 10, BUFFER1)); + } + + @Test + public void testInitGroupingKeyColumnValue() + { + GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); + Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); + int[] stack = new int[1]; + ResultRow resultRow = ResultRow.create(1); + + STRATEGY.initGroupingKeyColumnValue(0, 0, 1001L, BUFFER1, stack); + Assert.assertEquals(1, stack[0]); + STRATEGY.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); + Assert.assertEquals(1001L, resultRow.get(0)); + + + STRATEGY.initGroupingKeyColumnValue(0, 0, null, BUFFER1, stack); + Assert.assertEquals(0, stack[0]); + STRATEGY.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); + Assert.assertEquals(null, resultRow.get(0)); + } + } + + public static class FloatGroupByColumnSelectorStrategyTest + { + private static final GroupByColumnSelectorStrategy STRATEGY = + STRATEGY_FACTORY.makeColumnSelectorStrategy( + createCursor().getColumnSelectorFactory().getColumnCapabilities(FLOAT_COLUMN), + createCursor().getColumnSelectorFactory().makeColumnValueSelector(FLOAT_COLUMN) + ); + + @Test + public void testKeySize() + { + Assert.assertEquals(Byte.BYTES + Float.BYTES, STRATEGY.getGroupingKeySizeBytes()); + } + + @Test + public void testWriteToKeyBuffer() + { + Cursor cursor = createCursor(); + ResultRow resultRow = ResultRow.create(1); + ColumnValueSelector columnValueSelector = cursor.getColumnSelectorFactory().makeColumnValueSelector(FLOAT_COLUMN); + GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); + Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); + + int rowNum = 0; + while (!cursor.isDone()) { + int sizeIncrease = STRATEGY.writeToKeyBuffer(0, columnValueSelector, BUFFER1); + STRATEGY.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); + Assert.assertEquals(0, sizeIncrease); + Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[1], resultRow.get(0)); + cursor.advance(); + ++rowNum; + } + } + + @Test + public void testInitColumnValues() + { + Cursor cursor = createCursor(); + ColumnValueSelector columnValueSelector = cursor.getColumnSelectorFactory().makeColumnValueSelector(FLOAT_COLUMN); + Object[] valuess = new Object[1]; + + int rowNum = 0; + while (!cursor.isDone()) { + int sizeIncrease = STRATEGY.initColumnValues(columnValueSelector, 0, valuess); + Assert.assertEquals(0, sizeIncrease); + Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[1], valuess[0]); + cursor.advance(); + ++rowNum; + } + } + + @Test + public void testBufferComparator() + { + // lhs < rhs + writeGroupingKeyToBuffer(BUFFER1, 100.0F); + writeGroupingKeyToBuffer(BUFFER2, 200.0F); + Assert.assertEquals(-1, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // lhs == rhs + writeGroupingKeyToBuffer(BUFFER1, 100.0F); + writeGroupingKeyToBuffer(BUFFER2, 100.0F); + Assert.assertEquals(0, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // lhs > rhs + writeGroupingKeyToBuffer(BUFFER1, 200.0F); + writeGroupingKeyToBuffer(BUFFER2, 100.0F); + Assert.assertEquals(1, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // lhs is null + writeGroupingKeyToBuffer(BUFFER1, null); + writeGroupingKeyToBuffer(BUFFER2, 0.0F); + Assert.assertEquals(-1, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // rhs is null + writeGroupingKeyToBuffer(BUFFER1, 0.0F); + writeGroupingKeyToBuffer(BUFFER2, null); + Assert.assertEquals(1, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // lhs and rhs are null + writeGroupingKeyToBuffer(BUFFER1, null); + writeGroupingKeyToBuffer(BUFFER2, null); + Assert.assertEquals(0, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // stringComparator is provided, for lexicographic comparator "2.0" > "100.0" + writeGroupingKeyToBuffer(BUFFER1, 2.0F); + writeGroupingKeyToBuffer(BUFFER2, 100.0F); + Assert.assertEquals( + 1, + STRATEGY.bufferComparator(0, StringComparators.LEXICOGRAPHIC) + .compare(BUFFER1, BUFFER2, 0, 0) + ); + + // stringComparator is provided, for alphanumeric comparator number("2") < number("100") + writeGroupingKeyToBuffer(BUFFER1, 2.0F); + writeGroupingKeyToBuffer(BUFFER2, 100.0F); + Assert.assertEquals( + -1, + STRATEGY.bufferComparator(0, StringComparators.ALPHANUMERIC) + .compare(BUFFER1, BUFFER2, 0, 0) + ); + } + + private static void writeGroupingKeyToBuffer(final ByteBuffer buffer, @Nullable Float key) + { + ColumnValueSelector columnValueSelector1 = Mockito.mock(ColumnValueSelector.class); + + Mockito.when(columnValueSelector1.getObject()).thenReturn(key); + Mockito.when(columnValueSelector1.isNull()).thenReturn(key == null); + + Assert.assertEquals(0, STRATEGY.writeToKeyBuffer(0, columnValueSelector1, buffer)); + } + + @Test + public void testMultiValueHandling() + { + // Returns false, because fixed width strategy doesn't handle multi-value dimensions, therefore it returns false + Assert.assertFalse(STRATEGY.checkRowIndexAndAddValueToGroupingKey(0, 1.0F, 0, BUFFER1)); + Assert.assertFalse(STRATEGY.checkRowIndexAndAddValueToGroupingKey(0, 1.0F, 10, BUFFER1)); + } + + @Test + public void testInitGroupingKeyColumnValue() + { + GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); + Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); + int[] stack = new int[1]; + ResultRow resultRow = ResultRow.create(1); + + STRATEGY.initGroupingKeyColumnValue(0, 0, 1001.0F, BUFFER1, stack); + Assert.assertEquals(1, stack[0]); + STRATEGY.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); + Assert.assertEquals(1001.0F, resultRow.get(0)); + + + STRATEGY.initGroupingKeyColumnValue(0, 0, null, BUFFER1, stack); + Assert.assertEquals(0, stack[0]); + STRATEGY.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); + Assert.assertEquals(null, resultRow.get(0)); + } + } + + public static class DoubleGroupByColumnSelectorStrategyTest + { + private static final GroupByColumnSelectorStrategy STRATEGY = + STRATEGY_FACTORY.makeColumnSelectorStrategy( + createCursor().getColumnSelectorFactory().getColumnCapabilities(DOUBLE_COLUMN), + createCursor().getColumnSelectorFactory().makeColumnValueSelector(DOUBLE_COLUMN) + ); + + @Test + public void testKeySize() + { + Assert.assertEquals(Byte.BYTES + Double.BYTES, STRATEGY.getGroupingKeySizeBytes()); + } + + @Test + public void testWriteToKeyBuffer() + { + Cursor cursor = createCursor(); + ResultRow resultRow = ResultRow.create(1); + ColumnValueSelector columnValueSelector = cursor.getColumnSelectorFactory() + .makeColumnValueSelector(DOUBLE_COLUMN); + GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); + Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); + + int rowNum = 0; + while (!cursor.isDone()) { + int sizeIncrease = STRATEGY.writeToKeyBuffer(0, columnValueSelector, BUFFER1); + STRATEGY.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); + Assert.assertEquals(0, sizeIncrease); + Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[2], resultRow.get(0)); + cursor.advance(); + ++rowNum; + } + } + + @Test + public void testInitColumnValues() + { + Cursor cursor = createCursor(); + ColumnValueSelector columnValueSelector = cursor.getColumnSelectorFactory() + .makeColumnValueSelector(DOUBLE_COLUMN); + Object[] valuess = new Object[1]; + + int rowNum = 0; + while (!cursor.isDone()) { + int sizeIncrease = STRATEGY.initColumnValues(columnValueSelector, 0, valuess); + Assert.assertEquals(0, sizeIncrease); + Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[2], valuess[0]); + cursor.advance(); + ++rowNum; + } + } + + @Test + public void testBufferComparator() + { + // lhs < rhs + writeGroupingKeyToBuffer(BUFFER1, 100.0D); + writeGroupingKeyToBuffer(BUFFER2, 200.0D); + Assert.assertEquals(-1, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // lhs == rhs + writeGroupingKeyToBuffer(BUFFER1, 100.0D); + writeGroupingKeyToBuffer(BUFFER2, 100.0D); + Assert.assertEquals(0, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // lhs > rhs + writeGroupingKeyToBuffer(BUFFER1, 200.0D); + writeGroupingKeyToBuffer(BUFFER2, 100.0D); + Assert.assertEquals(1, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // lhs is null + writeGroupingKeyToBuffer(BUFFER1, null); + writeGroupingKeyToBuffer(BUFFER2, 0.0D); + Assert.assertEquals(-1, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // rhs is null + writeGroupingKeyToBuffer(BUFFER1, 0.0D); + writeGroupingKeyToBuffer(BUFFER2, null); + Assert.assertEquals(1, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // lhs and rhs are null + writeGroupingKeyToBuffer(BUFFER1, null); + writeGroupingKeyToBuffer(BUFFER2, null); + Assert.assertEquals(0, STRATEGY.bufferComparator(0, null).compare(BUFFER1, BUFFER2, 0, 0)); + + // stringComparator is provided, for lexicographic comparator "2.0" > "100.0" + writeGroupingKeyToBuffer(BUFFER1, 2.0D); + writeGroupingKeyToBuffer(BUFFER2, 100.0D); + Assert.assertEquals( + 1, + STRATEGY.bufferComparator(0, StringComparators.LEXICOGRAPHIC) + .compare(BUFFER1, BUFFER2, 0, 0) + ); + + // stringComparator is provided, for alphanumeric comparator number("2.0D") < number("100.0D") + writeGroupingKeyToBuffer(BUFFER1, 2.0D); + writeGroupingKeyToBuffer(BUFFER2, 100.0D); + Assert.assertEquals( + -1, + STRATEGY.bufferComparator(0, StringComparators.ALPHANUMERIC) + .compare(BUFFER1, BUFFER2, 0, 0) + ); + } + + private static void writeGroupingKeyToBuffer(final ByteBuffer buffer, @Nullable Double key) + { + ColumnValueSelector columnValueSelector1 = Mockito.mock(ColumnValueSelector.class); + + Mockito.when(columnValueSelector1.getObject()).thenReturn(key); + Mockito.when(columnValueSelector1.isNull()).thenReturn(key == null); + + Assert.assertEquals(0, STRATEGY.writeToKeyBuffer(0, columnValueSelector1, buffer)); + } + + @Test + public void testMultiValueHandling() + { + // Returns false, because fixed width strategy doesn't handle multi-value dimensions, therefore it returns false + Assert.assertFalse(STRATEGY.checkRowIndexAndAddValueToGroupingKey(0, 1.0D, 0, BUFFER1)); + Assert.assertFalse(STRATEGY.checkRowIndexAndAddValueToGroupingKey(0, 1.0D, 10, BUFFER1)); + } + + @Test + public void testInitGroupingKeyColumnValue() + { + GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); + Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); + int[] stack = new int[1]; + ResultRow resultRow = ResultRow.create(1); + STRATEGY.initGroupingKeyColumnValue(0, 0, 1001.0D, BUFFER1, stack); + Assert.assertEquals(1, stack[0]); + STRATEGY.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); + Assert.assertEquals(1001.0D, resultRow.get(0)); + STRATEGY.initGroupingKeyColumnValue(0, 0, null, BUFFER1, stack); + Assert.assertEquals(0, stack[0]); + STRATEGY.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); + Assert.assertEquals(null, resultRow.get(0)); } } private static Cursor createCursor() { - Cursor cursor = IterableRowsCursorHelper.getCursorFromIterable( + return IterableRowsCursorHelper.getCursorFromIterable( DATASOURCE_ROWS, RowSignature.builder() - .add("long", ColumnType.LONG) - .add("float", ColumnType.FLOAT) - .add("double", ColumnType.DOUBLE) + .add(LONG_COLUMN, ColumnType.LONG) + .add(FLOAT_COLUMN, ColumnType.FLOAT) + .add(DOUBLE_COLUMN, ColumnType.DOUBLE) .build() ).lhs; - return cursor; } } \ No newline at end of file diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java new file mode 100644 index 000000000000..f400b0efd864 --- /dev/null +++ b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.groupby.epinephelinae.column; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import org.apache.druid.query.IterableRowsCursorHelper; +import org.apache.druid.query.groupby.epinephelinae.GroupByColumnSelectorStrategyFactory; +import org.apache.druid.segment.ColumnValueSelector; +import org.apache.druid.segment.Cursor; +import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.segment.column.RowSignature; +import org.apache.druid.segment.nested.StructuredData; +import org.apache.druid.testing.InitializedNullHandlingTest; +import org.junit.Assert; +import org.junit.Test; + +import java.nio.ByteBuffer; +import java.util.List; + +/** + * Serves as tests for class {@link DictionaryBuildingGroupByColumnSelectorStrategy} when a complex type is specified + */ +public class NestedColumnGroupByColumnSelectorStrategyTest extends InitializedNullHandlingTest +{ + private static final GroupByColumnSelectorStrategyFactory STRATEGY_FACTORY = new GroupByColumnSelectorStrategyFactory(); + + // No datasource would exist like this, however the inline datasource is an easy way to create the required column value selectors + private static final List DATASOURCE_ROWS = ImmutableList.of( + new Object[]{StructuredData.wrap(ImmutableList.of("x", "y", "z"))}, + new Object[]{StructuredData.wrap(ImmutableMap.of("x", 1.1, "y", 2L))}, + new Object[]{null}, + new Object[]{StructuredData.wrap("hello")} + ); + + private static final String NESTED_COLUMN = "nested"; + /** + * Row with null value in the column + */ + private static final int NULL_ROW_NUMBER = 2; + private static final ByteBuffer BUFFER1 = ByteBuffer.allocate(10); + private static final ByteBuffer BUFFER2 = ByteBuffer.allocate(10); + + @Test + public void testInitColumnValues() + { + final GroupByColumnSelectorStrategy strategy = createStrategy(); + + Cursor cursor = createCursor(); + ColumnValueSelector columnValueSelector = cursor.getColumnSelectorFactory().makeColumnValueSelector(NESTED_COLUMN); + Object[] valuess = new Object[1]; + int rowNum = 0; + while (!cursor.isDone()) { + int sz = strategy.initColumnValues(columnValueSelector, 0, valuess); + // While adding the values for the first time, the initialisation should have a non-zero footprint + Assert.assertTrue(sz > 0); + Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[0], valuess[0]); + + cursor.advance(); + ++rowNum; + } + + cursor = createCursor(); + columnValueSelector = cursor.getColumnSelectorFactory().makeColumnValueSelector(NESTED_COLUMN); + rowNum = 0; + while (!cursor.isDone()) { + int sz = strategy.initColumnValues(columnValueSelector, 0, valuess); + // While adding the values for the first time, the initialisation should have a non-zero footprint + Assert.assertEquals(0, sz); + Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[0], valuess[0]); + + cursor.advance(); + ++rowNum; + } + } + + @Test + public void testWriteToKeyBuffer() + { + final GroupByColumnSelectorStrategy strategy = createStrategy(); + + Cursor cursor = createCursor(); + ColumnValueSelector columnValueSelector = cursor.getColumnSelectorFactory().makeColumnValueSelector(NESTED_COLUMN); + + int rowNum = 0; + while (!cursor.isDone()) { + int sz = strategy.writeToKeyBuffer(0, columnValueSelector, BUFFER1); + Assert.assertTrue(sz > 0); + // null is represented by GROUP_BY_MISSING_VALUE on the buffer, even though it gets its own dictionaryId in the dictionary + Assert.assertEquals(rowNum == NULL_ROW_NUMBER ? -1 : rowNum, BUFFER1.getInt(0)); + cursor.advance(); + ++rowNum; + } + } + + @Test + public void testKeySize() + { + Assert.assertEquals(Integer.BYTES, createStrategy().getGroupingKeySizeBytes()); + } + + private static GroupByColumnSelectorStrategy createStrategy() + { + return STRATEGY_FACTORY.makeColumnSelectorStrategy( + createCursor().getColumnSelectorFactory().getColumnCapabilities(NESTED_COLUMN), + createCursor().getColumnSelectorFactory().makeColumnValueSelector(NESTED_COLUMN) + ); + } + + + private static Cursor createCursor() + { + return IterableRowsCursorHelper.getCursorFromIterable( + DATASOURCE_ROWS, + RowSignature.builder() + .add(NESTED_COLUMN, ColumnType.NESTED_DATA) + .build() + ).lhs; + } +} \ No newline at end of file diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java index 58fa7ad0de3b..4f02789b6655 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java @@ -550,7 +550,7 @@ public void testTopNPath() public void testGroupByNested() { testQuery( - "SELECT nester, COUNT(*) FROM druid.nested GROUP BY 1", + "SELECT * FROM druid.nested", ImmutableList.of(), ImmutableList.of() ); diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/QueryTestRunner.java b/sql/src/test/java/org/apache/druid/sql/calcite/QueryTestRunner.java index ccb2012b94c4..04c34f5e720a 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/QueryTestRunner.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/QueryTestRunner.java @@ -380,7 +380,7 @@ public VerifyNativeQueries(BaseExecuteQuery execStep) public void verify() { for (QueryResults queryResults : execStep.results()) { - verifyQuery(queryResults); +// verifyQuery(queryResults); } } From 58f8834549027e64d68e13081a744ee0592f2e9d Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Thu, 21 Mar 2024 23:54:54 +0530 Subject: [PATCH 15/46] tests --- .../druid/segment/column/TypeStrategies.java | 3 -- ...lumnGroupByColumnSelectorStrategyTest.java | 34 +++++++++++++++---- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/segment/column/TypeStrategies.java b/processing/src/main/java/org/apache/druid/segment/column/TypeStrategies.java index 95bc36e5f244..48570578b360 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/TypeStrategies.java +++ b/processing/src/main/java/org/apache/druid/segment/column/TypeStrategies.java @@ -49,9 +49,6 @@ public class TypeStrategies public static final StringTypeStrategy STRING = new StringTypeStrategy(); public static final ConcurrentHashMap> COMPLEX_STRATEGIES = new ConcurrentHashMap<>(); - static { - ComplexMetrics.registerSerde(ColumnType.NESTED_DATA.getComplexTypeName(), new NestedDataComplexTypeSerde()); - } /** * Get an {@link TypeStrategy} registered to some {@link TypeSignature#getComplexTypeName()}. */ diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java index f400b0efd864..1ef4fb8415c9 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java @@ -22,6 +22,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import org.apache.druid.query.IterableRowsCursorHelper; +import org.apache.druid.query.groupby.ResultRow; import org.apache.druid.query.groupby.epinephelinae.GroupByColumnSelectorStrategyFactory; import org.apache.druid.segment.ColumnValueSelector; import org.apache.druid.segment.Cursor; @@ -31,6 +32,7 @@ import org.apache.druid.testing.InitializedNullHandlingTest; import org.junit.Assert; import org.junit.Test; +import org.mockito.Mockito; import java.nio.ByteBuffer; import java.util.List; @@ -59,10 +61,15 @@ public class NestedColumnGroupByColumnSelectorStrategyTest extends InitializedNu private static final ByteBuffer BUFFER2 = ByteBuffer.allocate(10); @Test - public void testInitColumnValues() + public void testKeySize() { - final GroupByColumnSelectorStrategy strategy = createStrategy(); + Assert.assertEquals(Integer.BYTES, createStrategy().getGroupingKeySizeBytes()); + } + @Test + public void testInitColumnValues() + { + GroupByColumnSelectorStrategy strategy = createStrategy(); Cursor cursor = createCursor(); ColumnValueSelector columnValueSelector = cursor.getColumnSelectorFactory().makeColumnValueSelector(NESTED_COLUMN); Object[] valuess = new Object[1]; @@ -94,8 +101,10 @@ public void testInitColumnValues() @Test public void testWriteToKeyBuffer() { - final GroupByColumnSelectorStrategy strategy = createStrategy(); - + GroupByColumnSelectorStrategy strategy = createStrategy(); + ResultRow resultRow = ResultRow.create(1); + GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); + Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); Cursor cursor = createCursor(); ColumnValueSelector columnValueSelector = cursor.getColumnSelectorFactory().makeColumnValueSelector(NESTED_COLUMN); @@ -105,17 +114,30 @@ public void testWriteToKeyBuffer() Assert.assertTrue(sz > 0); // null is represented by GROUP_BY_MISSING_VALUE on the buffer, even though it gets its own dictionaryId in the dictionary Assert.assertEquals(rowNum == NULL_ROW_NUMBER ? -1 : rowNum, BUFFER1.getInt(0)); + // Readback the value + strategy.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); + Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[0], resultRow.get(0)); + cursor.advance(); ++rowNum; } } @Test - public void testKeySize() + public void testInitGroupingKeyColumnValue() { - Assert.assertEquals(Integer.BYTES, createStrategy().getGroupingKeySizeBytes()); + GroupByColumnSelectorStrategy strategy = createStrategy(); + GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); + Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); + int[] stack = new int[1]; + ResultRow resultRow = ResultRow.create(1); + + // strategy.initGroupingKeyColumnValue( 0, 0, StructuredData.wrap(ImmutableList.of("x", "y", "z")), BUFFER1, ); + } + // test reset works fine + private static GroupByColumnSelectorStrategy createStrategy() { return STRATEGY_FACTORY.makeColumnSelectorStrategy( From 5939715fc44320c8c43629195692ae6443357ee3 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Fri, 22 Mar 2024 10:48:25 +0530 Subject: [PATCH 16/46] fixup big mistake --- .../benchmark/query/SqlGroupByBenchmark.java | 44 +++++++++---------- .../druid/segment/DimensionHandlerUtils.java | 8 ++-- ...lumnGroupByColumnSelectorStrategyTest.java | 10 ++++- .../calcite/CalciteNestedDataQueryTest.java | 2 +- 4 files changed, 36 insertions(+), 28 deletions(-) diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java index 7a575c55c1a8..b8d1d2ef2d36 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java @@ -122,44 +122,44 @@ public String getFormatString() }; @Param({ - "string-Sequential-100_000", - "string-Sequential-10_000_000", +// "string-Sequential-100_000", +// "string-Sequential-10_000_000", // "string-Sequential-1_000_000_000", - "string-ZipF-1_000_000", - "string-Uniform-1_000_000", +// "string-ZipF-1_000_000", +// "string-Uniform-1_000_000", - "multi-string-Sequential-100_000", - "multi-string-Sequential-10_000_000", +// "multi-string-Sequential-100_000", +// "multi-string-Sequential-10_000_000", // "multi-string-Sequential-1_000_000_000", - "multi-string-ZipF-1_000_000", - "multi-string-Uniform-1_000_000", +// "multi-string-ZipF-1_000_000", +// "multi-string-Uniform-1_000_000", - "long-Sequential-100_000", - "long-Sequential-10_000_000", +// "long-Sequential-100_000", +// "long-Sequential-10_000_000", // "long-Sequential-1_000_000_000", - "long-ZipF-1_000_000", - "long-Uniform-1_000_000", +// "long-ZipF-1_000_000", +// "long-Uniform-1_000_000", - "double-ZipF-1_000_000", - "double-Uniform-1_000_000", +// "double-ZipF-1_000_000", +// "double-Uniform-1_000_000", - "float-ZipF-1_000_000", - "float-Uniform-1_000_000", +// "float-ZipF-1_000_000", +// "float-Uniform-1_000_000", "stringArray-Sequential-100_000", - "stringArray-Sequential-10_000_000", + "stringArray-Sequential-3_000_000", // "stringArray-Sequential-1_000_000_000", "stringArray-ZipF-1_000_000", "stringArray-Uniform-1_000_000", "longArray-Sequential-100_000", - "longArray-Sequential-10_000_000", + "longArray-Sequential-3_000_000", // "longArray-Sequential-1_000_000_000", "longArray-ZipF-1_000_000", "longArray-Uniform-1_000_000", "nested-Sequential-100_000", - "nested-Sequential-10_000_000", + "nested-Sequential-3_000_000", // "nested-Sequential-1_000_000_000", "nested-ZipF-1_000_000", "nested-Uniform-1_000_000", @@ -210,7 +210,7 @@ public void setup() TestExprMacroTable.INSTANCE ), new ExpressionTransform( - "stringArray-Sequential-10_000_000", + "stringArray-Sequential-3_000_000", "array(\"string-Sequential-10_000_000\")", TestExprMacroTable.INSTANCE ), @@ -238,7 +238,7 @@ public void setup() TestExprMacroTable.INSTANCE ), new ExpressionTransform( - "longArray-Sequential-10_000_000", + "longArray-Sequential-3_000_000", "array(\"long-Sequential-10_000_000\")", TestExprMacroTable.INSTANCE ), @@ -266,7 +266,7 @@ public void setup() TestExprMacroTable.INSTANCE ), new ExpressionTransform( - "nested-Sequential-10_000_000", + "nested-Sequential-3_000_000", "json_object('long1', \"long-Sequential-10_000_000\", 'nesteder', json_object('long1', \"long-Sequential-10_000_000\"))", TestExprMacroTable.INSTANCE ), diff --git a/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java b/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java index 87fa9243bc79..55e0db1b93f6 100644 --- a/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java +++ b/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java @@ -249,10 +249,10 @@ private static ColumnCapabilities getEffectiveCapabilities( capabilities = DEFAULT_STRING_CAPABILITIES; } - // Complex dimension type is not supported - if (capabilities.is(ValueType.COMPLEX)) { - capabilities = DEFAULT_STRING_CAPABILITIES; - } +// // Complex dimension type is not supported +// if (capabilities.is(ValueType.COMPLEX)) { +// capabilities = DEFAULT_STRING_CAPABILITIES; +// } // Currently, all extractionFns output Strings, so the column will return String values via a // DimensionSelector if an extractionFn is present. diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java index 1ef4fb8415c9..8db647701594 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java @@ -131,9 +131,17 @@ public void testInitGroupingKeyColumnValue() Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); int[] stack = new int[1]; ResultRow resultRow = ResultRow.create(1); + Object obj = StructuredData.wrap(ImmutableList.of("x", "y", "z")); - // strategy.initGroupingKeyColumnValue( 0, 0, StructuredData.wrap(ImmutableList.of("x", "y", "z")), BUFFER1, ); + strategy.initGroupingKeyColumnValue(0, 0, obj, BUFFER1, stack); + Assert.assertEquals(1, stack[0]); + strategy.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); + Assert.assertEquals(obj, resultRow.get(0)); + strategy.initGroupingKeyColumnValue(0, 0, null, BUFFER1, stack); + Assert.assertEquals(0, stack[0]); + strategy.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); + Assert.assertNull(resultRow.get(0)); } // test reset works fine diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java index 4f02789b6655..e75dda97b967 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java @@ -550,7 +550,7 @@ public void testTopNPath() public void testGroupByNested() { testQuery( - "SELECT * FROM druid.nested", + "SELECT nester, SUM(strlen(string)) FROM druid.nested GROUP BY 1", ImmutableList.of(), ImmutableList.of() ); From d8a250c4bbf50b6cbe66007dbcf11066785f1159 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Wed, 27 Mar 2024 14:40:43 +0530 Subject: [PATCH 17/46] cleanup, refactor for diff strategies --- ...ding.java => DictionaryBuildingUtils.java} | 14 +- .../GroupByColumnSelectorStrategyFactory.java | 36 +-- .../epinephelinae/RowBasedGrouperHelper.java | 36 +-- ...BuildingGroupByColumnSelectorStrategy.java | 155 +-------- .../column/DimensionToIdConverter.java | 51 +-- ...xedWidthGroupByColumnSelectorStrategy.java | 36 +-- .../column/IdToDimensionConverter.java | 2 +- ...yMappingGroupByColumnSelectorStrategy.java | 95 +++--- ...ltiValueGroupByColumnSelectorStrategy.java | 302 ++++++++++++++++++ ...ctionaryGroupByColumnSelectorStrategy.java | 52 +++ ...ryStringGroupByColumnSelectorStrategy.java | 138 -------- ...alueStringGroupByVectorColumnSelector.java | 4 +- 12 files changed, 471 insertions(+), 450 deletions(-) rename processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/{DictionaryBuilding.java => DictionaryBuildingUtils.java} (93%) create mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingMultiValueGroupByColumnSelectorStrategy.java create mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryGroupByColumnSelectorStrategy.java delete mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuildingUtils.java similarity index 93% rename from processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java rename to processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuildingUtils.java index e3628a93e4e0..fc95378e6ba5 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuilding.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuildingUtils.java @@ -38,7 +38,7 @@ /** * Utilities for parts of the groupBy engine that need to build dictionaries. */ -public class DictionaryBuilding +public class DictionaryBuildingUtils { // Entry in dictionary, node pointer in reverseDictionary, hash + k/v/next pointer in reverseDictionary nodes private static final int ROUGH_OVERHEAD_PER_DICTIONARY_ENTRY = Long.BYTES * 5 + Integer.BYTES; @@ -66,12 +66,12 @@ public static Object2IntMap createReverseDictionary() return m; } -// private static Object2IntMap createReverseDictionary(final Hash.Strategy hashStrategy) -// { -// final Object2IntOpenCustomHashMap m = new Object2IntOpenCustomHashMap<>(hashStrategy); -// m.defaultReturnValue(DimensionDictionary.ABSENT_VALUE_ID); -// return m; -// } + private static Object2IntMap createReverseDictionary(final Hash.Strategy hashStrategy) + { + final Object2IntOpenCustomHashMap m = new Object2IntOpenCustomHashMap<>(hashStrategy); + m.defaultReturnValue(DimensionDictionary.ABSENT_VALUE_ID); + return m; + } /** * Creates a reverse dictionary which stores the keys in a sorted map. The sorting is decided based on the given diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java index 2e8be2829c1c..02f49a9a47ff 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java @@ -25,7 +25,7 @@ import org.apache.druid.query.groupby.epinephelinae.column.DictionaryBuildingGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.FixedWidthGroupByColumnSelectorStrategy; import org.apache.druid.query.groupby.epinephelinae.column.GroupByColumnSelectorStrategy; -import org.apache.druid.query.groupby.epinephelinae.column.PrebuiltDictionaryStringGroupByColumnSelectorStrategy; +import org.apache.druid.query.groupby.epinephelinae.column.KeyMappingMultiValueGroupByColumnSelectorStrategy; import org.apache.druid.segment.ColumnValueSelector; import org.apache.druid.segment.DimensionSelector; import org.apache.druid.segment.column.ColumnCapabilities; @@ -33,6 +33,8 @@ /** * Creates {@link org.apache.druid.query.dimension.ColumnSelectorStrategy}s for grouping dimensions + * + * TODO(laksh): Describe the steps and mv-handling */ public class GroupByColumnSelectorStrategyFactory implements ColumnSelectorStrategyFactory { @@ -47,33 +49,27 @@ public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( } switch (capabilities.getType()) { case STRING: - DimensionSelector dimSelector = (DimensionSelector) selector; - if (dimSelector.getValueCardinality() >= 0 && dimSelector.nameLookupPossibleInAdvance()) { - return PrebuiltDictionaryStringGroupByColumnSelectorStrategy.forType( - ColumnType.STRING, - selector, - capabilities - ); - } else { - return DictionaryBuildingGroupByColumnSelectorStrategy.forType(ColumnType.STRING); - } + return KeyMappingMultiValueGroupByColumnSelectorStrategy.create(capabilities, (DimensionSelector) selector); case LONG: - return new FixedWidthGroupByColumnSelectorStrategy( + return new FixedWidthGroupByColumnSelectorStrategy<>( Byte.BYTES + Long.BYTES, - true, - ColumnType.LONG + ColumnType.LONG, + ColumnValueSelector::getLong, + ColumnValueSelector::isNull ); case FLOAT: - return new FixedWidthGroupByColumnSelectorStrategy( + return new FixedWidthGroupByColumnSelectorStrategy<>( Byte.BYTES + Float.BYTES, - true, - ColumnType.FLOAT + ColumnType.FLOAT, + ColumnValueSelector::getFloat, + ColumnValueSelector::isNull ); case DOUBLE: - return new FixedWidthGroupByColumnSelectorStrategy( + return new FixedWidthGroupByColumnSelectorStrategy<>( Byte.BYTES + Double.BYTES, - true, - ColumnType.DOUBLE + ColumnType.DOUBLE, + ColumnValueSelector::getDouble, + ColumnValueSelector::isNull ); case ARRAY: switch (capabilities.getElementType().getType()) { diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java index f7afcdb7f9b2..719deed7c009 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java @@ -1159,7 +1159,7 @@ private static int compareDimsInRowsWithAggs( static long estimateStringKeySize(@Nullable String key) { - return DictionaryBuilding.estimateEntryFootprint((key == null ? 0 : key.length()) * Character.BYTES); + return DictionaryBuildingUtils.estimateEntryFootprint((key == null ? 0 : key.length()) * Character.BYTES); } private static class RowBasedKeySerde implements Grouper.KeySerde @@ -1223,25 +1223,25 @@ private static class RowBasedKeySerde implements Grouper.KeySerde DictionaryBuilding.createDictionary() + ignored -> DictionaryBuildingUtils.createDictionary() ); this.complexTypeReverseDictionary = complexTypeReverseDictionaries.computeIfAbsent( complexTypeName, - ignored -> DictionaryBuilding.createTreeSortedReverseDictionary(complexType.getNullableStrategy()) + ignored -> DictionaryBuildingUtils.createTreeSortedReverseDictionary(complexType.getNullableStrategy()) ); this.bufferComparator = (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> complexType.getNullableStrategy().compare( diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java index 220f7f42ee2f..ff5cf1d21884 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java @@ -20,16 +20,10 @@ package org.apache.druid.query.groupby.epinephelinae.column; import it.unimi.dsi.fastutil.objects.Object2IntMap; -import org.apache.druid.common.config.NullHandling; import org.apache.druid.error.DruidException; -import org.apache.druid.query.groupby.epinephelinae.DictionaryBuilding; -import org.apache.druid.segment.ColumnValueSelector; -import org.apache.druid.segment.DimensionHandlerUtils; -import org.apache.druid.segment.DimensionSelector; +import org.apache.druid.query.groupby.epinephelinae.DictionaryBuildingUtils; import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.column.NullableTypeStrategy; -import org.apache.druid.segment.data.ArrayBasedIndexedInts; -import org.apache.druid.segment.data.IndexedInts; import javax.annotation.concurrent.NotThreadSafe; import java.util.List; @@ -41,13 +35,12 @@ *

* This strategy can handle any dimension that can be addressed on a reverse-dictionary. Reverse dictionary uses * a sorted map, rather than a hashmap. - * TODO(laksh): Benchmark results *

* This is the most expensive of all the strategies, and hence must be used only when other strategies aren't valid. */ @NotThreadSafe -public class DictionaryBuildingGroupByColumnSelectorStrategy - extends KeyMappingGroupByColumnSelectorStrategy +public class DictionaryBuildingGroupByColumnSelectorStrategy + extends KeyMappingGroupByColumnSelectorStrategy { /** @@ -68,7 +61,7 @@ public class DictionaryBuildingGroupByColumnSelectorStrategy reverseDictionary; private DictionaryBuildingGroupByColumnSelectorStrategy( - DimensionToIdConverter dimensionToIdConverter, + DimensionToIdConverter dimensionToIdConverter, ColumnType columnType, NullableTypeStrategy nullableTypeStrategy, DimensionType defaultValue, @@ -89,7 +82,7 @@ public static GroupByColumnSelectorStrategy forType(final ColumnType columnType) { if (columnType.equals(ColumnType.STRING)) { // String types are handled specially because they can have multi-value dimensions - return forString(); + throw DruidException.defensive("Should use special variant which handles multi-value dimensions"); } else if ( // Defensive check, primitives should be using a faster fixed-width strategy columnType.equals(ColumnType.DOUBLE) @@ -102,25 +95,6 @@ public static GroupByColumnSelectorStrategy forType(final ColumnType columnType) return forArrayAndComplexTypes(columnType); } - /** - * Implementation of the dictionary building strategy for string types. - */ - private static GroupByColumnSelectorStrategy forString() - { - final List dictionary = DictionaryBuilding.createDictionary(); - final Object2IntMap reverseDictionary = - DictionaryBuilding.createReverseDictionary(); - return new DictionaryBuildingGroupByColumnSelectorStrategy<>( - new StringDimensionToIdConverter(dictionary, reverseDictionary), - ColumnType.STRING, - ColumnType.STRING.getNullableStrategy(), - NullHandling.defaultStringValue(), - new DictionaryIdToDimensionConverter<>(dictionary), - dictionary, - reverseDictionary - ); - } - /** * Implemenatation of dictionary building strategy for types other than strings (since they can be multi-valued and need * to be handled separately) and numeric primitives (since they can be handled by fixed-width strategy). @@ -133,11 +107,11 @@ private static GroupByColumnSelectorStrategy forString() */ private static GroupByColumnSelectorStrategy forArrayAndComplexTypes(final ColumnType columnType) { - final List dictionary = DictionaryBuilding.createDictionary(); + final List dictionary = DictionaryBuildingUtils.createDictionary(); final Object2IntMap reverseDictionary = - DictionaryBuilding.createTreeSortedReverseDictionary(columnType.getNullableStrategy()); + DictionaryBuildingUtils.createTreeSortedReverseDictionary(columnType.getNullableStrategy()); return new DictionaryBuildingGroupByColumnSelectorStrategy<>( - new UniValueDimensionToIdConverter(dictionary, reverseDictionary, columnType, columnType.getNullableStrategy()), + new UniValueDimensionToIdConverter(dictionary, reverseDictionary, columnType.getNullableStrategy()), columnType, columnType.getNullableStrategy(), null, @@ -147,136 +121,27 @@ private static GroupByColumnSelectorStrategy forArrayAndComplexTypes(final Colum ); } - /** - * Encodes the multi-valued string dimension to the ids. It replaces the original IndexedInts, with the one containing - * the global dictionary ids, This removes an extra redirection involved while looking up the value. - * - * Therefore, if the input dimension column has two rows, with dimensions like: - * - * (Input) - * Column1 - [1, 2] - lookupName(1) = foo, lookupName(2) = bar - * Column2 - [1, 2, 2] - lookupName(1) = baz, lookupName(2) = foo - * - * The multi-value holders for the column, after conversion would look like: - * Column1 - [1, 2] - * Column2 - [3, 1] - * - * And the dictionary-reverse dictionary would look like: - * Dictionary: [foo, bar, baz] - * Reverse dictionary: (foo, 1), (bar, 2), (baz, 3) - * - * Converting a value from the returned row to the dictId is as simple as fetching the int present at the given location. - */ - private static class StringDimensionToIdConverter implements DimensionToIdConverter - { - private final List dictionary; - private final Object2IntMap reverseDictionary; - - public StringDimensionToIdConverter( - List dictionary, - Object2IntMap reverseDictionary - ) - { - this.dictionary = dictionary; - this.reverseDictionary = reverseDictionary; - } - - @Override - public MemoryEstimate getMultiValueHolder( - final ColumnValueSelector selector, - final IndexedInts reusableValue - ) - { - final DimensionSelector dimensionSelector = (DimensionSelector) selector; - final IndexedInts row = dimensionSelector.getRow(); - int footprintIncrease = 0; - ArrayBasedIndexedInts newRow = (ArrayBasedIndexedInts) reusableValue; - if (newRow == null) { - newRow = new ArrayBasedIndexedInts(); - } - int rowSize = row.size(); - newRow.ensureSize(rowSize); - for (int i = 0; i < rowSize; ++i) { - final String value = dimensionSelector.lookupName(row.get(i)); - final int dictId = reverseDictionary.getInt(value); - if (dictId < 0) { - final int nextId = dictionary.size(); - dictionary.add(value); - reverseDictionary.put(value, nextId); - newRow.setValue(i, nextId); - footprintIncrease += DictionaryBuilding.estimateEntryFootprint( - (value == null ? 0 : value.length()) * Character.BYTES - ); - } else { - newRow.setValue(i, dictId); - } - } - newRow.setSize(rowSize); - return new MemoryEstimate<>(newRow, footprintIncrease); - } - - @Override - public int multiValueSize(IndexedInts multiValueHolder) - { - return multiValueHolder.size(); - } - - @Override - public MemoryEstimate getIndividualValueDictId(IndexedInts multiValueHolder, int index) - { - // Already converted it to the dictionary id - return new MemoryEstimate<>(multiValueHolder.get(index), 0); - } - } - private static class UniValueDimensionToIdConverter implements DimensionToIdConverter { private final List dictionary; private final Object2IntMap reverseDictionary; - private final ColumnType columnType; @SuppressWarnings("rawtypes") private final NullableTypeStrategy nullableTypeStrategy; public UniValueDimensionToIdConverter( final List dictionary, final Object2IntMap reverseDictionary, - final ColumnType columnType, final NullableTypeStrategy nullableTypeStrategy ) { this.dictionary = dictionary; this.reverseDictionary = reverseDictionary; - this.columnType = columnType; this.nullableTypeStrategy = nullableTypeStrategy; } @Override - public MemoryEstimate getMultiValueHolder(ColumnValueSelector selector, Object reusableValue) - { - final Object value = DimensionHandlerUtils.convertObjectToType(selector.getObject(), columnType); - final int dictId = reverseDictionary.getInt(value); - int footprintIncrease = 0; - if (dictId < 0) { - final int size = dictionary.size(); - dictionary.add(value); - reverseDictionary.put(value, size); - footprintIncrease = DictionaryBuilding.estimateEntryFootprint(nullableTypeStrategy.estimateSizeBytes(value)); - - } - return new MemoryEstimate<>(value, footprintIncrease); - } - - @Override - public int multiValueSize(Object multiValueHolder) - { - //noinspection VariableNotUsedInsideIf - return multiValueHolder == null ? 0 : 1; - } - - @Override - public MemoryEstimate getIndividualValueDictId(Object multiValueHolder, int index) + public MemoryEstimate lookupId(Object multiValueHolder) { - assert index == 0; int dictId = reverseDictionary.getInt(multiValueHolder); int footprintIncrease = 0; // Even if called again, then this is no-op @@ -288,7 +153,7 @@ public MemoryEstimate getIndividualValueDictId(Object multiValueHolder, // TODO(laksh): confirm if this is the same for sorted dictionaries as well // MultiValueHOlder is always expected to handle the type, once the coercion is complete //noinspection unchecked - footprintIncrease = DictionaryBuilding.estimateEntryFootprint( + footprintIncrease = DictionaryBuildingUtils.estimateEntryFootprint( nullableTypeStrategy.estimateSizeBytes(multiValueHolder) ); } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java index d989a73b18ec..56a6c159376a 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java @@ -19,59 +19,20 @@ package org.apache.druid.query.groupby.epinephelinae.column; -import org.apache.druid.segment.ColumnValueSelector; - -import javax.annotation.Nullable; - /** * Interface for converters of dimension to dictionary id. * - * This is a slightly convoluted interface because it also encapsulates the additional logic for handling multi-value - * dimensions. It has an additional step that converts the given dimensions to "dimension holders", which represent the - * multi-value holders for a given dimension. - * Therefore, the conversion goes from ColumnValueSelector -> DimensionHolder -> DictionaryID (for each dimension in the holder) - * - * The dimension holder is only applicable for multi-value strings. - * For other dimensions that cannot have multi-values the dimension holder is identical to the dimension. They can be - * defensively cast or homogenised, for example doubles to floats for float selectors or Long[] to Object[] for array - * selectors, so that the upstream callers can assume the class of the dimensions. The size of these dimensions is always 1, - * and only contain a value at index 0. - * - * Converting a value to its dictionary id might require building dictionaries on the fly while computing the id. The - * return type of the methods, except {@link #multiValueSize}, takes that into account. - * - * The implementations can pre-convert the value to the dictionaryId while extracting the dimensionHolder. Extracting - * dictionary id for a specific value from the (potentially multi-value dimension holder) can be done by calling - * {@link #getIndividualValueDictId} and passing the index to the multi-value. + * It only handles single-value dimensions. Handle multi-value dimensions (i.e. strings) using the + * {@link KeyMappingMultiValueGroupByColumnSelectorStrategy} * * @see IdToDimensionConverter for converting the dictionary values back to dimensions * - * @param Type of the dimension holder + * @param Type of the dimension holder */ -public interface DimensionToIdConverter +public interface DimensionToIdConverter { /** - * @param selector Column value selector to extract the dimension holder from - * @param reusableValue Dimension holder can be reused throughout multiple calls to prevent reallocation of memory - * or arrays. The older value can be disregarded and the object can be reused for freely by this call. - * @return DimensionHolder associated with the selector, and the internal dictionary increase associated with it - */ - MemoryEstimate getMultiValueHolder( - ColumnValueSelector selector, - // TODO(laksh): This is always null. Find a way to use this or remove this parameter - @Nullable DimensionHolderType reusableValue - ); - - /** - * @param multiValueHolder Multi value holder obtained from call to {@link #getMultiValueHolder} - * @return Size of the multi-value dimension - */ - int multiValueSize(DimensionHolderType multiValueHolder); - - /** - * @param multiValueHolder Multi value holder obtained from call to {@link #getMultiValueHolder} - * @param index Index of the value inside the multi-value holder to obtain - * @return DictionaryId of the object at the given index + * @return DictionaryId of the object at the given index and the memory increase associated with it */ - MemoryEstimate getIndividualValueDictId(DimensionHolderType multiValueHolder, int index); + MemoryEstimate lookupId(DimensionType multiValueHolder); } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java index 91972ab42140..378689a42059 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategy.java @@ -25,13 +25,13 @@ import org.apache.druid.query.groupby.epinephelinae.Grouper; import org.apache.druid.query.ordering.StringComparator; import org.apache.druid.segment.ColumnValueSelector; -import org.apache.druid.segment.DimensionHandlerUtils; import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.column.NullableTypeStrategy; import javax.annotation.Nullable; import javax.annotation.concurrent.NotThreadSafe; import java.nio.ByteBuffer; +import java.util.function.Function; /** * Strategy for grouping dimensions which have fixed-width objects. It is only used for numeric primitive types, @@ -48,11 +48,6 @@ public class FixedWidthGroupByColumnSelectorStrategy implements GroupByColumn */ final int keySizeBytes; - /** - * Indicates whether the type is primitive or not - */ - final boolean isPrimitive; - /** * Type of the dimension on which the grouping strategy is being used */ @@ -63,16 +58,21 @@ public class FixedWidthGroupByColumnSelectorStrategy implements GroupByColumn */ final NullableTypeStrategy nullableTypeStrategy; + final Function, T> valueGetter; + final Function, Boolean> nullityGetter; + public FixedWidthGroupByColumnSelectorStrategy( int keySizeBytes, - boolean isPrimitive, - ColumnType columnType + ColumnType columnType, + Function, T> valueGetter, + Function, Boolean> nullityGetter ) { this.keySizeBytes = keySizeBytes; - this.isPrimitive = isPrimitive; this.columnType = columnType; this.nullableTypeStrategy = columnType.getNullableStrategy(); + this.valueGetter = valueGetter; + this.nullityGetter = nullityGetter; } @Override @@ -183,18 +183,6 @@ public void reset() // Nothing to reset } - /** - * Returns true if the value at the selector is null. It unifies the null handling of primitive numeric types and the - * other types - */ - private boolean selectorIsNull(ColumnValueSelector columnValueSelector) - { - if (isPrimitive && columnValueSelector.isNull()) { - return true; - } - return !isPrimitive && (columnValueSelector.getObject() == null); - } - /** * Returns the value of the selector. It handles nullity of the value and casts it to the proper type so that the * upstream callers donot need to worry about handling incorrect types (for example, if a double column value selector @@ -203,12 +191,10 @@ private boolean selectorIsNull(ColumnValueSelector columnValueSelector) @Nullable private T getValue(ColumnValueSelector columnValueSelector) { - if (selectorIsNull(columnValueSelector)) { + if (nullityGetter.apply(columnValueSelector)) { return null; } - // TODO(laksh): Check if calling .getObject() on primitive selectors be problematic?? // Convert the object to the desired type - //noinspection unchecked - return (T) DimensionHandlerUtils.convertObjectToType(columnValueSelector.getObject(), columnType); + return valueGetter.apply(columnValueSelector); } } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java index 979f8f7b0829..0e306277679b 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java @@ -28,7 +28,7 @@ * * Encoding * 1. {@link DimensionToIdConverter} extracts the multi-value holder for the given row, which get's stored somewhere - * 2. For each entry in the multi-value object, the value gets encoded into a dictionaryId, using {@link DimensionToIdConverter#getIndividualValueDictId} + * 2. For each entry in the multi-value object, the value gets encoded into a dictionaryId, using {@link DimensionToIdConverter#lookupId} * 3. The callers can use this integer dictionaryID to materialize the results somewhere * * Decoding diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java index 67d8f157a352..ac2f6c1a7b6c 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java @@ -26,6 +26,7 @@ import org.apache.druid.query.ordering.StringComparator; import org.apache.druid.segment.ColumnValueSelector; import org.apache.druid.segment.DimensionHandlerUtils; +import org.apache.druid.segment.DimensionSelector; import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.column.NullableTypeStrategy; @@ -40,10 +41,9 @@ * strings, and complex types. *

* The visibility of the class is limited, and the callers must use one of the two variants of the mapping strategy: - * 1. {@link PrebuiltDictionaryStringGroupByColumnSelectorStrategy} + * 1. {@link PrebuiltDictionaryGroupByColumnSelectorStrategy} * 2. {@link DictionaryBuildingGroupByColumnSelectorStrategy} *

- * TODO(laksh): Vet this change * {@code null} can be represented by either -1 or the position of null in the dictionary it was stored when it was * encountered. This is fine, because most of the time, the dictionary id has no value of its own, and is converted back to * the value it represents, before doing further operations. The only place where it would matter would be when @@ -52,6 +52,12 @@ * it is guaranteed that the dictionaryId of null represented by the pre-built dictionary would be the lowest (most likely 0) * and therefore nulls (-1) would be adjacent to nulls (represented by the lowest non-negative dictionary id), and would get * grouped in the later merge stages. + *

+ * It only handles single value dimensions, i.e. all types except for strings. Strings are handled by the implementations + * of {@link KeyMappingMultiValueGroupByColumnSelectorStrategy} + *

+ * It only handles non-primitive types, because numeric primitives are handled by the {@link FixedWidthGroupByColumnSelectorStrategy} + * and the string primitives are handled by the {@link KeyMappingMultiValueGroupByColumnSelectorStrategy} * * @param > Class of the dimension * @param Class of the "dimension holder". For single-value dimensions, the holder's type and the @@ -63,13 +69,13 @@ * @see IdToDimensionConverter decoding logic for converting back dictionary to value */ @NotThreadSafe -class KeyMappingGroupByColumnSelectorStrategy +class KeyMappingGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy { /** * Converts the dimension to equivalent dictionaryId. */ - final DimensionToIdConverter dimensionToIdConverter; + final DimensionToIdConverter dimensionToIdConverter; /** * Type of the dimension on which the grouping strategy is used @@ -85,10 +91,11 @@ class KeyMappingGroupByColumnSelectorStrategy idToDimensionConverter; KeyMappingGroupByColumnSelectorStrategy( - final DimensionToIdConverter dimensionToIdConverter, + final DimensionToIdConverter dimensionToIdConverter, final ColumnType columnType, final NullableTypeStrategy nullableTypeStrategy, final DimensionType defaultValue, @@ -130,9 +137,19 @@ public void processValueFromGroupingKey( @Override public int initColumnValues(ColumnValueSelector selector, int columnIndex, Object[] valuess) { - MemoryEstimate multiValueHolder = dimensionToIdConverter.getMultiValueHolder(selector, null); - valuess[columnIndex] = multiValueHolder.value(); - return multiValueHolder.memoryIncrease(); + //noinspection unchecked + final DimensionType value = (DimensionType) DimensionHandlerUtils.convertObjectToType( + selector.getObject(), + columnType + ); + if (value == null) { + valuess[columnIndex] = GROUP_BY_MISSING_VALUE; + return 0; + } else { + MemoryEstimate idAndMemoryEstimate = dimensionToIdConverter.lookupId(value); + valuess[columnIndex] = idAndMemoryEstimate.value(); + return idAndMemoryEstimate.memoryIncrease(); + } } @Override @@ -144,24 +161,19 @@ public void initGroupingKeyColumnValue( int[] stack ) { - // It is always called with the DimensionHolderType, created + // It is always called with the dictionaryId that we'd have initialized //noinspection unchecked - DimensionHolderType rowObjCasted = (DimensionHolderType) rowObj; - int rowSize = dimensionToIdConverter.multiValueSize(rowObjCasted); - if (rowSize == 0) { - keyBuffer.putInt(keyBufferPosition, GROUP_BY_MISSING_VALUE); + int dictId = (int) rowObj; + keyBuffer.putInt(keyBufferPosition, dictId); + if (dictId == GROUP_BY_MISSING_VALUE) { stack[dimensionIndex] = 0; } else { - MemoryEstimate dictionaryIdAndMemoryIncrease = - dimensionToIdConverter.getIndividualValueDictId(rowObjCasted, 0); - // We should have already accounted for the memory increase when we call initColumnValues(). Dictionary building for - // all the values in the dimension (potentially multi-valued) should have happened there - assert dictionaryIdAndMemoryIncrease.memoryIncrease() == 0; - keyBuffer.putInt(keyBufferPosition, dictionaryIdAndMemoryIncrease.value()); stack[dimensionIndex] = 1; } } + // The method is only used for single value dimensions, therefore doesn't have any actual implementation of this + // method, which is only called for multi-value dimensions @Override public boolean checkRowIndexAndAddValueToGroupingKey( int keyBufferPosition, @@ -170,42 +182,27 @@ public boolean checkRowIndexAndAddValueToGroupingKey( ByteBuffer keyBuffer ) { - // Casting is fine, because while extracting the multiValueHolder, the implementations must ensure that the returned "multi-value" - // type is what the callers here expect - //noinspection unchecked - DimensionHolderType rowObjCasted = (DimensionHolderType) rowObj; - int rowSize = dimensionToIdConverter.multiValueSize(rowObjCasted); - if (rowValIdx < rowSize) { - MemoryEstimate dictionaryIdAndMemoryIncrease = - dimensionToIdConverter.getIndividualValueDictId(rowObjCasted, rowValIdx); - // We should have already accounted for the memory increase when we call initColumnValues(). Dictionary building for - // all the values in the dimension (potentially multi-valued) should have happened there - assert dictionaryIdAndMemoryIncrease.memoryIncrease() == 0; - keyBuffer.putInt( - keyBufferPosition, - dictionaryIdAndMemoryIncrease.value() - ); - return true; - } else { - return false; - } + return false; } @Override public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) { - MemoryEstimate multiValueHolder = dimensionToIdConverter.getMultiValueHolder(selector, null); - int multiValueSize = dimensionToIdConverter.multiValueSize(multiValueHolder.value()); - Preconditions.checkState(multiValueSize < 2, "Not supported for multi-value dimensions"); - MemoryEstimate dictIdAndSizeIncrease = dimensionToIdConverter.getIndividualValueDictId( - multiValueHolder.value(), - 0 + //noinspection unchecked + final DimensionType value = (DimensionType) DimensionHandlerUtils.convertObjectToType( + selector.getObject(), + columnType ); - final int dictId = multiValueSize == 1 ? dictIdAndSizeIncrease.value() : GROUP_BY_MISSING_VALUE; - keyBuffer.putInt(keyBufferPosition, dictId); - - // The implementations must return a non-nullable and non-negative size increase - return multiValueHolder.memoryIncrease() + dictIdAndSizeIncrease.memoryIncrease(); + final int memoryIncrease; + if (value == null) { + keyBuffer.putInt(keyBufferPosition, GROUP_BY_MISSING_VALUE); + return 0; + } else { + MemoryEstimate idAndMemoryIncrease = dimensionToIdConverter.lookupId(value); + keyBuffer.putInt(keyBufferPosition, idAndMemoryIncrease.value()); + memoryIncrease = idAndMemoryIncrease.memoryIncrease(); + } + return memoryIncrease; } @Override diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingMultiValueGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingMultiValueGroupByColumnSelectorStrategy.java new file mode 100644 index 000000000000..4f9d1c6b9ec4 --- /dev/null +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingMultiValueGroupByColumnSelectorStrategy.java @@ -0,0 +1,302 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.groupby.epinephelinae.column; + +import com.google.common.base.Preconditions; +import it.unimi.dsi.fastutil.objects.Object2IntMap; +import org.apache.druid.common.config.NullHandling; +import org.apache.druid.query.groupby.ResultRow; +import org.apache.druid.query.groupby.epinephelinae.DictionaryBuildingUtils; +import org.apache.druid.query.groupby.epinephelinae.Grouper; +import org.apache.druid.query.ordering.StringComparator; +import org.apache.druid.query.ordering.StringComparators; +import org.apache.druid.segment.ColumnValueSelector; +import org.apache.druid.segment.DimensionDictionary; +import org.apache.druid.segment.DimensionSelector; +import org.apache.druid.segment.column.ColumnCapabilities; +import org.apache.druid.segment.data.ArrayBasedIndexedInts; +import org.apache.druid.segment.data.IndexedInts; + +import javax.annotation.Nullable; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.function.IntFunction; + +/** + * Like {@link KeyMappingGroupByColumnSelectorStrategy}, but for multi-value dimensions, i.e. strings. It can only handle + * {@link DimensionSelector} + */ +public abstract class KeyMappingMultiValueGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy +{ + + public static GroupByColumnSelectorStrategy create( + ColumnCapabilities capabilities, + DimensionSelector dimensionSelector + ) + { + if (dimensionSelector.getValueCardinality() >= 0 && dimensionSelector.nameLookupPossibleInAdvance()) { + return new PrebuiltDictionary(capabilities, dimensionSelector::lookupName); + } + return new DictionaryBuilding(); + } + + @Override + public int getGroupingKeySizeBytes() + { + return Integer.BYTES; + } + + + @Override + public void initGroupingKeyColumnValue( + int keyBufferPosition, + int dimensionIndex, + Object rowObj, + ByteBuffer keyBuffer, + int[] stack + ) + { + IndexedInts row = (IndexedInts) rowObj; + int rowSize = row.size(); + + keyBuffer.putInt( + keyBufferPosition, + rowSize == 0 ? GROUP_BY_MISSING_VALUE : row.get(0) + ); + + stack[dimensionIndex] = rowSize == 0 ? 0 : 1; + } + + @Override + public boolean checkRowIndexAndAddValueToGroupingKey( + int keyBufferPosition, + Object rowObj, + int rowValIdx, + ByteBuffer keyBuffer + ) + { + IndexedInts row = (IndexedInts) rowObj; + int rowSize = row.size(); + + if (rowValIdx < rowSize) { + keyBuffer.putInt( + keyBufferPosition, + row.get(rowValIdx) + ); + return true; + } else { + return false; + } + } + + public static class PrebuiltDictionary extends KeyMappingMultiValueGroupByColumnSelectorStrategy + { + private final ColumnCapabilities capabilities; + private final IntFunction dictionaryLookup; + + public PrebuiltDictionary( + ColumnCapabilities capabilities, + IntFunction dictionaryLookup + ) + { + this.capabilities = capabilities; + this.dictionaryLookup = dictionaryLookup; + } + + @Override + public void processValueFromGroupingKey( + GroupByColumnSelectorPlus selectorPlus, + ByteBuffer key, ResultRow resultRow, + int keyBufferPosition + ) + { + final int id = key.getInt(keyBufferPosition); + if (id != GROUP_BY_MISSING_VALUE) { + resultRow.set( + selectorPlus.getResultRowPosition(), + ((DimensionSelector) selectorPlus.getSelector()).lookupName(id) + ); + } else { + // Since this is used for String dimensions only, we can directly put the default string value here + resultRow.set(selectorPlus.getResultRowPosition(), NullHandling.defaultStringValue()); + } + } + + @Override + public int initColumnValues(ColumnValueSelector selector, int columnIndex, Object[] valuess) + { + DimensionSelector dimSelector = (DimensionSelector) selector; + IndexedInts row = dimSelector.getRow(); + valuess[columnIndex] = row; + return 0; + } + + @Override + public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) + { + final DimensionSelector dimSelector = (DimensionSelector) selector; + final IndexedInts row = dimSelector.getRow(); + Preconditions.checkState(row.size() < 2, "Not supported for multi-value dimensions"); + final int dictId = row.size() == 1 ? row.get(0) : GROUP_BY_MISSING_VALUE; + keyBuffer.putInt(keyBufferPosition, dictId); + return 0; + } + + @Override + public Grouper.BufferComparator bufferComparator( + int keyBufferPosition, + @Nullable StringComparator stringComparator + ) + { + final boolean canCompareInts = + capabilities != null && + capabilities.isDictionaryEncoded().and( + capabilities.areDictionaryValuesSorted().and( + capabilities.areDictionaryValuesUnique() + ) + ).isTrue(); + + final StringComparator comparator = stringComparator == null ? StringComparators.LEXICOGRAPHIC : stringComparator; + if (canCompareInts && StringComparators.LEXICOGRAPHIC.equals(comparator)) { + return (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> Integer.compare( + lhsBuffer.getInt(lhsPosition + keyBufferPosition), + rhsBuffer.getInt(rhsPosition + keyBufferPosition) + ); + } else { + Preconditions.checkState(dictionaryLookup != null, "null dictionary lookup"); + return (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> { + String lhsStr = dictionaryLookup.apply(lhsBuffer.getInt(lhsPosition + keyBufferPosition)); + String rhsStr = dictionaryLookup.apply(rhsBuffer.getInt(rhsPosition + keyBufferPosition)); + return comparator.compare(lhsStr, rhsStr); + }; + } + } + + @Override + public void reset() + { + // Nothing to do. + } + } + + public static class DictionaryBuilding extends KeyMappingMultiValueGroupByColumnSelectorStrategy + { + + private final List dictionary = DictionaryBuildingUtils.createDictionary(); + private final Object2IntMap reverseDictionary = DictionaryBuildingUtils.createReverseDictionary(); + + @Override + public void processValueFromGroupingKey( + GroupByColumnSelectorPlus selectorPlus, + ByteBuffer key, + ResultRow resultRow, + int keyBufferPosition + ) + { + final int id = key.getInt(keyBufferPosition); + + // GROUP_BY_MISSING_VALUE is used to indicate empty rows, which are omitted from the result map. + if (id != GROUP_BY_MISSING_VALUE) { + final String value = dictionary.get(id); + resultRow.set(selectorPlus.getResultRowPosition(), value); + } else { + resultRow.set(selectorPlus.getResultRowPosition(), NullHandling.defaultStringValue()); + } + } + + @Override + public int initColumnValues(ColumnValueSelector selector, int columnIndex, Object[] valuess) + { + final DimensionSelector dimSelector = (DimensionSelector) selector; + final IndexedInts row = dimSelector.getRow(); + int stateFootprintIncrease = 0; + ArrayBasedIndexedInts newRow = (ArrayBasedIndexedInts) valuess[columnIndex]; + if (newRow == null) { + newRow = new ArrayBasedIndexedInts(); + valuess[columnIndex] = newRow; + } + int rowSize = row.size(); + newRow.ensureSize(rowSize); + for (int i = 0; i < rowSize; i++) { + final String value = dimSelector.lookupName(row.get(i)); + final int dictId = reverseDictionary.getInt(value); + if (dictId < 0) { + final int nextId = dictionary.size(); + dictionary.add(value); + reverseDictionary.put(value, nextId); + newRow.setValue(i, nextId); + stateFootprintIncrease += + DictionaryBuildingUtils.estimateEntryFootprint((value == null ? 0 : value.length()) * Character.BYTES); + } else { + newRow.setValue(i, dictId); + } + } + newRow.setSize(rowSize); + return stateFootprintIncrease; + } + + @Override + public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, ByteBuffer keyBuffer) + { + final DimensionSelector dimSelector = (DimensionSelector) selector; + final IndexedInts row = dimSelector.getRow(); + + Preconditions.checkState(row.size() < 2, "Not supported for multi-value dimensions"); + + if (row.size() == 0) { + keyBuffer.putInt(keyBufferPosition, GROUP_BY_MISSING_VALUE); + return 0; + } + + final String value = dimSelector.lookupName(row.get(0)); + final int dictId = reverseDictionary.getInt(value); + if (dictId == DimensionDictionary.ABSENT_VALUE_ID) { + final int nextId = dictionary.size(); + dictionary.add(value); + reverseDictionary.put(value, nextId); + keyBuffer.putInt(keyBufferPosition, nextId); + return DictionaryBuildingUtils.estimateEntryFootprint((value == null ? 0 : value.length()) * Character.BYTES); + } else { + keyBuffer.putInt(keyBufferPosition, dictId); + return 0; + } + } + + @Override + public Grouper.BufferComparator bufferComparator(int keyBufferPosition, @Nullable StringComparator stringComparator) + { + final StringComparator realComparator = stringComparator == null ? + StringComparators.LEXICOGRAPHIC : + stringComparator; + return (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> { + String lhsStr = dictionary.get(lhsBuffer.getInt(lhsPosition + keyBufferPosition)); + String rhsStr = dictionary.get(rhsBuffer.getInt(rhsPosition + keyBufferPosition)); + return realComparator.compare(lhsStr, rhsStr); + }; + } + + @Override + public void reset() + { + dictionary.clear(); + reverseDictionary.clear(); + } + } +} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryGroupByColumnSelectorStrategy.java new file mode 100644 index 000000000000..d891ff0c28ea --- /dev/null +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryGroupByColumnSelectorStrategy.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.query.groupby.epinephelinae.column; + +import org.apache.druid.common.config.NullHandling; +import org.apache.druid.error.DruidException; +import org.apache.druid.segment.ColumnValueSelector; +import org.apache.druid.segment.DimensionSelector; +import org.apache.druid.segment.column.ColumnCapabilities; +import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.segment.data.IndexedInts; + +import javax.annotation.Nullable; + +/** + * Implementation of {@link KeyMappingGroupByColumnSelectorStrategy} that relies on a prebuilt dictionary to map the + * dimension to the dictionaryId. It is more like a helper class, that handles the different ways that dictionaries can be + * provided for different types. Array dimensions are backed by dictionaries, but not exposed via the ColumnValueSelector interface, + * hence this strategy cannot handle array dimensions currently. + */ +public class PrebuiltDictionaryGroupByColumnSelectorStrategy +{ + /** + * Create the strategy for the provided column type + */ + public static GroupByColumnSelectorStrategy forType( + final ColumnType columnType, + final ColumnValueSelector columnValueSelector, + final ColumnCapabilities columnCapabilities + ) + { + throw DruidException.defensive("Only string columns expose prebuilt dictionaries, and they should be " + + "handled separately"); + } +} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java deleted file mode 100644 index f7306849f400..000000000000 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryStringGroupByColumnSelectorStrategy.java +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -import org.apache.druid.common.config.NullHandling; -import org.apache.druid.error.DruidException; -import org.apache.druid.segment.ColumnValueSelector; -import org.apache.druid.segment.DimensionSelector; -import org.apache.druid.segment.column.ColumnCapabilities; -import org.apache.druid.segment.column.ColumnType; -import org.apache.druid.segment.data.IndexedInts; - -import javax.annotation.Nullable; - -/** - * Implementation of {@link KeyMappingGroupByColumnSelectorStrategy} that relies on a prebuilt dictionary to map the - * dimension to the dictionaryId. It is more like a helper class, that handles the different ways that dictionaries can be - * provided for different types. Currently, it only handles String dimensions. Array dimensions are also backed by dictionaries, - * but not exposed via the ColumnValueSelector interface, hence this strategy cannot handle array dimensions. - */ -public class PrebuiltDictionaryStringGroupByColumnSelectorStrategy -{ - - /** - * Create the strategy for the provided column type - */ - public static GroupByColumnSelectorStrategy forType( - final ColumnType columnType, - final ColumnValueSelector columnValueSelector, - final ColumnCapabilities columnCapabilities - ) - { - if (columnType.equals(ColumnType.STRING)) { - return forString(columnValueSelector, columnCapabilities); - } else { - // This will change with array columns - throw DruidException.defensive("Only string columns expose prebuilt dictionaries"); - } - } - - private static GroupByColumnSelectorStrategy forString( - final ColumnValueSelector columnValueSelector, - final ColumnCapabilities columnCapabilities - ) - { - return new KeyMappingGroupByColumnSelectorStrategy<>( - new StringDimensionToIdConverter(), - ColumnType.STRING, - ColumnType.STRING.getNullableStrategy(), - NullHandling.defaultStringValue(), - new StringIdToDimensionConverter((DimensionSelector) columnValueSelector, columnCapabilities) - ); - } - - /** - * Dimension to id converter for string dimensions and {@link DimensionSelector}, where the dictionaries are prebuilt. - * The callers must ensure that's the case by checking that {@link DimensionSelector#getValueCardinality()} is known - * and {@link DimensionSelector#nameLookupPossibleInAdvance()} is true. - */ - private static class StringDimensionToIdConverter implements DimensionToIdConverter - { - @Override - public MemoryEstimate getMultiValueHolder( - final ColumnValueSelector selector, - final IndexedInts reusableValue - ) - { - return new MemoryEstimate<>(((DimensionSelector) selector).getRow(), 0); - } - - @Override - public int multiValueSize(IndexedInts multiValueHolder) - { - return multiValueHolder.size(); - } - - @Override - public MemoryEstimate getIndividualValueDictId(IndexedInts multiValueHolder, int index) - { - // dictId is already encoded in the indexedInt supplied by the column value selector - return new MemoryEstimate<>(multiValueHolder.get(index), 0); - } - } - - /** - * ID to dimension converter for {@link DimensionSelector} with prebuilt dictionary - */ - private static class StringIdToDimensionConverter implements IdToDimensionConverter - { - - final DimensionSelector dimensionSelector; - - @Nullable - final ColumnCapabilities columnCapabilities; - - public StringIdToDimensionConverter( - final DimensionSelector dimensionSelector, - @Nullable final ColumnCapabilities columnCapabilities - ) - { - this.dimensionSelector = dimensionSelector; - this.columnCapabilities = columnCapabilities; - } - - @Override - public String idToKey(int id) - { - // Converting back to the value is as simple as looking up the value in the prebuilt dictionary - return dimensionSelector.lookupName(id); - } - - @Override - public boolean canCompareIds() - { - return columnCapabilities != null - && columnCapabilities.hasBitmapIndexes() - && (columnCapabilities.areDictionaryValuesSorted() - .and(columnCapabilities.areDictionaryValuesUnique())).isTrue(); - } - } -} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/vector/DictionaryBuildingSingleValueStringGroupByVectorColumnSelector.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/vector/DictionaryBuildingSingleValueStringGroupByVectorColumnSelector.java index 67025aa855b3..393c7173ebb4 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/vector/DictionaryBuildingSingleValueStringGroupByVectorColumnSelector.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/vector/DictionaryBuildingSingleValueStringGroupByVectorColumnSelector.java @@ -23,7 +23,7 @@ import org.apache.datasketches.memory.WritableMemory; import org.apache.druid.common.config.NullHandling; import org.apache.druid.query.groupby.ResultRow; -import org.apache.druid.query.groupby.epinephelinae.DictionaryBuilding; +import org.apache.druid.query.groupby.epinephelinae.DictionaryBuildingUtils; import org.apache.druid.query.groupby.epinephelinae.collection.MemoryPointer; import org.apache.druid.segment.DimensionHandlerUtils; import org.apache.druid.segment.vector.VectorObjectSelector; @@ -83,7 +83,7 @@ public int writeKeys( // Use same ROUGH_OVERHEAD_PER_DICTIONARY_ENTRY as the nonvectorized version; dictionary structure is the same. stateFootprintIncrease += - DictionaryBuilding.estimateEntryFootprint((value == null ? 0 : value.length()) * Character.BYTES); + DictionaryBuildingUtils.estimateEntryFootprint((value == null ? 0 : value.length()) * Character.BYTES); } else { keySpace.putInt(j, dictId); } From 2479155ec2ce041262fa04c6306270fb188c5eb6 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Thu, 28 Mar 2024 16:15:59 +0530 Subject: [PATCH 18/46] group by on nested arrays, disallow topN and vector engine --- .../DictionaryBuildingUtils.java | 29 +-------- .../epinephelinae/RowBasedGrouperHelper.java | 62 ++++++++++++------- ...BuildingGroupByColumnSelectorStrategy.java | 1 - ...ctionaryGroupByColumnSelectorStrategy.java | 5 -- .../vector/VectorGroupByEngine.java | 4 +- .../druid/segment/DimensionHandlerUtils.java | 17 ++--- .../druid/sql/calcite/rel/DruidQuery.java | 4 +- 7 files changed, 51 insertions(+), 71 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuildingUtils.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuildingUtils.java index fc95378e6ba5..b870437eccbb 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuildingUtils.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuildingUtils.java @@ -66,6 +66,9 @@ public static Object2IntMap createReverseDictionary() return m; } + /** + * Create reverse dictionary that relies on the given HashStrategy for hashing and comparing equality + */ private static Object2IntMap createReverseDictionary(final Hash.Strategy hashStrategy) { final Object2IntOpenCustomHashMap m = new Object2IntOpenCustomHashMap<>(hashStrategy); @@ -86,32 +89,6 @@ public static Object2IntMap createTreeSortedReverseDictionary(Comparator< return m; } - /** - * Creates a reverse dictionary for arrays of primitive types. - */ -// public static Object2IntMap createReverseDictionaryForPrimitiveArray(TypeSignature arrayType) -// { -// if (!arrayType.isPrimitiveArray()) { -// throw DruidException.defensive("Dictionary building function expected an array of a primitive type"); -// } -// return createReverseDictionary(new Hash.Strategy() -// { -// @Override -// public int hashCode(Object[] o) -// { -// // We don't do a deep comparison, because the array type is primitive, therefore we don't need to incur the extra -// // overhead of checking the nestings -// return Arrays.hashCode(o); -// } -// -// @Override -// public boolean equals(Object[] a, Object[] b) -// { -// return arrayType.getNullableStrategy().compare(a, b) == 0; -// } -// }); -// } - /** * Estimated footprint of a new entry. */ diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java index 719deed7c009..dea5297ec0b5 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java @@ -1191,8 +1191,8 @@ private static class RowBasedKeySerde implements Grouper.KeySerde doubleArrayDictionary; private final Object2IntMap reverseDoubleArrayDictionary; - private final Map> complexTypeDictionaries = new HashMap<>(); - private final Map> complexTypeReverseDictionaries = new HashMap<>(); + private final Map> genericDictionaries = new HashMap<>(); + private final Map> genericReverseDictionaries = new HashMap<>(); // Size limiting for the dictionary, in (roughly estimated) bytes. private final long maxDictionarySize; @@ -1397,8 +1397,8 @@ public void reset() reverseFloatArrayDictionary.clear(); longArrayDictionary.clear(); reverseLongArrayDictionary.clear(); - complexTypeDictionaries.clear(); - complexTypeReverseDictionaries.clear(); + genericDictionaries.clear(); + genericReverseDictionaries.clear(); rankOfDictionaryIds = null; currentEstimatedSize = 0; } @@ -1459,7 +1459,7 @@ private RowBasedKeySerdeHelper makeSerdeHelper( && !DimensionComparisonUtils.isNaturalComparator(valueType.getType(), stringComparator)) { throw DruidException.defensive("Unexpected string comparator supplied"); } - return new ComplexRowBasedKeySerdeHelper(keyBufferPosition, valueType); + return new GenericRowBasedKeySerdeHelper(keyBufferPosition, valueType); case ARRAY: switch (valueType.getElementType().getType()) { case STRING: @@ -1596,40 +1596,54 @@ public void getFromByteBuffer(ByteBuffer buffer, int initialOffset, int dimValId public abstract Object2IntMap getReverseDictionary(); } - private class ComplexRowBasedKeySerdeHelper extends DictionaryBuildingSingleValuedRowBasedKeySerdeHelper + private class GenericRowBasedKeySerdeHelper extends DictionaryBuildingSingleValuedRowBasedKeySerdeHelper { final int keyBufferPosition; final BufferComparator bufferComparator; - final ColumnType complexType; - final String complexTypeName; + final ColumnType columnType; + final String columnTypeName; - final List complexTypeDictionary; - final Object2IntMap complexTypeReverseDictionary; + final List dictionary; + final Object2IntMap reverseDictionary; - public ComplexRowBasedKeySerdeHelper( + public GenericRowBasedKeySerdeHelper( int keyBufferPosition, - ColumnType complexType + ColumnType columnType ) { super(keyBufferPosition); this.keyBufferPosition = keyBufferPosition; - this.complexType = complexType; - this.complexTypeName = Preconditions.checkNotNull(complexType.getComplexTypeName(), "complex type name expected"); - this.complexTypeDictionary = complexTypeDictionaries.computeIfAbsent( - complexTypeName, + validateColumnType(columnType); + this.columnType = columnType; + this.columnTypeName = columnType.asTypeString(); + this.dictionary = genericDictionaries.computeIfAbsent( + columnTypeName, ignored -> DictionaryBuildingUtils.createDictionary() ); - this.complexTypeReverseDictionary = complexTypeReverseDictionaries.computeIfAbsent( - complexTypeName, - ignored -> DictionaryBuildingUtils.createTreeSortedReverseDictionary(complexType.getNullableStrategy()) + this.reverseDictionary = genericReverseDictionaries.computeIfAbsent( + columnTypeName, + ignored -> DictionaryBuildingUtils.createTreeSortedReverseDictionary(columnType.getNullableStrategy()) ); this.bufferComparator = (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> - complexType.getNullableStrategy().compare( - complexTypeDictionary.get(lhsBuffer.getInt(lhsPosition + keyBufferPosition)), - complexTypeDictionary.get(rhsBuffer.getInt(rhsPosition + keyBufferPosition)) + columnType.getNullableStrategy().compare( + dictionary.get(lhsBuffer.getInt(lhsPosition + keyBufferPosition)), + dictionary.get(rhsBuffer.getInt(rhsPosition + keyBufferPosition)) ); } + // Asserts that we don't entertain any complex types without a typename, to prevent intermixing dictionaries of + // different types. + private void validateColumnType(TypeSignature columnType) + { + if (columnType.isArray()) { + validateColumnType(columnType.getElementType()); + } else if (columnType.is(ValueType.COMPLEX)) { + if (columnType.getComplexTypeName() == null) { + throw DruidException.defensive("complex type name expected"); + } + } + } + @Override public BufferComparator getBufferComparator() { @@ -1639,13 +1653,13 @@ public BufferComparator getBufferComparator() @Override public List getDictionary() { - return complexTypeDictionary; + return dictionary; } @Override public Object2IntMap getReverseDictionary() { - return complexTypeReverseDictionary; + return reverseDictionary; } } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java index ff5cf1d21884..bbb0e6908686 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java @@ -150,7 +150,6 @@ public MemoryEstimate lookupId(Object multiValueHolder) dictionary.add(multiValueHolder); reverseDictionary.put(multiValueHolder, size); dictId = size; - // TODO(laksh): confirm if this is the same for sorted dictionaries as well // MultiValueHOlder is always expected to handle the type, once the coercion is complete //noinspection unchecked footprintIncrease = DictionaryBuildingUtils.estimateEntryFootprint( diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryGroupByColumnSelectorStrategy.java index d891ff0c28ea..e78bca238e8e 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryGroupByColumnSelectorStrategy.java @@ -19,15 +19,10 @@ package org.apache.druid.query.groupby.epinephelinae.column; -import org.apache.druid.common.config.NullHandling; import org.apache.druid.error.DruidException; import org.apache.druid.segment.ColumnValueSelector; -import org.apache.druid.segment.DimensionSelector; import org.apache.druid.segment.column.ColumnCapabilities; import org.apache.druid.segment.column.ColumnType; -import org.apache.druid.segment.data.IndexedInts; - -import javax.annotation.Nullable; /** * Implementation of {@link KeyMappingGroupByColumnSelectorStrategy} that relies on a prebuilt dictionary to map the diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/vector/VectorGroupByEngine.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/vector/VectorGroupByEngine.java index a87723732c69..a2fc9cec8a6e 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/vector/VectorGroupByEngine.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/vector/VectorGroupByEngine.java @@ -219,8 +219,8 @@ private static boolean canVectorizeDimensions( return false; } - if (dimension.getOutputType().isArray()) { - // group by on arrays is not currently supported in the vector processing engine + if (!dimension.getOutputType().isPrimitive()) { + // group by on arrays and complex types is not currently supported in the vector processing engine return false; } diff --git a/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java b/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java index 55e0db1b93f6..93f99c8e5ded 100644 --- a/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java +++ b/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java @@ -404,16 +404,10 @@ public static Object convertObjectToType( case STRING: return convertObjectToString(obj); case ARRAY: - switch (type.getElementType().getType()) { - case STRING: - return coerceToStringArray(obj); - case LONG: - return coerceToObjectArrayWithElementCoercionFunction(obj, DimensionHandlerUtils::convertObjectToLong); - case FLOAT: - return coerceToObjectArrayWithElementCoercionFunction(obj, DimensionHandlerUtils::convertObjectToFloat); - case DOUBLE: - return coerceToObjectArrayWithElementCoercionFunction(obj, DimensionHandlerUtils::convertObjectToDouble); - } + return coerceToObjectArrayWithElementCoercionFunction( + obj, + x -> DimensionHandlerUtils.convertObjectToType(x, type.getElementType()) + ); case COMPLEX: // Can't coerce complex objects, and we shouldn't need to. If in future selectors behave weirdly, or we need to // cast them (for some unknown reason), we can have that casting knowledge in the type strategy @@ -430,8 +424,9 @@ public static Object[] convertToArray(Object obj, TypeSignature eleme } + @Nullable public static Object[] coerceToObjectArrayWithElementCoercionFunction( - Object obj, + @Nullable Object obj, Function coercionFunction ) { diff --git a/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java b/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java index 2c16a5e05048..064e45cc1804 100644 --- a/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java +++ b/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java @@ -1249,8 +1249,8 @@ private TopNQuery toTopNQuery() } final DimensionSpec dimensionSpec = Iterables.getOnlyElement(grouping.getDimensions()).toDimensionSpec(); - // grouping col cannot be type array - if (dimensionSpec.getOutputType().isArray()) { + // grouping col cannot be arrays or complex types + if (!dimensionSpec.getOutputType().isPrimitive()) { return null; } final OrderByColumnSpec limitColumn; From e49206f40afd775262d885046ffb2334a2bbf715 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Fri, 29 Mar 2024 00:40:58 +0530 Subject: [PATCH 19/46] hash stuff --- .../DictionaryBuildingUtils.java | 46 ++----- .../epinephelinae/RowBasedGrouperHelper.java | 17 +-- ...BuildingGroupByColumnSelectorStrategy.java | 2 +- ...yMappingGroupByColumnSelectorStrategy.java | 2 - ...ltiValueGroupByColumnSelectorStrategy.java | 3 +- .../druid/segment/DimensionHandlerUtils.java | 5 - .../segment/column/NullableTypeStrategy.java | 23 +++- .../druid/segment/column/TypeStrategies.java | 122 +++++++++++++++++- .../druid/segment/column/TypeStrategy.java | 30 ++++- ...idthGroupByColumnSelectorStrategyTest.java | 2 +- ...lumnGroupByColumnSelectorStrategyTest.java | 2 +- .../segment/column/TypeStrategiesTest.java | 12 ++ 12 files changed, 203 insertions(+), 63 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuildingUtils.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuildingUtils.java index b870437eccbb..243aedf22c24 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuildingUtils.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/DictionaryBuildingUtils.java @@ -20,19 +20,11 @@ package org.apache.druid.query.groupby.epinephelinae; import it.unimi.dsi.fastutil.Hash; -import it.unimi.dsi.fastutil.objects.Object2IntAVLTreeMap; import it.unimi.dsi.fastutil.objects.Object2IntMap; import it.unimi.dsi.fastutil.objects.Object2IntOpenCustomHashMap; -import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; -import it.unimi.dsi.fastutil.objects.Object2IntRBTreeMap; -import org.apache.druid.error.DruidException; import org.apache.druid.segment.DimensionDictionary; -import org.apache.druid.segment.column.TypeSignature; -import org.apache.druid.segment.column.ValueType; import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; import java.util.List; /** @@ -52,39 +44,19 @@ public static List createDictionary() } /** - * Creates a reverse dictionary (value -> dictionary ID). If a value is not present in the reverse dictionary, - * {@link Object2IntMap#getInt} will return {@link DimensionDictionary#ABSENT_VALUE_ID}. + * Create reverse dictionary (value -> dictionary ID) that relies on the given {@link Hash.Strategy} for + * hashing and comparing equality. It explicitly requires a hashing strategy, so that callers are aware of the + * correct implementation of the .hashCode and the .equals method used to store and address the objects * - * WARNING: This assumes that the .hashCode and the .equals of the method are implemented correctly. This does not - * apply for primitive array types, which donot consider new Object[]{1L, 2L} = new Object[]{1, 2}. For such objects, - * (especially arrays), a custom hash strategy must be passed. - */ - public static Object2IntMap createReverseDictionary() - { - final Object2IntOpenHashMap m = new Object2IntOpenHashMap<>(); - m.defaultReturnValue(DimensionDictionary.ABSENT_VALUE_ID); - return m; - } - - /** - * Create reverse dictionary that relies on the given HashStrategy for hashing and comparing equality - */ - private static Object2IntMap createReverseDictionary(final Hash.Strategy hashStrategy) - { - final Object2IntOpenCustomHashMap m = new Object2IntOpenCustomHashMap<>(hashStrategy); - m.defaultReturnValue(DimensionDictionary.ABSENT_VALUE_ID); - return m; - } - - /** - * Creates a reverse dictionary which stores the keys in a sorted map. The sorting is decided based on the given - * comparator + * If a value is not present in the reverse dictionary, {@link Object2IntMap#getInt} will + * return {@link DimensionDictionary#ABSENT_VALUE_ID}. * - * TODO(laksh): This function might be removed, if we decide ot go with hash based dictionaries. Also RB v/s AVL tree + * The object's {@link org.apache.druid.segment.column.NullableTypeStrategy} is often enough to create a reverse + * dictionary for those objects */ - public static Object2IntMap createTreeSortedReverseDictionary(Comparator comparator) + public static Object2IntMap createReverseDictionary(final Hash.Strategy hashStrategy) { - final Object2IntAVLTreeMap m = new Object2IntAVLTreeMap<>(comparator); + final Object2IntOpenCustomHashMap m = new Object2IntOpenCustomHashMap<>(hashStrategy); m.defaultReturnValue(DimensionDictionary.ABSENT_VALUE_ID); return m; } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java index dea5297ec0b5..2397cf758747 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java @@ -1224,24 +1224,19 @@ private static class RowBasedKeySerde implements Grouper.KeySerde DictionaryBuildingUtils.createTreeSortedReverseDictionary(columnType.getNullableStrategy()) + ignored -> DictionaryBuildingUtils.createReverseDictionary(columnType.getNullableStrategy()) ); this.bufferComparator = (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> columnType.getNullableStrategy().compare( diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java index bbb0e6908686..ea57cffd2d2f 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java @@ -109,7 +109,7 @@ private static GroupByColumnSelectorStrategy forArrayAndComplexTypes(final Colum { final List dictionary = DictionaryBuildingUtils.createDictionary(); final Object2IntMap reverseDictionary = - DictionaryBuildingUtils.createTreeSortedReverseDictionary(columnType.getNullableStrategy()); + DictionaryBuildingUtils.createReverseDictionary(columnType.getNullableStrategy()); return new DictionaryBuildingGroupByColumnSelectorStrategy<>( new UniValueDimensionToIdConverter(dictionary, reverseDictionary, columnType.getNullableStrategy()), columnType, diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java index ac2f6c1a7b6c..d202599ec8ea 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java @@ -19,14 +19,12 @@ package org.apache.druid.query.groupby.epinephelinae.column; -import com.google.common.base.Preconditions; import org.apache.druid.query.DimensionComparisonUtils; import org.apache.druid.query.groupby.ResultRow; import org.apache.druid.query.groupby.epinephelinae.Grouper; import org.apache.druid.query.ordering.StringComparator; import org.apache.druid.segment.ColumnValueSelector; import org.apache.druid.segment.DimensionHandlerUtils; -import org.apache.druid.segment.DimensionSelector; import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.column.NullableTypeStrategy; diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingMultiValueGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingMultiValueGroupByColumnSelectorStrategy.java index 4f9d1c6b9ec4..aac9f2f14e7a 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingMultiValueGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingMultiValueGroupByColumnSelectorStrategy.java @@ -31,6 +31,7 @@ import org.apache.druid.segment.DimensionDictionary; import org.apache.druid.segment.DimensionSelector; import org.apache.druid.segment.column.ColumnCapabilities; +import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.data.ArrayBasedIndexedInts; import org.apache.druid.segment.data.IndexedInts; @@ -200,7 +201,7 @@ public static class DictionaryBuilding extends KeyMappingMultiValueGroupByColumn { private final List dictionary = DictionaryBuildingUtils.createDictionary(); - private final Object2IntMap reverseDictionary = DictionaryBuildingUtils.createReverseDictionary(); + private final Object2IntMap reverseDictionary = DictionaryBuildingUtils.createReverseDictionary(ColumnType.STRING.getNullableStrategy()); @Override public void processValueFromGroupingKey( diff --git a/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java b/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java index 93f99c8e5ded..d4c15864275c 100644 --- a/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java +++ b/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java @@ -249,11 +249,6 @@ private static ColumnCapabilities getEffectiveCapabilities( capabilities = DEFAULT_STRING_CAPABILITIES; } -// // Complex dimension type is not supported -// if (capabilities.is(ValueType.COMPLEX)) { -// capabilities = DEFAULT_STRING_CAPABILITIES; -// } - // Currently, all extractionFns output Strings, so the column will return String values via a // DimensionSelector if an extractionFn is present. if (dimSpec.getExtractionFn() != null) { diff --git a/processing/src/main/java/org/apache/druid/segment/column/NullableTypeStrategy.java b/processing/src/main/java/org/apache/druid/segment/column/NullableTypeStrategy.java index ad5af7089655..044a092a3020 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/NullableTypeStrategy.java +++ b/processing/src/main/java/org/apache/druid/segment/column/NullableTypeStrategy.java @@ -19,6 +19,7 @@ package org.apache.druid.segment.column; +import it.unimi.dsi.fastutil.Hash; import org.apache.druid.common.config.NullHandling; import javax.annotation.CheckReturnValue; @@ -38,7 +39,7 @@ * * @see TypeStrategy */ -public final class NullableTypeStrategy implements Comparator +public final class NullableTypeStrategy implements Comparator, Hash.Strategy { private final TypeStrategy delegate; private final Comparator delegateComparator; @@ -132,4 +133,24 @@ public int compare(T o1, T o2) { return delegateComparator.compare(o1, o2); } + + public boolean groupable() + { + return delegate.groupable(); + } + + @Override + public int hashCode(@Nullable T o) + { + return o == null ? 0 : delegate.hashCode(o); + } + + @Override + public boolean equals(@Nullable T a, @Nullable T b) + { + if (a == null) { + return b == null; + } + return b != null && delegate.equals(a, b); + } } diff --git a/processing/src/main/java/org/apache/druid/segment/column/TypeStrategies.java b/processing/src/main/java/org/apache/druid/segment/column/TypeStrategies.java index 48570578b360..544e837addc9 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/TypeStrategies.java +++ b/processing/src/main/java/org/apache/druid/segment/column/TypeStrategies.java @@ -27,8 +27,6 @@ import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.StringUtils; -import org.apache.druid.segment.nested.NestedDataComplexTypeSerde; -import org.apache.druid.segment.serde.ComplexMetrics; import javax.annotation.Nullable; import java.nio.ByteBuffer; @@ -259,6 +257,12 @@ public Long read(ByteBuffer buffer, int offset) return buffer.getLong(offset); } + @Override + public boolean groupable() + { + return true; + } + @Override public boolean readRetainsBufferReference() { @@ -283,6 +287,18 @@ public int compare(Object o1, Object o2) { return Longs.compare(((Number) o1).longValue(), ((Number) o2).longValue()); } + + @Override + public int hashCode(Long o) + { + return o.hashCode(); + } + + @Override + public boolean equals(Long a, Long b) + { + return a.equals(b); + } } /** @@ -310,6 +326,12 @@ public Float read(ByteBuffer buffer, int offset) return buffer.getFloat(offset); } + @Override + public boolean groupable() + { + return true; + } + @Override public boolean readRetainsBufferReference() { @@ -334,6 +356,18 @@ public int compare(Object o1, Object o2) { return Floats.compare(((Number) o1).floatValue(), ((Number) o2).floatValue()); } + + @Override + public int hashCode(Float o) + { + return o.hashCode(); + } + + @Override + public boolean equals(Float a, Float b) + { + return a.equals(b); + } } /** @@ -362,6 +396,12 @@ public Double read(ByteBuffer buffer, int offset) return buffer.getDouble(offset); } + @Override + public boolean groupable() + { + return true; + } + @Override public boolean readRetainsBufferReference() { @@ -386,6 +426,18 @@ public int compare(Object o1, Object o2) { return Double.compare(((Number) o1).doubleValue(), ((Number) o2).doubleValue()); } + + @Override + public int hashCode(Double o) + { + return o.hashCode(); + } + + @Override + public boolean equals(Double a, Double b) + { + return a.equals(b); + } } /** @@ -436,6 +488,12 @@ public int write(ByteBuffer buffer, String value, int maxSizeBytes) return remaining; } + @Override + public boolean groupable() + { + return true; + } + @Override public int compare(Object s, Object s2) { @@ -449,6 +507,18 @@ public int compare(Object s, Object s2) return ORDERING.compare((String) s, (String) s2); } + + @Override + public int hashCode(String o) + { + return o.hashCode(); + } + + @Override + public boolean equals(String a, String b) + { + return a.equals(b); + } } /** @@ -520,6 +590,12 @@ public int write(ByteBuffer buffer, Object[] value, int maxSizeBytes) return extraNeeded < 0 ? extraNeeded : sizeBytes; } + @Override + public boolean groupable() + { + return true; + } + @Override public int compare(@Nullable Object o1Obj, @Nullable Object o2Obj) { @@ -546,5 +622,47 @@ public int compare(@Nullable Object o1Obj, @Nullable Object o2Obj) } return Integer.compare(o1.length, o2.length); } + + /** + * Implements {@link Arrays#hashCode(Object[])} but the element hashing uses the element's type strategy + */ + @Override + public int hashCode(Object[] o) + { + if (o == null) { + return 0; + } else { + int result = 1; + for (Object element : o) { + result = 31 * result + (element == null ? 0 : elementStrategy.hashCode(element)); + } + return result; + } + } + /** + * Implements {@link Arrays#equals} but the element equality uses the element's type strategy + */ + @Override + public boolean equals(@Nullable Object[] a, @Nullable Object[] b) + { + //noinspection ArrayEquality + if (a == b) { + return true; + } else if (a != null && b != null) { + int length = a.length; + if (b.length != length) { + return false; + } else { + for (int i = 0; i < length; ++i) { + if (!elementStrategy.equals(a[i], b[i])) { + return false; + } + } + return true; + } + } else { + return false; + } + } } } diff --git a/processing/src/main/java/org/apache/druid/segment/column/TypeStrategy.java b/processing/src/main/java/org/apache/druid/segment/column/TypeStrategy.java index 888e8203f4c1..9bd4ded9e985 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/TypeStrategy.java +++ b/processing/src/main/java/org/apache/druid/segment/column/TypeStrategy.java @@ -19,7 +19,9 @@ package org.apache.druid.segment.column; +import it.unimi.dsi.fastutil.Hash; import org.apache.druid.common.config.NullHandling; +import org.apache.druid.error.DruidException; import java.nio.ByteBuffer; import java.util.Comparator; @@ -63,7 +65,7 @@ * {@code Comparator}. So, we fall back to effectively erasing the generic type and having them all be * {@code Comparator}. */ -public interface TypeStrategy extends Comparator +public interface TypeStrategy extends Comparator, Hash.Strategy { /** * Estimate the size in bytes that writing this value to memory would require. This method is not required to be @@ -171,4 +173,30 @@ default T fromBytes(byte[] value) { throw new IllegalStateException("Not supported"); } + + /** + * Whether the type is groupable or not. This is always true for all the primitive types, arrays, and nested arrays + * therefore the SQL and the native layer might ignore this flag for those types. For complex types, this flag can be + * true or false, depending on whether the semantics and implementation of the type naturally leads to groupability + * or not. For example, it makes sense for JSON columns to be groupable, however there is little sense in grouping + * sketches (before finalizing). + * + * If a type is groupable, it MUST implement the {@link #hashCode} and {@link #equals} correctly + */ + default boolean groupable() + { + return false; + } + + @Override + default int hashCode(T o) + { + throw DruidException.defensive("Not implemented. Check groupable() first"); + } + + @Override + default boolean equals(T a, T b) + { + throw DruidException.defensive("Not implemented. Check groupable() first"); + } } diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java index 54a31a900e98..d6b499de74c7 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java @@ -496,4 +496,4 @@ private static Cursor createCursor() .build() ).lhs; } -} \ No newline at end of file +} diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java index 8db647701594..89ddb94c1c09 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java @@ -164,4 +164,4 @@ private static Cursor createCursor() .build() ).lhs; } -} \ No newline at end of file +} diff --git a/processing/src/test/java/org/apache/druid/segment/column/TypeStrategiesTest.java b/processing/src/test/java/org/apache/druid/segment/column/TypeStrategiesTest.java index 19b49212ecf0..670da7a5cf66 100644 --- a/processing/src/test/java/org/apache/druid/segment/column/TypeStrategiesTest.java +++ b/processing/src/test/java/org/apache/druid/segment/column/TypeStrategiesTest.java @@ -110,6 +110,12 @@ public int compare(Object o1, Object o2) { return 0; } + + @Override + public boolean groupable() + { + return false; + } }); } @@ -693,6 +699,12 @@ public NullableLongPair fromBytes(byte[] value) { return read(ByteBuffer.wrap(value)); } + + @Override + public boolean groupable() + { + return false; + } } @Test From 973fa88795f67dacbd3a2f9b3b882c54fd9752c5 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Mon, 1 Apr 2024 16:35:51 +0530 Subject: [PATCH 20/46] some review, more tests --- .../GroupByColumnSelectorStrategyFactory.java | 13 +++++- .../ObjectStrategyComplexTypeStrategy.java | 44 ++++++++++++++++++- .../druid/segment/column/TypeStrategies.java | 2 +- .../druid/segment/column/TypeStrategy.java | 1 - .../nested/NestedDataComplexTypeSerde.java | 31 +++++++++++++ .../druid/segment/nested/StructuredData.java | 8 ++++ .../druid/sql/calcite/rel/DruidQuery.java | 14 +++--- .../sql/calcite/CalciteGroupByQueryTest.java | 24 ++++++++++ 8 files changed, 125 insertions(+), 12 deletions(-) create mode 100644 sql/src/test/java/org/apache/druid/sql/calcite/CalciteGroupByQueryTest.java diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java index 02f49a9a47ff..3f475770fcf1 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java @@ -20,6 +20,7 @@ package org.apache.druid.query.groupby.epinephelinae; import org.apache.druid.error.DruidException; +import org.apache.druid.error.InvalidInput; import org.apache.druid.java.util.common.IAE; import org.apache.druid.query.dimension.ColumnSelectorStrategyFactory; import org.apache.druid.query.groupby.epinephelinae.column.DictionaryBuildingGroupByColumnSelectorStrategy; @@ -33,8 +34,11 @@ /** * Creates {@link org.apache.druid.query.dimension.ColumnSelectorStrategy}s for grouping dimensions - * - * TODO(laksh): Describe the steps and mv-handling + * If the type is STRING, then it delegates the group by handling to {@link KeyMappingMultiValueGroupByColumnSelectorStrategy} + * which is specialized for {@link DimensionSelector}s and multi-value dimensions. + * If the type is numeric, then it delegates the handling to the {@link FixedWidthGroupByColumnSelectorStrategy} + * Else, it delegates the handling to {@link DictionaryBuildingGroupByColumnSelectorStrategy} which is a generic strategy + * and builds dictionaries on the fly. */ public class GroupByColumnSelectorStrategyFactory implements ColumnSelectorStrategyFactory { @@ -47,6 +51,11 @@ public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( if (capabilities == null || capabilities.getType() == null) { throw DruidException.defensive("Unable to deduce type for the grouping dimension"); } + if (!capabilities.toColumnType().getNullableStrategy().groupable()) { + // InvalidInput because the SQL planner would have already flagged these dimensions, therefore this will only happen + // if native queries have been submitted. + throw InvalidInput.exception("Unable to group on the type [%s]", capabilities.toColumnType()); + } switch (capabilities.getType()) { case STRING: return KeyMappingMultiValueGroupByColumnSelectorStrategy.create(capabilities, (DimensionSelector) selector); diff --git a/processing/src/main/java/org/apache/druid/segment/column/ObjectStrategyComplexTypeStrategy.java b/processing/src/main/java/org/apache/druid/segment/column/ObjectStrategyComplexTypeStrategy.java index d40ee5dee8e1..03b7f12d4213 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/ObjectStrategyComplexTypeStrategy.java +++ b/processing/src/main/java/org/apache/druid/segment/column/ObjectStrategyComplexTypeStrategy.java @@ -19,6 +19,8 @@ package org.apache.druid.segment.column; +import it.unimi.dsi.fastutil.Hash; +import org.apache.druid.error.DruidException; import org.apache.druid.segment.data.ObjectStrategy; import javax.annotation.Nullable; @@ -27,7 +29,7 @@ /** * Default implementation of {@link TypeStrategy} for all {@link org.apache.druid.segment.serde.ComplexMetricSerde} * implementations that just wraps the {@link ObjectStrategy} they are required to implement. - * + *

* This is not likely to be the most efficient way to do things, especially since writing must first produce a byte * array before it can be written to the buffer, but it is cheap and should work correctly, which is important. */ @@ -35,11 +37,27 @@ public class ObjectStrategyComplexTypeStrategy implements TypeStrategy { private final ObjectStrategy objectStrategy; private final TypeSignature typeSignature; + private final boolean groupable; + @Nullable + private final Hash.Strategy hashStrategy; public ObjectStrategyComplexTypeStrategy(ObjectStrategy objectStrategy, TypeSignature signature) + { + this(objectStrategy, signature, false, null); + } + + public ObjectStrategyComplexTypeStrategy( + ObjectStrategy objectStrategy, + TypeSignature signature, + boolean groupable, + final Hash.Strategy hashStrategy + ) { this.objectStrategy = objectStrategy; this.typeSignature = signature; + this.groupable = groupable; + this.hashStrategy = hashStrategy; + } @Override @@ -94,4 +112,28 @@ public T fromBytes(byte[] value) { return objectStrategy.fromByteBufferSafe(ByteBuffer.wrap(value), value.length); } + + @Override + public boolean groupable() + { + return groupable; + } + + @Override + public int hashCode(T o) + { + if (hashStrategy == null) { + throw DruidException.defensive("hashStrategy not provided"); + } + return hashStrategy.hashCode(o); + } + + @Override + public boolean equals(T a, T b) + { + if (hashStrategy == null) { + throw DruidException.defensive("hashStrategy not provided"); + } + return hashStrategy.equals(a, b); + } } diff --git a/processing/src/main/java/org/apache/druid/segment/column/TypeStrategies.java b/processing/src/main/java/org/apache/druid/segment/column/TypeStrategies.java index 544e837addc9..bae29179b4d5 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/TypeStrategies.java +++ b/processing/src/main/java/org/apache/druid/segment/column/TypeStrategies.java @@ -593,7 +593,7 @@ public int write(ByteBuffer buffer, Object[] value, int maxSizeBytes) @Override public boolean groupable() { - return true; + return elementStrategy.groupable(); } @Override diff --git a/processing/src/main/java/org/apache/druid/segment/column/TypeStrategy.java b/processing/src/main/java/org/apache/druid/segment/column/TypeStrategy.java index 9bd4ded9e985..3d2493bc80c5 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/TypeStrategy.java +++ b/processing/src/main/java/org/apache/druid/segment/column/TypeStrategy.java @@ -145,7 +145,6 @@ default T read(ByteBuffer buffer, int offset) * Callers MUST check that the return value is positive which indicates a successful write, while a negative response * a partial write. * - * // TODO(laksh): Can be optimised for the primitive types * @return number of bytes written */ default int write(ByteBuffer buffer, int offset, T value, int maxSizeBytes) diff --git a/processing/src/main/java/org/apache/druid/segment/nested/NestedDataComplexTypeSerde.java b/processing/src/main/java/org/apache/druid/segment/nested/NestedDataComplexTypeSerde.java index c7c569545e15..8492bef7da1d 100644 --- a/processing/src/main/java/org/apache/druid/segment/nested/NestedDataComplexTypeSerde.java +++ b/processing/src/main/java/org/apache/druid/segment/nested/NestedDataComplexTypeSerde.java @@ -23,6 +23,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.dataformat.smile.SmileFactory; import com.fasterxml.jackson.dataformat.smile.SmileGenerator; +import it.unimi.dsi.fastutil.Hash; import org.apache.druid.data.input.impl.DimensionSchema; import org.apache.druid.guice.NestedDataModule; import org.apache.druid.jackson.DefaultObjectMapper; @@ -37,6 +38,8 @@ import org.apache.druid.segment.column.ColumnConfig; import org.apache.druid.segment.column.ColumnFormat; import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.segment.column.ObjectStrategyComplexTypeStrategy; +import org.apache.druid.segment.column.TypeStrategy; import org.apache.druid.segment.data.ObjectStrategy; import org.apache.druid.segment.serde.ComplexMetricExtractor; import org.apache.druid.segment.serde.ComplexMetricSerde; @@ -157,6 +160,34 @@ public byte[] toBytes(@Nullable Object val) }; } + @Override + public > TypeStrategy getTypeStrategy() + { + return new ObjectStrategyComplexTypeStrategy<>( + getObjectStrategy(), + ColumnType.ofComplex(TYPE_NAME), + true, + new Hash.Strategy() + { + @Override + public int hashCode(Object o) + { + return StructuredData.wrap(o).equalityHash(); +// return StructuredData.wrap(o).hashCode(); + } + + @Override + public boolean equals(Object a, Object b) + { + // .equals() implementation of structured data is not very good for our purpose. It resorts to the object + // equality + return StructuredData.wrap(a).compareTo(StructuredData.wrap(b)) == 0; +// return StructuredData.wrap(a).equals(StructuredData.wrap(b)); + } + } + ); + } + public static class NestedColumnFormatV4 implements ColumnFormat { @Override diff --git a/processing/src/main/java/org/apache/druid/segment/nested/StructuredData.java b/processing/src/main/java/org/apache/druid/segment/nested/StructuredData.java index 32513d62e025..fb5c9cc17902 100644 --- a/processing/src/main/java/org/apache/druid/segment/nested/StructuredData.java +++ b/processing/src/main/java/org/apache/druid/segment/nested/StructuredData.java @@ -21,6 +21,7 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.core.JsonProcessingException; +import com.google.common.primitives.Longs; import net.jpountz.xxhash.XXHash64; import net.jpountz.xxhash.XXHashFactory; import org.apache.druid.java.util.common.guava.Comparators; @@ -185,6 +186,13 @@ public int hashCode() return Objects.hash(value); } + // hashCode that relies on the object equality. Translates the hashcode to an integer as well + // TODO(laksh): better name + public int equalityHash() + { + return Longs.hashCode(hash.getAsLong()); + } + @Override public String toString() { diff --git a/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java b/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java index 064e45cc1804..c87eeca8edf5 100644 --- a/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java +++ b/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java @@ -484,16 +484,16 @@ private static List computeDimensions( final RelDataType dataType = rexNode.getType(); final ColumnType outputType = Calcites.getColumnTypeForRelDataType(dataType); - // TODO(laksh): This might change if we disallow certain complex types from grouping if (outputType == null) { - // Can't group on unknown or COMPLEX types. - plannerContext.setPlanningError( - "SQL requires a group-by on a column with unknown type that is unsupported.", - outputType - ); + // Can't group on unknown types. + plannerContext.setPlanningError("SQL requires a group-by on a column with unknown type that is unsupported."); + throw new CannotBuildQueryException(aggregate, rexNode); + } + if (!outputType.getNullableStrategy().groupable()) { + // Can't group on 'ungroupable' types. + plannerContext.setPlanningError("SQL requires a group-by column with ungroupable type [%s].", outputType); throw new CannotBuildQueryException(aggregate, rexNode); } - final String dimOutputName = outputNamePrefix + outputNameCounter++; if (!druidExpression.isSimpleExtraction()) { final String virtualColumn = virtualColumnRegistry.getOrCreateVirtualColumnForExpression( diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteGroupByQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteGroupByQueryTest.java new file mode 100644 index 000000000000..4595de28fcac --- /dev/null +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteGroupByQueryTest.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.sql.calcite; + +public class CalciteGroupByQueryTest +{ +} From 8b62073ebf6f6fcac2b12d1089ae3d3b3119af8e Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Tue, 2 Apr 2024 11:27:21 +0530 Subject: [PATCH 21/46] fixup benchmark --- .../benchmark/query/SqlGroupByBenchmark.java | 39 +++++++++++-------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java index b8d1d2ef2d36..0cc81fc2fc28 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java @@ -21,10 +21,10 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Lists; import org.apache.druid.common.config.NullHandling; import org.apache.druid.data.input.impl.DimensionSchema; import org.apache.druid.data.input.impl.DimensionsSpec; +import org.apache.druid.guice.NestedDataModule; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.guava.Sequence; @@ -92,6 +92,7 @@ public class SqlGroupByBenchmark static { NullHandling.initializeForTests(); ExpressionProcessing.initializeForTests(); + NestedDataModule.registerHandlersAndSerde(); } private static final DruidProcessingConfig PROCESSING_CONFIG = new DruidProcessingConfig() @@ -122,29 +123,29 @@ public String getFormatString() }; @Param({ -// "string-Sequential-100_000", -// "string-Sequential-10_000_000", + "string-Sequential-100_000", + "string-Sequential-10_000_000", // "string-Sequential-1_000_000_000", -// "string-ZipF-1_000_000", -// "string-Uniform-1_000_000", + "string-ZipF-1_000_000", + "string-Uniform-1_000_000", -// "multi-string-Sequential-100_000", -// "multi-string-Sequential-10_000_000", + "multi-string-Sequential-100_000", + "multi-string-Sequential-10_000_000", // "multi-string-Sequential-1_000_000_000", -// "multi-string-ZipF-1_000_000", -// "multi-string-Uniform-1_000_000", + "multi-string-ZipF-1_000_000", + "multi-string-Uniform-1_000_000", -// "long-Sequential-100_000", -// "long-Sequential-10_000_000", + "long-Sequential-100_000", + "long-Sequential-10_000_000", // "long-Sequential-1_000_000_000", -// "long-ZipF-1_000_000", -// "long-Uniform-1_000_000", + "long-ZipF-1_000_000", + "long-Uniform-1_000_000", -// "double-ZipF-1_000_000", -// "double-Uniform-1_000_000", + "double-ZipF-1_000_000", + "double-Uniform-1_000_000", -// "float-ZipF-1_000_000", -// "float-Uniform-1_000_000", + "float-ZipF-1_000_000", + "float-Uniform-1_000_000", "stringArray-Sequential-100_000", "stringArray-Sequential-3_000_000", @@ -330,6 +331,10 @@ public void setup() .add(dataSegment2, index); closer.register(walker); + // Hacky and pollutes global namespace, but it is fine since benchmarks are run in isolation. Wasn't able + // to work up a cleaner way of doing it by modifying the injector. + CalciteTests.getJsonMapper().registerModules(NestedDataModule.getJacksonModulesList()); + final DruidSchemaCatalog rootSchema = CalciteTests.createMockRootSchema(conglomerate, walker, plannerConfig, AuthTestUtils.TEST_AUTHORIZER_MAPPER); engine = CalciteTests.createMockSqlEngine(walker, conglomerate); From c07d77dd5c913445e3590b708b94603448692018 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Mon, 8 Apr 2024 16:12:20 +0530 Subject: [PATCH 22/46] checkstyle --- .../segment/nested/NestedDataComplexTypeSerde.java | 10 +++++----- .../org/apache/druid/sql/calcite/QueryTestRunner.java | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/segment/nested/NestedDataComplexTypeSerde.java b/processing/src/main/java/org/apache/druid/segment/nested/NestedDataComplexTypeSerde.java index 8492bef7da1d..27df8fc2cb10 100644 --- a/processing/src/main/java/org/apache/druid/segment/nested/NestedDataComplexTypeSerde.java +++ b/processing/src/main/java/org/apache/druid/segment/nested/NestedDataComplexTypeSerde.java @@ -172,17 +172,17 @@ public > TypeStrategy getTypeStrategy() @Override public int hashCode(Object o) { - return StructuredData.wrap(o).equalityHash(); -// return StructuredData.wrap(o).hashCode(); + // TODO(laksh): VET, Check if StructuredData.wrap(o).hashCode() makes sense, given that most of the objects inside + // are primitives or those that have implemented .hashCode correctly + return StructuredData.wrap(o).equalityHash(); } @Override public boolean equals(Object a, Object b) { - // .equals() implementation of structured data is not very good for our purpose. It resorts to the object - // equality + // TODO(laksh): VET, .equals() implementation of structured data is not very good for our purpose. It + // resorts to the object equality return StructuredData.wrap(a).compareTo(StructuredData.wrap(b)) == 0; -// return StructuredData.wrap(a).equals(StructuredData.wrap(b)); } } ); diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/QueryTestRunner.java b/sql/src/test/java/org/apache/druid/sql/calcite/QueryTestRunner.java index 168a0f02bc4b..58fa5635e8f5 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/QueryTestRunner.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/QueryTestRunner.java @@ -385,7 +385,7 @@ public VerifyNativeQueries(BaseExecuteQuery execStep) public void verify() { for (QueryResults queryResults : execStep.results()) { -// verifyQuery(queryResults); + verifyQuery(queryResults); } } From 2e0fe2645752565e53b08d5a7076fb5f0736a7e9 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Mon, 8 Apr 2024 16:49:23 +0530 Subject: [PATCH 23/46] codeql --- .../org/apache/druid/benchmark/query/SqlGroupByBenchmark.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java index 0cc81fc2fc28..bec37960a52d 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java @@ -196,7 +196,7 @@ public void setup() final PlannerConfig plannerConfig = new PlannerConfig(); String columnCardinalityWithUnderscores = groupingDimension.substring(groupingDimension.lastIndexOf('-') + 1); - int rowsPerSegment = Integers.parseInt(columnCardinalityWithUnderscores.replaceAll("_", "")); + int rowsPerSegment = Integers.parseInt(StringUtils.replace(columnCardinalityWithUnderscores, "_", "")); final SegmentGenerator segmentGenerator = closer.register(new SegmentGenerator()); log.info("Starting benchmark setup using cacheDir[%s], rows[%,d].", segmentGenerator.getCacheDir(), rowsPerSegment); From a5803264071b21929b617fe12a570a857e87556b Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Tue, 9 Apr 2024 02:15:00 +0530 Subject: [PATCH 24/46] static check --- .../org/apache/druid/benchmark/query/SqlGroupByBenchmark.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java index bec37960a52d..ed9460fa8b40 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java @@ -29,7 +29,6 @@ import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.guava.Sequence; import org.apache.druid.java.util.common.io.Closer; -import org.apache.druid.java.util.common.logger.Logger; import org.apache.druid.math.expr.ExpressionProcessing; import org.apache.druid.query.DruidProcessingConfig; import org.apache.druid.query.QueryRunnerFactoryConglomerate; @@ -87,8 +86,6 @@ @Measurement(iterations = 5) public class SqlGroupByBenchmark { - private static final Logger log = new Logger(SqlGroupByBenchmark.class); - static { NullHandling.initializeForTests(); ExpressionProcessing.initializeForTests(); @@ -199,7 +196,6 @@ public void setup() int rowsPerSegment = Integers.parseInt(StringUtils.replace(columnCardinalityWithUnderscores, "_", "")); final SegmentGenerator segmentGenerator = closer.register(new SegmentGenerator()); - log.info("Starting benchmark setup using cacheDir[%s], rows[%,d].", segmentGenerator.getCacheDir(), rowsPerSegment); TransformSpec transformSpec = new TransformSpec( null, From 15fb3bbd4cfc1430e34489d93f893dd9a346a58f Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Tue, 9 Apr 2024 16:37:30 +0530 Subject: [PATCH 25/46] static check --- .../org/apache/druid/benchmark/query/SqlGroupByBenchmark.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java index ed9460fa8b40..52745e62fb30 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java @@ -58,7 +58,6 @@ import org.apache.druid.sql.calcite.util.CalciteTests; import org.apache.druid.timeline.DataSegment; import org.apache.druid.timeline.partition.LinearShardSpec; -import org.apache.logging.log4j.core.util.Integers; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Fork; @@ -193,7 +192,7 @@ public void setup() final PlannerConfig plannerConfig = new PlannerConfig(); String columnCardinalityWithUnderscores = groupingDimension.substring(groupingDimension.lastIndexOf('-') + 1); - int rowsPerSegment = Integers.parseInt(StringUtils.replace(columnCardinalityWithUnderscores, "_", "")); + int rowsPerSegment = Integer.parseInt(StringUtils.replace(columnCardinalityWithUnderscores, "_", "")); final SegmentGenerator segmentGenerator = closer.register(new SegmentGenerator()); From 30669424023120b01c4143a1082a07f283ded650 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Tue, 9 Apr 2024 16:46:05 +0530 Subject: [PATCH 26/46] static check --- .../epinephelinae/RowBasedGrouperHelper.java | 4 ---- ...KeyMappingGroupByColumnSelectorStrategy.java | 16 +++++----------- ...DictionaryGroupByColumnSelectorStrategy.java | 17 ----------------- 3 files changed, 5 insertions(+), 32 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java index 2397cf758747..12591c39981a 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java @@ -1593,9 +1593,7 @@ public void getFromByteBuffer(ByteBuffer buffer, int initialOffset, int dimValId private class GenericRowBasedKeySerdeHelper extends DictionaryBuildingSingleValuedRowBasedKeySerdeHelper { - final int keyBufferPosition; final BufferComparator bufferComparator; - final ColumnType columnType; final String columnTypeName; final List dictionary; @@ -1607,9 +1605,7 @@ public GenericRowBasedKeySerdeHelper( ) { super(keyBufferPosition); - this.keyBufferPosition = keyBufferPosition; validateColumnType(columnType); - this.columnType = columnType; this.columnTypeName = columnType.asTypeString(); this.dictionary = genericDictionaries.computeIfAbsent( columnTypeName, diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java index d202599ec8ea..4dc3c1b383f0 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java @@ -33,10 +33,10 @@ import java.nio.ByteBuffer; /** - * Strategy for grouping dimensions which can have variable-width objects. Materializing such objects on the buffer + * Strategy for grouping single value dimensions which can have variable-width objects. Materializing such objects on the buffer * require an additional step of mapping them to an integer index. The integer index can be materialized on the buffer within * a fixed width, and is often backed by a dictionary representing the actual dimension object. It is used for arrays, - * strings, and complex types. + * and complex types. *

* The visibility of the class is limited, and the callers must use one of the two variants of the mapping strategy: * 1. {@link PrebuiltDictionaryGroupByColumnSelectorStrategy} @@ -51,24 +51,18 @@ * and therefore nulls (-1) would be adjacent to nulls (represented by the lowest non-negative dictionary id), and would get * grouped in the later merge stages. *

- * It only handles single value dimensions, i.e. all types except for strings. Strings are handled by the implementations + * It only handles single value dimensions, i.e. every type except for strings. Strings are handled by the implementations * of {@link KeyMappingMultiValueGroupByColumnSelectorStrategy} *

* It only handles non-primitive types, because numeric primitives are handled by the {@link FixedWidthGroupByColumnSelectorStrategy} * and the string primitives are handled by the {@link KeyMappingMultiValueGroupByColumnSelectorStrategy} * - * @param > Class of the dimension - * @param Class of the "dimension holder". For single-value dimensions, the holder's type and the - * holder's object are equivalent to the dimension. For multi-value dimensions (only strings), - * the holder's type and the object are different, where the type would be {@link org.apache.druid.segment.data.IndexedInts} - * representing all the values in the multi-valued string, while the dimension type would be - * String + * @param > Class of the dimension * @see DimensionToIdConverter encoding logic for converting value to dictionary * @see IdToDimensionConverter decoding logic for converting back dictionary to value */ @NotThreadSafe -class KeyMappingGroupByColumnSelectorStrategy - implements GroupByColumnSelectorStrategy +class KeyMappingGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy { /** * Converts the dimension to equivalent dictionaryId. diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryGroupByColumnSelectorStrategy.java index e78bca238e8e..c2e1cc364f16 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryGroupByColumnSelectorStrategy.java @@ -19,11 +19,6 @@ package org.apache.druid.query.groupby.epinephelinae.column; -import org.apache.druid.error.DruidException; -import org.apache.druid.segment.ColumnValueSelector; -import org.apache.druid.segment.column.ColumnCapabilities; -import org.apache.druid.segment.column.ColumnType; - /** * Implementation of {@link KeyMappingGroupByColumnSelectorStrategy} that relies on a prebuilt dictionary to map the * dimension to the dictionaryId. It is more like a helper class, that handles the different ways that dictionaries can be @@ -32,16 +27,4 @@ */ public class PrebuiltDictionaryGroupByColumnSelectorStrategy { - /** - * Create the strategy for the provided column type - */ - public static GroupByColumnSelectorStrategy forType( - final ColumnType columnType, - final ColumnValueSelector columnValueSelector, - final ColumnCapabilities columnCapabilities - ) - { - throw DruidException.defensive("Only string columns expose prebuilt dictionaries, and they should be " - + "handled separately"); - } } From 8cd955e5509a407f9ee33f22521c4dd312a61f6d Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Tue, 9 Apr 2024 17:13:15 +0530 Subject: [PATCH 27/46] delete --- ...ctionaryGroupByColumnSelectorStrategy.java | 30 ------------------- 1 file changed, 30 deletions(-) delete mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryGroupByColumnSelectorStrategy.java diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryGroupByColumnSelectorStrategy.java deleted file mode 100644 index c2e1cc364f16..000000000000 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/PrebuiltDictionaryGroupByColumnSelectorStrategy.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -/** - * Implementation of {@link KeyMappingGroupByColumnSelectorStrategy} that relies on a prebuilt dictionary to map the - * dimension to the dictionaryId. It is more like a helper class, that handles the different ways that dictionaries can be - * provided for different types. Array dimensions are backed by dictionaries, but not exposed via the ColumnValueSelector interface, - * hence this strategy cannot handle array dimensions currently. - */ -public class PrebuiltDictionaryGroupByColumnSelectorStrategy -{ -} From be500d5f97a4b950afcc642d83c1eb321882ad6d Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Fri, 12 Apr 2024 11:01:25 +0530 Subject: [PATCH 28/46] tests fix --- .../filter/ArrayContainsElementFilter.java | 6 +++- .../ObjectStrategyComplexTypeStrategy.java | 2 +- .../calcite/CalciteNestedDataQueryTest.java | 29 ++++++++++++++----- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/filter/ArrayContainsElementFilter.java b/processing/src/main/java/org/apache/druid/query/filter/ArrayContainsElementFilter.java index 129c8570dbb9..6fbe38451522 100644 --- a/processing/src/main/java/org/apache/druid/query/filter/ArrayContainsElementFilter.java +++ b/processing/src/main/java/org/apache/druid/query/filter/ArrayContainsElementFilter.java @@ -26,6 +26,7 @@ import com.google.common.base.Suppliers; import com.google.common.collect.ImmutableSet; import com.google.common.collect.RangeSet; +import org.apache.druid.error.DruidException; import org.apache.druid.error.InvalidInput; import org.apache.druid.java.util.common.IAE; import org.apache.druid.math.expr.ExprEval; @@ -108,7 +109,10 @@ public byte[] getCacheKey() final NullableTypeStrategy typeStrategy = elementMatchValueEval.type().getNullableStrategy(); final int size = typeStrategy.estimateSizeBytes(elementMatchValueEval.value()); final ByteBuffer valueBuffer = ByteBuffer.allocate(size); - typeStrategy.write(valueBuffer, elementMatchValueEval.value(), size); + if (typeStrategy.write(valueBuffer, elementMatchValueEval.value(), size) < 0) { + // Defensive check, since the size had already been estimated from the same type strategy + throw DruidException.defensive("Unable to write the value"); + } return new CacheKeyBuilder(DimFilterUtils.ARRAY_CONTAINS_CACHE_ID) .appendByte(DimFilterUtils.STRING_SEPARATOR) .appendString(column) diff --git a/processing/src/main/java/org/apache/druid/segment/column/ObjectStrategyComplexTypeStrategy.java b/processing/src/main/java/org/apache/druid/segment/column/ObjectStrategyComplexTypeStrategy.java index 03b7f12d4213..52e4599586b9 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/ObjectStrategyComplexTypeStrategy.java +++ b/processing/src/main/java/org/apache/druid/segment/column/ObjectStrategyComplexTypeStrategy.java @@ -50,7 +50,7 @@ public ObjectStrategyComplexTypeStrategy( ObjectStrategy objectStrategy, TypeSignature signature, boolean groupable, - final Hash.Strategy hashStrategy + @Nullable final Hash.Strategy hashStrategy ) { this.objectStrategy = objectStrategy; diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java index 88f7e7dd8aec..4bec9e16eb9d 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java @@ -108,12 +108,6 @@ public class CalciteNestedDataQueryTest extends BaseCalciteQueryTest .put("long", 4L) .put("nester", "hello") .build(), - ImmutableMap.builder() - .put("t", "2000-01-01") - .put("string", "bbb") - .put("long", 4L) - .put("nester", "hello") - .build(), ImmutableMap.builder() .put("t", "2000-01-01") .put("string", "ccc") @@ -547,10 +541,29 @@ public void testTopNPath() @Test public void testGroupByNested() { + cannotVectorize(); testQuery( "SELECT nester, SUM(strlen(string)) FROM druid.nested GROUP BY 1", - ImmutableList.of(), - ImmutableList.of() + ImmutableList.of( + GroupByQuery.builder() + .setDataSource(DATA_SOURCE) + .setInterval(querySegmentSpec(Filtration.eternity())) + .setGranularity(Granularities.ALL) + .setVirtualColumns( + new ExpressionVirtualColumn("v0", "strlen(\"string\")", ColumnType.LONG, queryFramework().macroTable()) + ) + .setDimensions(dimensions(new DefaultDimensionSpec("nester", "d0", ColumnType.NESTED_DATA))) + .setAggregatorSpecs(aggregators(new LongSumAggregatorFactory("a0", "v0"))) + .setContext(QUERY_CONTEXT_DEFAULT) + .build() + ), + ImmutableList.of( + new Object[]{null, 9L}, + new Object[]{"\"hello\"", 3L}, + new Object[]{"2", 3L}, + new Object[]{"{\"array\":[\"a\",\"b\"],\"n\":{\"x\":\"hello\"}}", 3L}, + new Object[]{"{\"array\":[\"a\",\"b\"],\"n\":{\"x\":1}}", 3L} + ) ); } From 6090d82db761db7d130a81c0fe9abc94d8acf54c Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Fri, 12 Apr 2024 12:11:33 +0530 Subject: [PATCH 29/46] openrewrite --- .../column/FixedWidthGroupByColumnSelectorStrategyTest.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java index d6b499de74c7..61e7d5ee1932 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java @@ -31,15 +31,12 @@ import org.apache.druid.testing.InitializedNullHandlingTest; import org.junit.Assert; import org.junit.Test; -import org.junit.experimental.runners.Enclosed; -import org.junit.runner.RunWith; import org.mockito.Mockito; import javax.annotation.Nullable; import java.nio.ByteBuffer; import java.util.List; -@RunWith(Enclosed.class) public class FixedWidthGroupByColumnSelectorStrategyTest extends InitializedNullHandlingTest { private static final List DATASOURCE_ROWS = ImmutableList.of( From f9bbc21d1e0708735a18ec153eaf5e391c09a0b6 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Fri, 12 Apr 2024 12:26:58 +0530 Subject: [PATCH 30/46] Revert "openrewrite" This reverts commit 6090d82db761db7d130a81c0fe9abc94d8acf54c. --- .../column/FixedWidthGroupByColumnSelectorStrategyTest.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java index 61e7d5ee1932..d6b499de74c7 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java @@ -31,12 +31,15 @@ import org.apache.druid.testing.InitializedNullHandlingTest; import org.junit.Assert; import org.junit.Test; +import org.junit.experimental.runners.Enclosed; +import org.junit.runner.RunWith; import org.mockito.Mockito; import javax.annotation.Nullable; import java.nio.ByteBuffer; import java.util.List; +@RunWith(Enclosed.class) public class FixedWidthGroupByColumnSelectorStrategyTest extends InitializedNullHandlingTest { private static final List DATASOURCE_ROWS = ImmutableList.of( From c28eb797dc287247839ca5121954530098ab042a Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Fri, 12 Apr 2024 12:31:59 +0530 Subject: [PATCH 31/46] openrewrite test --- .github/workflows/static-checks.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/static-checks.yml b/.github/workflows/static-checks.yml index a87000ac07e0..a374cf72ccfe 100644 --- a/.github/workflows/static-checks.yml +++ b/.github/workflows/static-checks.yml @@ -163,9 +163,17 @@ jobs: ${MVN} install -q -ff -pl 'distribution' ${MAVEN_SKIP} ${MAVEN_SKIP_TESTS} - name: rewrite:dryRun + id: rewrite-dryRun run: | ${MVN} rewrite:dryRun ${MAVEN_SKIP} + - name: Upload open rewrite patch + if: ${{ failure() && steps.rewrite-dryRun.conclusion == 'failure' }} + uses: actions/upload-artifact@master + with: + name: Rewrite patch + path: ./target/rewrite/rewrite.patch + web-checks: strategy: fail-fast: false From b51248f673711f7ec276b01e31e4bcc1e0c3e845 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Fri, 12 Apr 2024 13:01:59 +0530 Subject: [PATCH 32/46] Revert "openrewrite test" This reverts commit c28eb797dc287247839ca5121954530098ab042a. --- .github/workflows/static-checks.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/static-checks.yml b/.github/workflows/static-checks.yml index a374cf72ccfe..a87000ac07e0 100644 --- a/.github/workflows/static-checks.yml +++ b/.github/workflows/static-checks.yml @@ -163,17 +163,9 @@ jobs: ${MVN} install -q -ff -pl 'distribution' ${MAVEN_SKIP} ${MAVEN_SKIP_TESTS} - name: rewrite:dryRun - id: rewrite-dryRun run: | ${MVN} rewrite:dryRun ${MAVEN_SKIP} - - name: Upload open rewrite patch - if: ${{ failure() && steps.rewrite-dryRun.conclusion == 'failure' }} - uses: actions/upload-artifact@master - with: - name: Rewrite patch - path: ./target/rewrite/rewrite.patch - web-checks: strategy: fail-fast: false From 031f5863dec4e97aa873818f35669c57c785f536 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Fri, 12 Apr 2024 13:02:05 +0530 Subject: [PATCH 33/46] Revert "Revert "openrewrite test"" This reverts commit b51248f673711f7ec276b01e31e4bcc1e0c3e845. --- .github/workflows/static-checks.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/static-checks.yml b/.github/workflows/static-checks.yml index a87000ac07e0..a374cf72ccfe 100644 --- a/.github/workflows/static-checks.yml +++ b/.github/workflows/static-checks.yml @@ -163,9 +163,17 @@ jobs: ${MVN} install -q -ff -pl 'distribution' ${MAVEN_SKIP} ${MAVEN_SKIP_TESTS} - name: rewrite:dryRun + id: rewrite-dryRun run: | ${MVN} rewrite:dryRun ${MAVEN_SKIP} + - name: Upload open rewrite patch + if: ${{ failure() && steps.rewrite-dryRun.conclusion == 'failure' }} + uses: actions/upload-artifact@master + with: + name: Rewrite patch + path: ./target/rewrite/rewrite.patch + web-checks: strategy: fail-fast: false From 170d8d90265a20480bf7d6509882dfacb7374223 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Fri, 12 Apr 2024 15:33:34 +0530 Subject: [PATCH 34/46] openrewrite test --- .../column/FixedWidthGroupByColumnSelectorStrategyTest.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java index d6b499de74c7..61e7d5ee1932 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java @@ -31,15 +31,12 @@ import org.apache.druid.testing.InitializedNullHandlingTest; import org.junit.Assert; import org.junit.Test; -import org.junit.experimental.runners.Enclosed; -import org.junit.runner.RunWith; import org.mockito.Mockito; import javax.annotation.Nullable; import java.nio.ByteBuffer; import java.util.List; -@RunWith(Enclosed.class) public class FixedWidthGroupByColumnSelectorStrategyTest extends InitializedNullHandlingTest { private static final List DATASOURCE_ROWS = ImmutableList.of( From 60c2b42b49dca9fab7e1aee5447550b486c46b4f Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Fri, 12 Apr 2024 15:42:56 +0530 Subject: [PATCH 35/46] review comments --- .../java/org/apache/druid/sql/calcite/rel/DruidQuery.java | 5 +++-- .../java/org/apache/druid/sql/calcite/CalciteQueryTest.java | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java b/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java index 7b2f64c2c6cc..95a337921896 100644 --- a/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java +++ b/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java @@ -491,7 +491,7 @@ private static List computeDimensions( } if (!outputType.getNullableStrategy().groupable()) { // Can't group on 'ungroupable' types. - plannerContext.setPlanningError("SQL requires a group-by column with ungroupable type [%s].", outputType); + plannerContext.setPlanningError("SQL requires a group-by on a column with type [%s] that is unsupported.", outputType); throw new CannotBuildQueryException(aggregate, rexNode); } final String dimOutputName = outputNamePrefix + outputNameCounter++; @@ -1249,7 +1249,8 @@ private TopNQuery toTopNQuery() } final DimensionSpec dimensionSpec = Iterables.getOnlyElement(grouping.getDimensions()).toDimensionSpec(); - // grouping col cannot be arrays or complex types + // TopN queries can't handle arrays or complex dimensions. Return's null so that they get planned as a group by query + // which does support complex and array dimensions if (!dimensionSpec.getOutputType().isPrimitive()) { return null; } diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteQueryTest.java index 125fd65047e6..91b0cebd3cba 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteQueryTest.java @@ -5849,7 +5849,7 @@ public void testUnplannableExactCountDistinctOnSketch() assertQueryIsUnplannable( PLANNER_CONFIG_NO_HLL, "SELECT unique_dim1, COUNT(*) FROM druid.foo GROUP BY 1", - "SQL requires a group-by on a column of type COMPLEX that is unsupported." + "SQL requires a group-by on a column with type [COMPLEX] that is unsupported." ); } From ad44b5f8bd68f948071e2c0ae2503bd5e201e70d Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Mon, 15 Apr 2024 11:19:13 +0530 Subject: [PATCH 36/46] test fix --- .../apache/druid/msq/exec/MSQSelectTest.java | 2 +- ...gregatorColumnSelectorStrategyFactory.java | 6 ++++++ .../ColumnSelectorStrategyFactory.java | 3 +++ .../GroupByColumnSelectorStrategyFactory.java | 6 ++++++ .../epinephelinae/RowBasedGrouperHelper.java | 7 +++++++ ...yMappingGroupByColumnSelectorStrategy.java | 2 +- .../druid/query/search/SearchQueryRunner.java | 6 ++++++ .../TopNColumnAggregatesProcessorFactory.java | 6 ++++++ .../druid/segment/DimensionHandlerUtils.java | 18 +++++++++++++----- .../query/groupby/GroupByQueryRunnerTest.java | 19 +++---------------- 10 files changed, 52 insertions(+), 23 deletions(-) diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQSelectTest.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQSelectTest.java index c2d7e1af6c58..7c4af7389f6c 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQSelectTest.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQSelectTest.java @@ -2028,7 +2028,7 @@ public void testGroupByWithComplexColumnThrowsUnsupportedException(String contex .setExpectedExecutionErrorMatcher(CoreMatchers.allOf( CoreMatchers.instanceOf(DruidException.class), ThrowableMessageMatcher.hasMessage(CoreMatchers.containsString( - "SQL requires a group-by on a column of type COMPLEX that is unsupported")) + "SQL requires a group-by on a column with type [COMPLEX] that is unsupported.")) )) .verifyExecutionError(); } diff --git a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/CardinalityAggregatorColumnSelectorStrategyFactory.java b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/CardinalityAggregatorColumnSelectorStrategyFactory.java index 12ed3c5f04b4..237acc20ab29 100644 --- a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/CardinalityAggregatorColumnSelectorStrategyFactory.java +++ b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/CardinalityAggregatorColumnSelectorStrategyFactory.java @@ -46,4 +46,10 @@ public CardinalityAggregatorColumnSelectorStrategy makeColumnSelectorStrategy( throw new IAE("Cannot create query type helper from invalid type [%s]", capabilities.asTypeString()); } } + + @Override + public boolean supportsNestedArraysAndComplexTypes() + { + return false; + } } diff --git a/processing/src/main/java/org/apache/druid/query/dimension/ColumnSelectorStrategyFactory.java b/processing/src/main/java/org/apache/druid/query/dimension/ColumnSelectorStrategyFactory.java index dc9304d26962..ba36b8085dfb 100644 --- a/processing/src/main/java/org/apache/druid/query/dimension/ColumnSelectorStrategyFactory.java +++ b/processing/src/main/java/org/apache/druid/query/dimension/ColumnSelectorStrategyFactory.java @@ -25,4 +25,7 @@ public interface ColumnSelectorStrategyFactory { ColumnSelectorStrategyClass makeColumnSelectorStrategy(ColumnCapabilities capabilities, ColumnValueSelector selector); + + // TODO(laksh): Javadoc + boolean supportsNestedArraysAndComplexTypes(); } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java index 3f475770fcf1..b46b8f753a4e 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java @@ -98,4 +98,10 @@ public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( throw new IAE("Cannot create query type helper from invalid type [%s]", capabilities.asTypeString()); } } + + @Override + public boolean supportsNestedArraysAndComplexTypes() + { + return true; + } } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java index 12591c39981a..4edc0d87d62e 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java @@ -739,6 +739,7 @@ public Supplier makeInputRawSupplier(DimensionSelector selector) } } + // TODO(laksh): Figure out why this isn't getting triggered private static class InputRawSupplierColumnSelectorStrategyFactory implements ColumnSelectorStrategyFactory { @@ -783,6 +784,12 @@ public InputRawSupplierColumnSelectorStrategy makeColumnSelectorStrategy( throw new IAE("Cannot create query type helper from invalid type [%s]", capabilities.asTypeString()); } } + + @Override + public boolean supportsNestedArraysAndComplexTypes() + { + return true; + } } @SuppressWarnings("unchecked") diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java index 4dc3c1b383f0..9373cb6018b4 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java @@ -39,7 +39,7 @@ * and complex types. *

* The visibility of the class is limited, and the callers must use one of the two variants of the mapping strategy: - * 1. {@link PrebuiltDictionaryGroupByColumnSelectorStrategy} + * 1. TODO(laksh): PrebuiltDictionaryGroupByColumnSelectorStrategy - (not available, because no one is using it) * 2. {@link DictionaryBuildingGroupByColumnSelectorStrategy} *

* {@code null} can be represented by either -1 or the position of null in the dictionary it was stored when it was diff --git a/processing/src/main/java/org/apache/druid/query/search/SearchQueryRunner.java b/processing/src/main/java/org/apache/druid/query/search/SearchQueryRunner.java index 776c115408e9..449219c48a8e 100644 --- a/processing/src/main/java/org/apache/druid/query/search/SearchQueryRunner.java +++ b/processing/src/main/java/org/apache/druid/query/search/SearchQueryRunner.java @@ -84,6 +84,12 @@ public SearchColumnSelectorStrategy makeColumnSelectorStrategy( throw new IAE("Cannot create query type helper from invalid type [%s]", capabilities.asTypeString()); } } + + @Override + public boolean supportsNestedArraysAndComplexTypes() + { + return false; + } } public interface SearchColumnSelectorStrategy extends ColumnSelectorStrategy diff --git a/processing/src/main/java/org/apache/druid/query/topn/types/TopNColumnAggregatesProcessorFactory.java b/processing/src/main/java/org/apache/druid/query/topn/types/TopNColumnAggregatesProcessorFactory.java index 0eafee26af72..cbbae69ec22c 100644 --- a/processing/src/main/java/org/apache/druid/query/topn/types/TopNColumnAggregatesProcessorFactory.java +++ b/processing/src/main/java/org/apache/druid/query/topn/types/TopNColumnAggregatesProcessorFactory.java @@ -78,4 +78,10 @@ public TopNColumnAggregatesProcessor makeColumnSelectorStrategy( throw new IAE("Cannot create query type helper from invalid type [%s]", capabilities.asTypeString()); } + + @Override + public boolean supportsNestedArraysAndComplexTypes() + { + return false; + } } diff --git a/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java b/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java index d4c15864275c..5ae916276a2e 100644 --- a/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java +++ b/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java @@ -202,7 +202,8 @@ public static ColumnSelectorPlus selector = getColumnValueSelectorFromDimensionSpec( dimSpec, - columnSelectorFactory + columnSelectorFactory, + strategyFactory.supportsNestedArraysAndComplexTypes() ); Strategy strategy = makeStrategy( strategyFactory, @@ -223,12 +224,13 @@ public static ColumnSelectorPlus getColumnValueSelectorFromDimensionSpec( DimensionSpec dimSpec, - ColumnSelectorFactory columnSelectorFactory + ColumnSelectorFactory columnSelectorFactory, + boolean supportsComplexTypes ) { String dimName = dimSpec.getDimension(); ColumnCapabilities capabilities = columnSelectorFactory.getColumnCapabilities(dimName); - capabilities = getEffectiveCapabilities(dimSpec, capabilities); + capabilities = getEffectiveCapabilities(dimSpec, capabilities, supportsComplexTypes); if (capabilities.is(ValueType.STRING)) { return columnSelectorFactory.makeDimensionSelector(dimSpec); } @@ -242,13 +244,19 @@ private static ColumnValueSelector getColumnValueSelectorFromDimensionSpec( */ private static ColumnCapabilities getEffectiveCapabilities( DimensionSpec dimSpec, - @Nullable ColumnCapabilities capabilities + @Nullable ColumnCapabilities capabilities, + boolean supportsComplexTypes ) { if (capabilities == null) { capabilities = DEFAULT_STRING_CAPABILITIES; } + // Complex dimension type is not supported + if (!supportsComplexTypes && capabilities.is(ValueType.COMPLEX)) { + capabilities = DEFAULT_STRING_CAPABILITIES; + } + // Currently, all extractionFns output Strings, so the column will return String values via a // DimensionSelector if an extractionFn is present. if (dimSpec.getExtractionFn() != null) { @@ -285,7 +293,7 @@ private static Strategy makeStrategy( ColumnValueSelector selector ) { - capabilities = getEffectiveCapabilities(dimSpec, capabilities); + capabilities = getEffectiveCapabilities(dimSpec, capabilities, strategyFactory.supportsNestedArraysAndComplexTypes()); return strategyFactory.makeColumnSelectorStrategy(capabilities, selector); } diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java index e3e19a172aef..58174655db11 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java @@ -9865,22 +9865,9 @@ public void testGroupByComplexColumn() .setGranularity(QueryRunnerTestHelper.ALL_GRAN) .build(); - Assert.assertEquals(Functions.>identity(), query.getLimitSpec().build(query)); - - List expectedResults = Collections.singletonList( - makeRow( - query, - "2011-04-01", - "quality_uniques", - null, - "rows", - 26L, - "idx", - 12446L - ) - ); - Iterable results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query); - TestHelper.assertExpectedObjects(expectedResults, results, "long"); + expectedException.expect(RuntimeException.class); + expectedException.expectMessage("Unable to group on the type [COMPLEX]"); + GroupByQueryRunnerTestHelper.runQuery(factory, runner, query); } @Test From f30c7c649641ef537aaaca876abd9c1ddd80cec3 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Tue, 16 Apr 2024 11:34:52 +0530 Subject: [PATCH 37/46] tests fix --- .../apache/druid/segment/MapVirtualColumnGroupByTest.java | 6 ++++-- .../druid/query/groupby/GroupByQueryRunnerTest.java | 8 ++++++-- .../FixedWidthGroupByColumnSelectorStrategyTest.java | 3 +++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/extensions-contrib/virtual-columns/src/test/java/org/apache/druid/segment/MapVirtualColumnGroupByTest.java b/extensions-contrib/virtual-columns/src/test/java/org/apache/druid/segment/MapVirtualColumnGroupByTest.java index ebbee1f8a547..e8a6b231d6f2 100644 --- a/extensions-contrib/virtual-columns/src/test/java/org/apache/druid/segment/MapVirtualColumnGroupByTest.java +++ b/extensions-contrib/virtual-columns/src/test/java/org/apache/druid/segment/MapVirtualColumnGroupByTest.java @@ -26,6 +26,7 @@ import org.apache.druid.data.input.MapBasedRow; import org.apache.druid.jackson.DefaultObjectMapper; import org.apache.druid.java.util.common.DateTimes; +import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.query.DruidProcessingConfig; @@ -118,6 +119,7 @@ public int getNumThreads() @Test public void testWithMapColumn() { + // TODO(laksh): VET, The GroupBy engine groups using the complex type, but the output is a STRING final GroupByQuery query = new GroupByQuery( new TableDataSource(QueryRunnerTestHelper.DATA_SOURCE), new MultipleIntervalSegmentSpec(ImmutableList.of(Intervals.of("2011/2012"))), @@ -134,10 +136,10 @@ public void testWithMapColumn() ); Throwable t = Assert.assertThrows( - UnsupportedOperationException.class, + IAE.class, () -> runner.run(QueryPlus.wrap(query)).toList() ); - Assert.assertEquals("Map column doesn't support getRow()", t.getMessage()); + Assert.assertEquals("Cannot find strategy for type [COMPLEX]", t.getMessage()); } diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java index 58174655db11..6e721dc81181 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java @@ -9856,17 +9856,21 @@ public void testGroupByLongColumn() @Test public void testGroupByComplexColumn() { + cannotVectorize(); GroupByQuery query = makeQueryBuilder() .setDataSource(QueryRunnerTestHelper.DATA_SOURCE) .setQuerySegmentSpec(QueryRunnerTestHelper.FIRST_TO_THIRD) - .setDimensions(new DefaultDimensionSpec("quality_uniques", "quality_uniques")) + .setDimensions(new DefaultDimensionSpec( + "quality_uniques", + "quality_uniques", + HyperUniquesAggregatorFactory.TYPE + )) .setDimFilter(new SelectorDimFilter("quality_uniques", null, null)) .setAggregatorSpecs(QueryRunnerTestHelper.ROWS_COUNT, new LongSumAggregatorFactory("idx", "index")) .setGranularity(QueryRunnerTestHelper.ALL_GRAN) .build(); expectedException.expect(RuntimeException.class); - expectedException.expectMessage("Unable to group on the type [COMPLEX]"); GroupByQueryRunnerTestHelper.runQuery(factory, runner, query); } diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java index 61e7d5ee1932..84ce80692155 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java @@ -162,6 +162,7 @@ private static void writeGroupingKeyToBuffer(final ByteBuffer buffer, @Nullable ColumnValueSelector columnValueSelector1 = Mockito.mock(ColumnValueSelector.class); Mockito.when(columnValueSelector1.getObject()).thenReturn(key); + Mockito.when(columnValueSelector1.getLong()).thenReturn(key == null ? 0 : key); Mockito.when(columnValueSelector1.isNull()).thenReturn(key == null); Assert.assertEquals(0, STRATEGY.writeToKeyBuffer(0, columnValueSelector1, buffer)); @@ -304,6 +305,7 @@ private static void writeGroupingKeyToBuffer(final ByteBuffer buffer, @Nullable ColumnValueSelector columnValueSelector1 = Mockito.mock(ColumnValueSelector.class); Mockito.when(columnValueSelector1.getObject()).thenReturn(key); + Mockito.when(columnValueSelector1.getFloat()).thenReturn(key == null ? 0.0f : key); Mockito.when(columnValueSelector1.isNull()).thenReturn(key == null); Assert.assertEquals(0, STRATEGY.writeToKeyBuffer(0, columnValueSelector1, buffer)); @@ -448,6 +450,7 @@ private static void writeGroupingKeyToBuffer(final ByteBuffer buffer, @Nullable ColumnValueSelector columnValueSelector1 = Mockito.mock(ColumnValueSelector.class); Mockito.when(columnValueSelector1.getObject()).thenReturn(key); + Mockito.when(columnValueSelector1.getDouble()).thenReturn(key == null ? 0.0d : key); Mockito.when(columnValueSelector1.isNull()).thenReturn(key == null); Assert.assertEquals(0, STRATEGY.writeToKeyBuffer(0, columnValueSelector1, buffer)); From 6e2db869543bc8aaa137b31db321128b87ecb389 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Tue, 16 Apr 2024 12:01:36 +0530 Subject: [PATCH 38/46] tests fix --- .../segment/MapVirtualColumnGroupByTest.java | 7 +++---- ...gregatorColumnSelectorStrategyFactory.java | 3 ++- .../ColumnSelectorStrategyFactory.java | 2 +- .../GroupByColumnSelectorStrategyFactory.java | 21 ++++++++++++++----- .../epinephelinae/RowBasedGrouperHelper.java | 3 ++- .../druid/query/search/SearchQueryRunner.java | 3 ++- .../TopNColumnAggregatesProcessorFactory.java | 3 ++- .../druid/segment/DimensionHandlerUtils.java | 2 +- ...idthGroupByColumnSelectorStrategyTest.java | 9 +++++--- ...lumnGroupByColumnSelectorStrategyTest.java | 3 ++- 10 files changed, 37 insertions(+), 19 deletions(-) diff --git a/extensions-contrib/virtual-columns/src/test/java/org/apache/druid/segment/MapVirtualColumnGroupByTest.java b/extensions-contrib/virtual-columns/src/test/java/org/apache/druid/segment/MapVirtualColumnGroupByTest.java index e8a6b231d6f2..aeab939f4ec8 100644 --- a/extensions-contrib/virtual-columns/src/test/java/org/apache/druid/segment/MapVirtualColumnGroupByTest.java +++ b/extensions-contrib/virtual-columns/src/test/java/org/apache/druid/segment/MapVirtualColumnGroupByTest.java @@ -24,9 +24,9 @@ import org.apache.druid.collections.StupidPool; import org.apache.druid.common.config.NullHandling; import org.apache.druid.data.input.MapBasedRow; +import org.apache.druid.error.DruidException; import org.apache.druid.jackson.DefaultObjectMapper; import org.apache.druid.java.util.common.DateTimes; -import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.query.DruidProcessingConfig; @@ -119,7 +119,6 @@ public int getNumThreads() @Test public void testWithMapColumn() { - // TODO(laksh): VET, The GroupBy engine groups using the complex type, but the output is a STRING final GroupByQuery query = new GroupByQuery( new TableDataSource(QueryRunnerTestHelper.DATA_SOURCE), new MultipleIntervalSegmentSpec(ImmutableList.of(Intervals.of("2011/2012"))), @@ -136,10 +135,10 @@ public void testWithMapColumn() ); Throwable t = Assert.assertThrows( - IAE.class, + DruidException.class, () -> runner.run(QueryPlus.wrap(query)).toList() ); - Assert.assertEquals("Cannot find strategy for type [COMPLEX]", t.getMessage()); + Assert.assertEquals("Unable to group on the column[params]", t.getMessage()); } diff --git a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/CardinalityAggregatorColumnSelectorStrategyFactory.java b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/CardinalityAggregatorColumnSelectorStrategyFactory.java index 237acc20ab29..22f33f8b6b2f 100644 --- a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/CardinalityAggregatorColumnSelectorStrategyFactory.java +++ b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/CardinalityAggregatorColumnSelectorStrategyFactory.java @@ -30,7 +30,8 @@ public class CardinalityAggregatorColumnSelectorStrategyFactory @Override public CardinalityAggregatorColumnSelectorStrategy makeColumnSelectorStrategy( ColumnCapabilities capabilities, - ColumnValueSelector selector + ColumnValueSelector selector, + String dimension ) { switch (capabilities.getType()) { diff --git a/processing/src/main/java/org/apache/druid/query/dimension/ColumnSelectorStrategyFactory.java b/processing/src/main/java/org/apache/druid/query/dimension/ColumnSelectorStrategyFactory.java index ba36b8085dfb..59384f6d8a65 100644 --- a/processing/src/main/java/org/apache/druid/query/dimension/ColumnSelectorStrategyFactory.java +++ b/processing/src/main/java/org/apache/druid/query/dimension/ColumnSelectorStrategyFactory.java @@ -24,7 +24,7 @@ public interface ColumnSelectorStrategyFactory { - ColumnSelectorStrategyClass makeColumnSelectorStrategy(ColumnCapabilities capabilities, ColumnValueSelector selector); + ColumnSelectorStrategyClass makeColumnSelectorStrategy(ColumnCapabilities capabilities, ColumnValueSelector selector, String dimension); // TODO(laksh): Javadoc boolean supportsNestedArraysAndComplexTypes(); diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java index b46b8f753a4e..f38ef2fb1c71 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java @@ -45,17 +45,28 @@ public class GroupByColumnSelectorStrategyFactory implements ColumnSelectorStrat @Override public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( ColumnCapabilities capabilities, - ColumnValueSelector selector + ColumnValueSelector selector, + String dimension ) { if (capabilities == null || capabilities.getType() == null) { throw DruidException.defensive("Unable to deduce type for the grouping dimension"); } - if (!capabilities.toColumnType().getNullableStrategy().groupable()) { - // InvalidInput because the SQL planner would have already flagged these dimensions, therefore this will only happen - // if native queries have been submitted. - throw InvalidInput.exception("Unable to group on the type [%s]", capabilities.toColumnType()); + try { + if (!capabilities.toColumnType().getNullableStrategy().groupable()) { + // InvalidInput because the SQL planner would have already flagged these dimensions, therefore this will only happen + // if native queries have been submitted. + throw InvalidInput.exception( + "Unable to group on the column[%s] with type[%s]", + dimension, + capabilities.toColumnType() + ); + } + } + catch (Exception e) { + throw InvalidInput.exception(e, "Unable to group on the column[%s]", dimension); } + switch (capabilities.getType()) { case STRING: return KeyMappingMultiValueGroupByColumnSelectorStrategy.create(capabilities, (DimensionSelector) selector); diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java index 4edc0d87d62e..4a0bfdda19a4 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java @@ -746,7 +746,8 @@ private static class InputRawSupplierColumnSelectorStrategyFactory @Override public InputRawSupplierColumnSelectorStrategy makeColumnSelectorStrategy( ColumnCapabilities capabilities, - ColumnValueSelector selector + ColumnValueSelector selector, + String dimension ) { switch (capabilities.getType()) { diff --git a/processing/src/main/java/org/apache/druid/query/search/SearchQueryRunner.java b/processing/src/main/java/org/apache/druid/query/search/SearchQueryRunner.java index 449219c48a8e..fbfa4cb6b7fa 100644 --- a/processing/src/main/java/org/apache/druid/query/search/SearchQueryRunner.java +++ b/processing/src/main/java/org/apache/druid/query/search/SearchQueryRunner.java @@ -68,7 +68,8 @@ private static class SearchColumnSelectorStrategyFactory @Override public SearchColumnSelectorStrategy makeColumnSelectorStrategy( ColumnCapabilities capabilities, - ColumnValueSelector selector + ColumnValueSelector selector, + String dimension ) { switch (capabilities.getType()) { diff --git a/processing/src/main/java/org/apache/druid/query/topn/types/TopNColumnAggregatesProcessorFactory.java b/processing/src/main/java/org/apache/druid/query/topn/types/TopNColumnAggregatesProcessorFactory.java index cbbae69ec22c..472d58ca6605 100644 --- a/processing/src/main/java/org/apache/druid/query/topn/types/TopNColumnAggregatesProcessorFactory.java +++ b/processing/src/main/java/org/apache/druid/query/topn/types/TopNColumnAggregatesProcessorFactory.java @@ -43,7 +43,8 @@ public TopNColumnAggregatesProcessorFactory(final ColumnType dimensionType) @Override public TopNColumnAggregatesProcessor makeColumnSelectorStrategy( ColumnCapabilities capabilities, - ColumnValueSelector selector + ColumnValueSelector selector, + String dimension ) { if (capabilities.is(ValueType.STRING)) { diff --git a/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java b/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java index 5ae916276a2e..15c287bc73a4 100644 --- a/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java +++ b/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java @@ -294,7 +294,7 @@ private static Strategy makeStrategy( ) { capabilities = getEffectiveCapabilities(dimSpec, capabilities, strategyFactory.supportsNestedArraysAndComplexTypes()); - return strategyFactory.makeColumnSelectorStrategy(capabilities, selector); + return strategyFactory.makeColumnSelectorStrategy(capabilities, selector, dimSpec.getDimension()); } @Nullable diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java index 84ce80692155..565fb8b4e81d 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java @@ -57,7 +57,8 @@ public static class LongGroupByColumnSelectorStrategyTest private static final GroupByColumnSelectorStrategy STRATEGY = STRATEGY_FACTORY.makeColumnSelectorStrategy( createCursor().getColumnSelectorFactory().getColumnCapabilities(LONG_COLUMN), - createCursor().getColumnSelectorFactory().makeColumnValueSelector(LONG_COLUMN) + createCursor().getColumnSelectorFactory().makeColumnValueSelector(LONG_COLUMN), + "dimension" ); @Test @@ -202,7 +203,8 @@ public static class FloatGroupByColumnSelectorStrategyTest private static final GroupByColumnSelectorStrategy STRATEGY = STRATEGY_FACTORY.makeColumnSelectorStrategy( createCursor().getColumnSelectorFactory().getColumnCapabilities(FLOAT_COLUMN), - createCursor().getColumnSelectorFactory().makeColumnValueSelector(FLOAT_COLUMN) + createCursor().getColumnSelectorFactory().makeColumnValueSelector(FLOAT_COLUMN), + "dimension" ); @Test @@ -345,7 +347,8 @@ public static class DoubleGroupByColumnSelectorStrategyTest private static final GroupByColumnSelectorStrategy STRATEGY = STRATEGY_FACTORY.makeColumnSelectorStrategy( createCursor().getColumnSelectorFactory().getColumnCapabilities(DOUBLE_COLUMN), - createCursor().getColumnSelectorFactory().makeColumnValueSelector(DOUBLE_COLUMN) + createCursor().getColumnSelectorFactory().makeColumnValueSelector(DOUBLE_COLUMN), + "dimension" ); @Test diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java index 89ddb94c1c09..8ee82be7e538 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java @@ -150,7 +150,8 @@ private static GroupByColumnSelectorStrategy createStrategy() { return STRATEGY_FACTORY.makeColumnSelectorStrategy( createCursor().getColumnSelectorFactory().getColumnCapabilities(NESTED_COLUMN), - createCursor().getColumnSelectorFactory().makeColumnValueSelector(NESTED_COLUMN) + createCursor().getColumnSelectorFactory().makeColumnValueSelector(NESTED_COLUMN), + "dimension" ); } From a3a01452fdaee36d91de0c841d61d1886fc6671d Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Wed, 17 Apr 2024 10:17:58 +0530 Subject: [PATCH 39/46] tests fix --- .../druid/common/config/NullHandling.java | 27 ++++++++++++ ...BuildingGroupByColumnSelectorStrategy.java | 3 ++ ...yMappingGroupByColumnSelectorStrategy.java | 7 +++- ...idthGroupByColumnSelectorStrategyTest.java | 16 +++++--- ...lumnGroupByColumnSelectorStrategyTest.java | 41 +++++++++++++------ .../calcite/CalciteNestedDataQueryTest.java | 2 +- 6 files changed, 74 insertions(+), 22 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/common/config/NullHandling.java b/processing/src/main/java/org/apache/druid/common/config/NullHandling.java index 7b4722052d47..747512cece53 100644 --- a/processing/src/main/java/org/apache/druid/common/config/NullHandling.java +++ b/processing/src/main/java/org/apache/druid/common/config/NullHandling.java @@ -142,6 +142,33 @@ public static String nullToEmptyIfNeeded(@Nullable String value) //CHECKSTYLE.ON: Regexp } + @Nullable + public static Long nullToEmptyIfNeeded(@Nullable Long value) + { + if (replaceWithDefault() && value == null) { + return defaultLongValue(); + } + return value; + } + + @Nullable + public static Float nullToEmptyIfNeeded(@Nullable Float value) + { + if (replaceWithDefault() && value == null) { + return defaultFloatValue(); + } + return value; + } + + @Nullable + public static Double nullToEmptyIfNeeded(@Nullable Double value) + { + if (replaceWithDefault() && value == null) { + return defaultDoubleValue(); + } + return value; + } + @Nullable public static String emptyToNullIfNeeded(@Nullable String value) { diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java index ea57cffd2d2f..f89efeaf2f9b 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java @@ -176,6 +176,9 @@ public DictionaryIdToDimensionConverter(List dictionary) @Override public DimensionType idToKey(int id) { + if (id >= dictionary.size()) { + throw DruidException.defensive("Unknown dictionary id"); + } // No need to handle GROUP_BY_MISSING_VALUE, by contract return dictionary.get(id); } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java index 9373cb6018b4..fc0a70efbaa7 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java @@ -210,8 +210,11 @@ public Grouper.BufferComparator bufferComparator(int keyBufferPosition, @Nullabl ); } else { return (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> { - Object lhsObject = idToDimensionConverter.idToKey(lhsBuffer.getInt(lhsPosition + keyBufferPosition)); - Object rhsObject = idToDimensionConverter.idToKey(rhsBuffer.getInt(rhsPosition + keyBufferPosition)); + int lhsDictId = lhsBuffer.getInt(lhsPosition + keyBufferPosition); + int rhsDictId = rhsBuffer.getInt(rhsPosition + keyBufferPosition); + + Object lhsObject = lhsDictId == GROUP_BY_MISSING_VALUE ? null : idToDimensionConverter.idToKey(lhsDictId); + Object rhsObject = rhsDictId == GROUP_BY_MISSING_VALUE ? null : idToDimensionConverter.idToKey(rhsDictId); if (usesNaturalComparator) { return nullableTypeStrategy.compare( (DimensionType) DimensionHandlerUtils.convertObjectToType(lhsObject, columnType), diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java index 565fb8b4e81d..d427cb88bef6 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/FixedWidthGroupByColumnSelectorStrategyTest.java @@ -20,6 +20,7 @@ package org.apache.druid.query.groupby.epinephelinae.column; import com.google.common.collect.ImmutableList; +import org.apache.druid.common.config.NullHandling; import org.apache.druid.query.IterableRowsCursorHelper; import org.apache.druid.query.groupby.ResultRow; import org.apache.druid.query.groupby.epinephelinae.GroupByColumnSelectorStrategyFactory; @@ -83,7 +84,7 @@ public void testWriteToKeyBuffer() STRATEGY.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); // There shouldn't be any internal size increase associated with the fixed width types Assert.assertEquals(0, sizeIncrease); - Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[0], resultRow.get(0)); + Assert.assertEquals(NullHandling.nullToEmptyIfNeeded((Long) DATASOURCE_ROWS.get(rowNum)[0]), resultRow.get(0)); cursor.advance(); ++rowNum; } @@ -100,7 +101,7 @@ public void testInitColumnValues() while (!cursor.isDone()) { int sizeIncrease = STRATEGY.initColumnValues(columnValueSelector, 0, valuess); Assert.assertEquals(0, sizeIncrease); - Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[0], valuess[0]); + Assert.assertEquals(NullHandling.nullToEmptyIfNeeded((Long) DATASOURCE_ROWS.get(rowNum)[0]), valuess[0]); cursor.advance(); ++rowNum; } @@ -227,7 +228,7 @@ public void testWriteToKeyBuffer() int sizeIncrease = STRATEGY.writeToKeyBuffer(0, columnValueSelector, BUFFER1); STRATEGY.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); Assert.assertEquals(0, sizeIncrease); - Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[1], resultRow.get(0)); + Assert.assertEquals(NullHandling.nullToEmptyIfNeeded((Float) DATASOURCE_ROWS.get(rowNum)[1]), resultRow.get(0)); cursor.advance(); ++rowNum; } @@ -244,7 +245,7 @@ public void testInitColumnValues() while (!cursor.isDone()) { int sizeIncrease = STRATEGY.initColumnValues(columnValueSelector, 0, valuess); Assert.assertEquals(0, sizeIncrease); - Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[1], valuess[0]); + Assert.assertEquals(NullHandling.nullToEmptyIfNeeded((Float) DATASOURCE_ROWS.get(rowNum)[1]), valuess[0]); cursor.advance(); ++rowNum; } @@ -372,7 +373,10 @@ public void testWriteToKeyBuffer() int sizeIncrease = STRATEGY.writeToKeyBuffer(0, columnValueSelector, BUFFER1); STRATEGY.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); Assert.assertEquals(0, sizeIncrease); - Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[2], resultRow.get(0)); + Assert.assertEquals( + NullHandling.nullToEmptyIfNeeded((Double) DATASOURCE_ROWS.get(rowNum)[2]), + resultRow.get(0) + ); cursor.advance(); ++rowNum; } @@ -390,7 +394,7 @@ public void testInitColumnValues() while (!cursor.isDone()) { int sizeIncrease = STRATEGY.initColumnValues(columnValueSelector, 0, valuess); Assert.assertEquals(0, sizeIncrease); - Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[2], valuess[0]); + Assert.assertEquals(NullHandling.nullToEmptyIfNeeded((Double) DATASOURCE_ROWS.get(rowNum)[2]), valuess[0]); cursor.advance(); ++rowNum; } diff --git a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java index 8ee82be7e538..a35432cc80e3 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/epinephelinae/column/NestedColumnGroupByColumnSelectorStrategyTest.java @@ -21,6 +21,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import org.apache.druid.guice.NestedDataModule; import org.apache.druid.query.IterableRowsCursorHelper; import org.apache.druid.query.groupby.ResultRow; import org.apache.druid.query.groupby.epinephelinae.GroupByColumnSelectorStrategyFactory; @@ -42,6 +43,10 @@ */ public class NestedColumnGroupByColumnSelectorStrategyTest extends InitializedNullHandlingTest { + static { + NestedDataModule.registerHandlersAndSerde(); + } + private static final GroupByColumnSelectorStrategyFactory STRATEGY_FACTORY = new GroupByColumnSelectorStrategyFactory(); // No datasource would exist like this, however the inline datasource is an easy way to create the required column value selectors @@ -52,6 +57,10 @@ public class NestedColumnGroupByColumnSelectorStrategyTest extends InitializedNu new Object[]{StructuredData.wrap("hello")} ); + // Dictionary ids alloted to each object, in the column-0 of the DATASOURCE_ROWS, when building from scratch. + // null's dictionary id would be -1 + private static final int[] DICT_IDS = new int[]{0, 1, -1, 2}; + private static final String NESTED_COLUMN = "nested"; /** * Row with null value in the column @@ -72,13 +81,20 @@ public void testInitColumnValues() GroupByColumnSelectorStrategy strategy = createStrategy(); Cursor cursor = createCursor(); ColumnValueSelector columnValueSelector = cursor.getColumnSelectorFactory().makeColumnValueSelector(NESTED_COLUMN); + GroupByColumnSelectorPlus groupByColumnSelectorPlus = Mockito.mock(GroupByColumnSelectorPlus.class); + Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); Object[] valuess = new Object[1]; int rowNum = 0; while (!cursor.isDone()) { int sz = strategy.initColumnValues(columnValueSelector, 0, valuess); - // While adding the values for the first time, the initialisation should have a non-zero footprint - Assert.assertTrue(sz > 0); - Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[0], valuess[0]); + // While adding the values for the first time, the initialisation should have a non-zero footprint, apart from the + // row with the null value + if (DATASOURCE_ROWS.get(rowNum)[0] == null) { + Assert.assertEquals(0, sz); + } else { + Assert.assertTrue(sz > 0); + } + Assert.assertEquals(DICT_IDS[rowNum], valuess[0]); cursor.advance(); ++rowNum; @@ -91,7 +107,7 @@ public void testInitColumnValues() int sz = strategy.initColumnValues(columnValueSelector, 0, valuess); // While adding the values for the first time, the initialisation should have a non-zero footprint Assert.assertEquals(0, sz); - Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[0], valuess[0]); + Assert.assertEquals(DICT_IDS[rowNum], valuess[0]); cursor.advance(); ++rowNum; @@ -111,9 +127,13 @@ public void testWriteToKeyBuffer() int rowNum = 0; while (!cursor.isDone()) { int sz = strategy.writeToKeyBuffer(0, columnValueSelector, BUFFER1); - Assert.assertTrue(sz > 0); + if (DATASOURCE_ROWS.get(rowNum)[0] == null) { + Assert.assertEquals(0, sz); + } else { + Assert.assertTrue(sz > 0); + } // null is represented by GROUP_BY_MISSING_VALUE on the buffer, even though it gets its own dictionaryId in the dictionary - Assert.assertEquals(rowNum == NULL_ROW_NUMBER ? -1 : rowNum, BUFFER1.getInt(0)); + Assert.assertEquals(DICT_IDS[rowNum], BUFFER1.getInt(0)); // Readback the value strategy.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); Assert.assertEquals(DATASOURCE_ROWS.get(rowNum)[0], resultRow.get(0)); @@ -131,14 +151,9 @@ public void testInitGroupingKeyColumnValue() Mockito.when(groupByColumnSelectorPlus.getResultRowPosition()).thenReturn(0); int[] stack = new int[1]; ResultRow resultRow = ResultRow.create(1); - Object obj = StructuredData.wrap(ImmutableList.of("x", "y", "z")); - - strategy.initGroupingKeyColumnValue(0, 0, obj, BUFFER1, stack); - Assert.assertEquals(1, stack[0]); - strategy.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); - Assert.assertEquals(obj, resultRow.get(0)); - strategy.initGroupingKeyColumnValue(0, 0, null, BUFFER1, stack); + // Test nulls + strategy.initGroupingKeyColumnValue(0, 0, -1, BUFFER1, stack); Assert.assertEquals(0, stack[0]); strategy.processValueFromGroupingKey(groupByColumnSelectorPlus, BUFFER1, resultRow, 0); Assert.assertNull(resultRow.get(0)); diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java index 4bec9e16eb9d..a1dafb5bf871 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java @@ -539,7 +539,7 @@ public void testTopNPath() } @Test - public void testGroupByNested() + public void testGroupByOnNestedColumn() { cannotVectorize(); testQuery( From e95731cb283694a4d6a82c2378892afbb4c52de0 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Thu, 18 Apr 2024 16:33:18 +0530 Subject: [PATCH 40/46] review 1, delete bogus class --- .../druid/frame/write/FrameWriterUtils.java | 5 - .../GroupByColumnSelectorStrategyFactory.java | 1 + ...BuildingGroupByColumnSelectorStrategy.java | 188 ++++++++++++------ ...onConverter.java => DimensionIdCodec.java} | 38 ++-- .../column/DimensionToIdConverter.java | 38 ---- ...yMappingGroupByColumnSelectorStrategy.java | 31 ++- .../segment/data/ComparableIntArray.java | 118 ----------- .../segment/data/ComparableIntArrayTest.java | 70 ------- 8 files changed, 165 insertions(+), 324 deletions(-) rename processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/{IdToDimensionConverter.java => DimensionIdCodec.java} (51%) delete mode 100644 processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java delete mode 100644 processing/src/main/java/org/apache/druid/segment/data/ComparableIntArray.java delete mode 100644 processing/src/test/java/org/apache/druid/segment/data/ComparableIntArrayTest.java diff --git a/processing/src/main/java/org/apache/druid/frame/write/FrameWriterUtils.java b/processing/src/main/java/org/apache/druid/frame/write/FrameWriterUtils.java index 864a9cc183e3..0bb78b2109b7 100644 --- a/processing/src/main/java/org/apache/druid/frame/write/FrameWriterUtils.java +++ b/processing/src/main/java/org/apache/druid/frame/write/FrameWriterUtils.java @@ -33,7 +33,6 @@ import org.apache.druid.segment.DimensionSelector; import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.column.RowSignature; -import org.apache.druid.segment.data.ComparableIntArray; import org.apache.druid.segment.data.IndexedInts; import javax.annotation.Nullable; @@ -195,10 +194,6 @@ public static List getNumericArrayFromObject(Object row) for (Object value : (Object[]) row) { retVal.add((Number) value); } - } else if (row instanceof ComparableIntArray) { - for (int value : ((ComparableIntArray) row).getDelegate()) { - retVal.add(value); - } } else { throw new ISE("Unexpected type %s found", row.getClass().getName()); } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java index f38ef2fb1c71..00e5ea08308d 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java @@ -53,6 +53,7 @@ public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( throw DruidException.defensive("Unable to deduce type for the grouping dimension"); } try { + // TODO(laksh): Check if the .getNullableStrategy() works, and doesn't throw if (!capabilities.toColumnType().getNullableStrategy().groupable()) { // InvalidInput because the SQL planner would have already flagged these dimensions, therefore this will only happen // if native queries have been submitted. diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java index f89efeaf2f9b..0b6b00106862 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java @@ -19,6 +19,8 @@ package org.apache.druid.query.groupby.epinephelinae.column; +import com.google.common.collect.BiMap; +import com.google.common.collect.HashBiMap; import it.unimi.dsi.fastutil.objects.Object2IntMap; import org.apache.druid.error.DruidException; import org.apache.druid.query.groupby.epinephelinae.DictionaryBuildingUtils; @@ -26,6 +28,7 @@ import org.apache.druid.segment.column.NullableTypeStrategy; import javax.annotation.concurrent.NotThreadSafe; +import java.util.ArrayList; import java.util.List; /** @@ -43,36 +46,14 @@ public class DictionaryBuildingGroupByColumnSelectorStrategy extends KeyMappingGroupByColumnSelectorStrategy { - /** - * Dictionary for mapping the dimension value to an index. i-th position in the dictionary holds the value represented - * by the dictionaryId "i". - * Therefore, if a value has a dictionary id "i", dictionary.get(i) = value - */ - private final List dictionary; - - /** - * Reverse dictionary for faster lookup into the dictionary, and reusing pre-existing dictionary ids. - *

- * An entry of form (value, i) in the reverse dictionary represents that "value" is present at the i-th location in the - * {@link #dictionary}. - * Absence of mapping of a "value" (denoted by returning {@link GroupByColumnSelectorStrategy#GROUP_BY_MISSING_VALUE}) - * represents that the value is absent in the dictionary - */ - private final Object2IntMap reverseDictionary; - private DictionaryBuildingGroupByColumnSelectorStrategy( - DimensionToIdConverter dimensionToIdConverter, + DimensionIdCodec dimensionIdCodec, ColumnType columnType, NullableTypeStrategy nullableTypeStrategy, - DimensionType defaultValue, - IdToDimensionConverter idToDimensionConverter, - List dictionary, - Object2IntMap reverseDictionary + DimensionType defaultValue ) { - super(dimensionToIdConverter, columnType, nullableTypeStrategy, defaultValue, idToDimensionConverter); - this.dictionary = dictionary; - this.reverseDictionary = reverseDictionary; + super(dimensionIdCodec, columnType, nullableTypeStrategy, defaultValue); } /** @@ -91,6 +72,10 @@ public static GroupByColumnSelectorStrategy forType(final ColumnType columnType) throw DruidException.defensive("Could used a fixed width strategy"); } + if (ColumnType.STRING_ARRAY.equals(columnType)) { + forStringArrays(); + } + // Catch-all for all other types, that can only have single-valued dimensions return forArrayAndComplexTypes(columnType); } @@ -111,24 +96,48 @@ private static GroupByColumnSelectorStrategy forArrayAndComplexTypes(final Colum final Object2IntMap reverseDictionary = DictionaryBuildingUtils.createReverseDictionary(columnType.getNullableStrategy()); return new DictionaryBuildingGroupByColumnSelectorStrategy<>( - new UniValueDimensionToIdConverter(dictionary, reverseDictionary, columnType.getNullableStrategy()), + new UniValueDimensionIdCodec(dictionary, reverseDictionary, columnType.getNullableStrategy()), columnType, columnType.getNullableStrategy(), - null, - new DictionaryIdToDimensionConverter<>(dictionary), - dictionary, - reverseDictionary + null + ); + } + + private static GroupByColumnSelectorStrategy forStringArrays() + { + final BiMap elementBiDictionary = HashBiMap.create(); + final BiMap, Integer> arrayBiDictionary = HashBiMap.create(); + return new DictionaryBuildingGroupByColumnSelectorStrategy<>( + new StringArrayDimensionIdCodec(elementBiDictionary, arrayBiDictionary), + ColumnType.STRING_ARRAY, + ColumnType.STRING_ARRAY.getNullableStrategy(), + null ); } - private static class UniValueDimensionToIdConverter implements DimensionToIdConverter + private static class UniValueDimensionIdCodec implements DimensionIdCodec { + /** + * Dictionary for mapping the dimension value to an index. i-th position in the dictionary holds the value represented + * by the dictionaryId "i". + * Therefore, if a value has a dictionary id "i", dictionary.get(i) = value + */ private final List dictionary; + + /** + * Reverse dictionary for faster lookup into the dictionary, and reusing pre-existing dictionary ids. + *

+ * An entry of form (value, i) in the reverse dictionary represents that "value" is present at the i-th location in the + * {@link #dictionary}. + * Absence of mapping of a "value" (denoted by returning {@link GroupByColumnSelectorStrategy#GROUP_BY_MISSING_VALUE}) + * represents that the value is absent in the dictionary + */ private final Object2IntMap reverseDictionary; + @SuppressWarnings("rawtypes") private final NullableTypeStrategy nullableTypeStrategy; - public UniValueDimensionToIdConverter( + public UniValueDimensionIdCodec( final List dictionary, final Object2IntMap reverseDictionary, final NullableTypeStrategy nullableTypeStrategy @@ -140,64 +149,129 @@ public UniValueDimensionToIdConverter( } @Override - public MemoryEstimate lookupId(Object multiValueHolder) + public MemoryEstimate lookupId(Object dimension) { - int dictId = reverseDictionary.getInt(multiValueHolder); + int dictId = reverseDictionary.getInt(dimension); int footprintIncrease = 0; // Even if called again, then this is no-op if (dictId < 0) { final int size = dictionary.size(); - dictionary.add(multiValueHolder); - reverseDictionary.put(multiValueHolder, size); + dictionary.add(dimension); + reverseDictionary.put(dimension, size); dictId = size; // MultiValueHOlder is always expected to handle the type, once the coercion is complete //noinspection unchecked footprintIncrease = DictionaryBuildingUtils.estimateEntryFootprint( - nullableTypeStrategy.estimateSizeBytes(multiValueHolder) + nullableTypeStrategy.estimateSizeBytes(dimension) ); } return new MemoryEstimate<>(dictId, footprintIncrease); + } + @Override + public Object idToKey(int id) + { + if (id >= dictionary.size()) { + throw DruidException.defensive("Unknown dictionary id [%d]", id); + } + // No need to handle GROUP_BY_MISSING_VALUE, by contract + return dictionary.get(id); + } + + @Override + public boolean canCompareIds() + { + // Dictionaries are built on the fly, and ids are assigned in the order in which the value is added to the + // dictionary. + return false; + } + + @Override + public void reset() + { + dictionary.clear(); + reverseDictionary.clear(); } } /** - * Defers to the dictionary we have built to decode the dictionary id + * {@link DimensionIdCodec} for string arrays. Dictionary building for string arrays is optimised to have a dual + * dictionary - one that maps the string values to an id, and another which maps an array of these ids, to the returned + * dictionary id. This reduces the amount of heap memory required to build the dictionaries */ - private static class DictionaryIdToDimensionConverter implements IdToDimensionConverter + private static class StringArrayDimensionIdCodec implements DimensionIdCodec { - private final List dictionary; + // contains string <-> id for each element of the multi value grouping column + // for eg : [a,b,c] is the col value. dictionaryToInt will contain { a <-> 1, b <-> 2, c <-> 3} + private final BiMap elementBiDictionary; - public DictionaryIdToDimensionConverter(List dictionary) + // stores each row as an integer array where the int represents the value in dictionaryToInt + // for eg : [a,b,c] would be converted to [1,2,3] and assigned a integer value 1. + // [1,2,3] <-> 1 + private final BiMap, Integer> arrayBiDictionary; + + public StringArrayDimensionIdCodec( + BiMap elementBiDictionary, + BiMap, Integer> arrayBiDictionary + ) { - this.dictionary = dictionary; + this.elementBiDictionary = elementBiDictionary; + this.arrayBiDictionary = arrayBiDictionary; } @Override - public DimensionType idToKey(int id) + public MemoryEstimate lookupId(Object dimension) { - if (id >= dictionary.size()) { - throw DruidException.defensive("Unknown dictionary id"); + // dimension IS non-null, by contract of this method + Object[] stringArray = (Object[]) dimension; + ArrayList dictionaryEncodedStringArray = new ArrayList<>(); + int estimatedFootprint = 0; + for (Object element : stringArray) { + String elementCasted = (String) element; + Integer elementDictId = elementBiDictionary.get(elementCasted); + if (elementDictId == null) { + elementDictId = elementBiDictionary.size(); + elementBiDictionary.put(elementCasted, elementDictId); + // We're not using the dictionary and reverseDictionary from DictionaryBuilding, but the BiMap is close enough + // that we expect this footprint calculation to still be useful. + estimatedFootprint += + DictionaryBuildingUtils.estimateEntryFootprint(elementCasted == null ? 0 : elementCasted.length() * Character.BYTES); + } + dictionaryEncodedStringArray.add(elementDictId); } - // No need to handle GROUP_BY_MISSING_VALUE, by contract - return dictionary.get(id); + + Integer arrayDictId = arrayBiDictionary.get(dictionaryEncodedStringArray); + if (arrayDictId == null) { + arrayDictId = arrayBiDictionary.size(); + arrayBiDictionary.put(dictionaryEncodedStringArray, arrayDictId); + estimatedFootprint += + DictionaryBuildingUtils.estimateEntryFootprint(dictionaryEncodedStringArray.size() * Integer.BYTES); + } + return new MemoryEstimate<>(arrayDictId, estimatedFootprint); + } + + @Override + public Object idToKey(int id) + { + ArrayList dictionaryEncodedStringArray = arrayBiDictionary.inverse().get(id); + final Object[] stringRepresentation = new Object[dictionaryEncodedStringArray.size()]; + for (int i = 0; i < dictionaryEncodedStringArray.size(); ++i) { + stringRepresentation[i] = elementBiDictionary.inverse().get(dictionaryEncodedStringArray.get(i)); + } + return stringRepresentation; } @Override public boolean canCompareIds() { - // Dictionaries are built on the fly, and ids are assigned in the order in which the value is added to the - // dictionary. return false; } - } - @Override - public void reset() - { - super.reset(); - // Clean up the dictionaries - dictionary.clear(); - reverseDictionary.clear(); + @Override + public void reset() + { + arrayBiDictionary.clear(); + elementBiDictionary.clear(); + } } } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionIdCodec.java similarity index 51% rename from processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java rename to processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionIdCodec.java index 0e306277679b..f556c62063df 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/IdToDimensionConverter.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionIdCodec.java @@ -20,29 +20,29 @@ package org.apache.druid.query.groupby.epinephelinae.column; /** - * Converts back the dictionaryId to the dimension value. The implementations might or might not handle - * {@link GroupByColumnSelectorStrategy#GROUP_BY_MISSING_VALUE}. The callers should handle those values appropriately on - * their own, and filter those out before trying to convert the dictionary id back to value. + * Dimension to integer id encoder - decoder i.e. it is an interface for converters of dimension to dictionary id and back. + * It only handles single value dimensions. Handle multi-value dimensions (i.e. strings) using the {@link KeyMappingMultiValueGroupByColumnSelectorStrategy}. + *

+ * Encoding
+ * The caller is expected to handle non-null values. Null values must be filtered by the caller, and assigned {@link GroupByColumnSelectorStrategy#GROUP_BY_MISSING_VALUE} + * 1. {@link DimensionIdCodec} extracts the dimension from the selector + * 2. The value gets encoded into a dictionaryId, using {@link DimensionIdCodec#lookupId} + * 3. The callers can use this integer dictionaryID to represent the grouping key + *

+ * Decoding
+ * Converts back the dictionaryId to the dimension value. The implementations are not expected to handle {@link GroupByColumnSelectorStrategy#GROUP_BY_MISSING_VALUE}. + * The callers should handle those values appropriately ontheir own, and filter those out before trying to convert + * the dictionary id back to value. * - * The encoding - decoding workflow looks like: - * - * Encoding - * 1. {@link DimensionToIdConverter} extracts the multi-value holder for the given row, which get's stored somewhere - * 2. For each entry in the multi-value object, the value gets encoded into a dictionaryId, using {@link DimensionToIdConverter#lookupId} - * 3. The callers can use this integer dictionaryID to materialize the results somewhere - * - * Decoding - * 1. The materialized dictionary id is deserialized back to an int, and then decoded into value using {@link #idToKey} - * - * @see DimensionToIdConverter for converting the dimensions to dictionary ids - * - * @param Type of the dimension's values + * @param Type of the dimension holder */ -public interface IdToDimensionConverter +public interface DimensionIdCodec { /** - * Decodes the dictionaryId back to the dimensionKey + * @return DictionaryId of the object at the given index and the memory increase associated with it */ + MemoryEstimate lookupId(DimensionType dimension); + DimensionType idToKey(int id); /** @@ -53,4 +53,6 @@ public interface IdToDimensionConverter * Ids backed by dictionaries built on the fly can never be compared, therefore those should always return false. */ boolean canCompareIds(); + + void reset(); } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java deleted file mode 100644 index 56a6c159376a..000000000000 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionToIdConverter.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.query.groupby.epinephelinae.column; - -/** - * Interface for converters of dimension to dictionary id. - * - * It only handles single-value dimensions. Handle multi-value dimensions (i.e. strings) using the - * {@link KeyMappingMultiValueGroupByColumnSelectorStrategy} - * - * @see IdToDimensionConverter for converting the dictionary values back to dimensions - * - * @param Type of the dimension holder - */ -public interface DimensionToIdConverter -{ - /** - * @return DictionaryId of the object at the given index and the memory increase associated with it - */ - MemoryEstimate lookupId(DimensionType multiValueHolder); -} diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java index fc0a70efbaa7..dcfdef865199 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java @@ -45,7 +45,7 @@ * {@code null} can be represented by either -1 or the position of null in the dictionary it was stored when it was * encountered. This is fine, because most of the time, the dictionary id has no value of its own, and is converted back to * the value it represents, before doing further operations. The only place where it would matter would be when - * {@link IdToDimensionConverter#canCompareIds()} is true, and we compare directly on the dictionary ids for prebuilt + * {@link DimensionIdCodec#canCompareIds()} is true, and we compare directly on the dictionary ids for prebuilt * dictionaries (we can't compare ids for the dictionaries built on the fly in the grouping strategy). However, in that case, * it is guaranteed that the dictionaryId of null represented by the pre-built dictionary would be the lowest (most likely 0) * and therefore nulls (-1) would be adjacent to nulls (represented by the lowest non-negative dictionary id), and would get @@ -58,8 +58,7 @@ * and the string primitives are handled by the {@link KeyMappingMultiValueGroupByColumnSelectorStrategy} * * @param > Class of the dimension - * @see DimensionToIdConverter encoding logic for converting value to dictionary - * @see IdToDimensionConverter decoding logic for converting back dictionary to value + * @see DimensionIdCodec encoding decoding logic for converting value to dictionary */ @NotThreadSafe class KeyMappingGroupByColumnSelectorStrategy implements GroupByColumnSelectorStrategy @@ -67,7 +66,7 @@ class KeyMappingGroupByColumnSelectorStrategy implements GroupByC /** * Converts the dimension to equivalent dictionaryId. */ - final DimensionToIdConverter dimensionToIdConverter; + final DimensionIdCodec dimensionIdCodec; /** * Type of the dimension on which the grouping strategy is used @@ -84,21 +83,17 @@ class KeyMappingGroupByColumnSelectorStrategy implements GroupByC */ final DimensionType defaultValue; - final IdToDimensionConverter idToDimensionConverter; - KeyMappingGroupByColumnSelectorStrategy( - final DimensionToIdConverter dimensionToIdConverter, + final DimensionIdCodec dimensionIdCodec, final ColumnType columnType, final NullableTypeStrategy nullableTypeStrategy, - final DimensionType defaultValue, - final IdToDimensionConverter idToDimensionConverter + final DimensionType defaultValue ) { - this.dimensionToIdConverter = dimensionToIdConverter; + this.dimensionIdCodec = dimensionIdCodec; this.columnType = columnType; this.nullableTypeStrategy = nullableTypeStrategy; this.defaultValue = defaultValue; - this.idToDimensionConverter = idToDimensionConverter; } /** @@ -120,7 +115,7 @@ public void processValueFromGroupingKey( { final int id = key.getInt(keyBufferPosition); if (id != GROUP_BY_MISSING_VALUE) { - resultRow.set(selectorPlus.getResultRowPosition(), idToDimensionConverter.idToKey(id)); + resultRow.set(selectorPlus.getResultRowPosition(), dimensionIdCodec.idToKey(id)); } else { resultRow.set(selectorPlus.getResultRowPosition(), defaultValue); } @@ -138,7 +133,7 @@ public int initColumnValues(ColumnValueSelector selector, int columnIndex, Objec valuess[columnIndex] = GROUP_BY_MISSING_VALUE; return 0; } else { - MemoryEstimate idAndMemoryEstimate = dimensionToIdConverter.lookupId(value); + MemoryEstimate idAndMemoryEstimate = dimensionIdCodec.lookupId(value); valuess[columnIndex] = idAndMemoryEstimate.value(); return idAndMemoryEstimate.memoryIncrease(); } @@ -190,7 +185,7 @@ public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, keyBuffer.putInt(keyBufferPosition, GROUP_BY_MISSING_VALUE); return 0; } else { - MemoryEstimate idAndMemoryIncrease = dimensionToIdConverter.lookupId(value); + MemoryEstimate idAndMemoryIncrease = dimensionIdCodec.lookupId(value); keyBuffer.putInt(keyBufferPosition, idAndMemoryIncrease.value()); memoryIncrease = idAndMemoryIncrease.memoryIncrease(); } @@ -203,7 +198,7 @@ public Grouper.BufferComparator bufferComparator(int keyBufferPosition, @Nullabl boolean usesNaturalComparator = stringComparator == null || DimensionComparisonUtils.isNaturalComparator(columnType.getType(), stringComparator); - if (idToDimensionConverter.canCompareIds() && usesNaturalComparator) { + if (dimensionIdCodec.canCompareIds() && usesNaturalComparator) { return (lhsBuffer, rhsBuffer, lhsPosition, rhsPosition) -> Integer.compare( lhsBuffer.getInt(lhsPosition + keyBufferPosition), rhsBuffer.getInt(rhsPosition + keyBufferPosition) @@ -213,8 +208,8 @@ public Grouper.BufferComparator bufferComparator(int keyBufferPosition, @Nullabl int lhsDictId = lhsBuffer.getInt(lhsPosition + keyBufferPosition); int rhsDictId = rhsBuffer.getInt(rhsPosition + keyBufferPosition); - Object lhsObject = lhsDictId == GROUP_BY_MISSING_VALUE ? null : idToDimensionConverter.idToKey(lhsDictId); - Object rhsObject = rhsDictId == GROUP_BY_MISSING_VALUE ? null : idToDimensionConverter.idToKey(rhsDictId); + Object lhsObject = lhsDictId == GROUP_BY_MISSING_VALUE ? null : dimensionIdCodec.idToKey(lhsDictId); + Object rhsObject = rhsDictId == GROUP_BY_MISSING_VALUE ? null : dimensionIdCodec.idToKey(rhsDictId); if (usesNaturalComparator) { return nullableTypeStrategy.compare( (DimensionType) DimensionHandlerUtils.convertObjectToType(lhsObject, columnType), @@ -230,6 +225,6 @@ public Grouper.BufferComparator bufferComparator(int keyBufferPosition, @Nullabl @Override public void reset() { - // Nothing to do here. Implementations which build dictionaries should clean them in the reset method. + dimensionIdCodec.reset(); } } diff --git a/processing/src/main/java/org/apache/druid/segment/data/ComparableIntArray.java b/processing/src/main/java/org/apache/druid/segment/data/ComparableIntArray.java deleted file mode 100644 index 7769e98fda92..000000000000 --- a/processing/src/main/java/org/apache/druid/segment/data/ComparableIntArray.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.segment.data; - -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonValue; - -import java.util.Arrays; - -public class ComparableIntArray implements Comparable -{ - public static final ComparableIntArray EMPTY_ARRAY = new ComparableIntArray(new int[0]); - - final int[] delegate; - private int hashCode; - private boolean hashCodeComputed; - - private ComparableIntArray(int[] array) - { - delegate = array; - } - - @JsonCreator - public static ComparableIntArray of(int... array) - { - if (array.length == 0) { - return EMPTY_ARRAY; - } else { - return new ComparableIntArray(array); - } - } - - @JsonValue - public int[] getDelegate() - { - return delegate; - } - - @Override - public int hashCode() - { - // Check is not thread-safe, but that's fine. Even if used by multiple threads, it's ok to write these primitive - // fields more than once. - // As ComparableIntArray is used in hot loop caching the hashcode - if (!hashCodeComputed) { - hashCode = Arrays.hashCode(delegate); - hashCodeComputed = true; - } - - return hashCode; - } - - @Override - public boolean equals(Object obj) - { - if (this == obj) { - return true; - } - if (obj == null || getClass() != obj.getClass()) { - return false; - } - - return Arrays.equals(delegate, ((ComparableIntArray) obj).getDelegate()); - } - - @Override - public int compareTo(ComparableIntArray rhs) - { - // rhs.getDelegate() cannot be null - if (rhs == null) { - return 1; - } - final int minSize = Math.min(this.getDelegate().length, rhs.getDelegate().length); - //noinspection ArrayEquality - if (this.delegate == rhs.getDelegate()) { - return 0; - } else { - for (int i = 0; i < minSize; i++) { - //int's cant be null - final int cmp = Integer.compare(delegate[i], rhs.getDelegate()[i]); - if (cmp == 0) { - continue; - } - return cmp; - } - if (this.getDelegate().length == rhs.getDelegate().length) { - return 0; - } else if (this.getDelegate().length < rhs.getDelegate().length) { - return -1; - } else { - return 1; - } - } - } - - @Override - public String toString() - { - return Arrays.toString(delegate); - } -} diff --git a/processing/src/test/java/org/apache/druid/segment/data/ComparableIntArrayTest.java b/processing/src/test/java/org/apache/druid/segment/data/ComparableIntArrayTest.java deleted file mode 100644 index cfc4e34440f2..000000000000 --- a/processing/src/test/java/org/apache/druid/segment/data/ComparableIntArrayTest.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.segment.data; - -import org.junit.Assert; -import org.junit.Test; - -import java.util.Arrays; -import java.util.HashSet; -import java.util.Set; - -public class ComparableIntArrayTest -{ - private final int[] array = new int[]{1, 2, 3}; - private final ComparableIntArray comparableIntArray = ComparableIntArray.of(1, 2, 3); - - @Test - public void testDelegate() - { - Assert.assertArrayEquals(array, comparableIntArray.getDelegate()); - Assert.assertEquals(0, ComparableIntArray.of(new int[0]).getDelegate().length); - Assert.assertEquals(0, ComparableIntArray.of().getDelegate().length); - } - - @Test - public void testHashCode() - { - Assert.assertEquals(Arrays.hashCode(array), comparableIntArray.hashCode()); - Set set = new HashSet<>(); - set.add(comparableIntArray); - set.add(ComparableIntArray.of(array)); - Assert.assertEquals(1, set.size()); - } - - @Test - public void testEquals() - { - Assert.assertTrue(comparableIntArray.equals(ComparableIntArray.of(array))); - Assert.assertFalse(comparableIntArray.equals(ComparableIntArray.of(1, 2, 5))); - Assert.assertFalse(comparableIntArray.equals(ComparableIntArray.EMPTY_ARRAY)); - Assert.assertFalse(comparableIntArray.equals(null)); - } - - @Test - public void testCompareTo() - { - Assert.assertEquals(0, comparableIntArray.compareTo(ComparableIntArray.of(array))); - Assert.assertEquals(1, comparableIntArray.compareTo(null)); - Assert.assertEquals(1, comparableIntArray.compareTo(ComparableIntArray.of(1, 2))); - Assert.assertEquals(-1, comparableIntArray.compareTo(ComparableIntArray.of(1, 2, 3, 4))); - Assert.assertTrue(comparableIntArray.compareTo(ComparableIntArray.of(2)) < 0); - } -} From dfb5c9f23212763425c05918e195490b2cdf2075 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Fri, 19 Apr 2024 15:01:40 +0530 Subject: [PATCH 41/46] review 1, more comments --- .github/workflows/static-checks.yml | 8 -------- ...nalityAggregatorColumnSelectorStrategyFactory.java | 2 +- .../dimension/ColumnSelectorStrategyFactory.java | 7 +++++-- .../query/filter/ArrayContainsElementFilter.java | 7 ++++++- .../GroupByColumnSelectorStrategyFactory.java | 2 +- .../groupby/epinephelinae/RowBasedGrouperHelper.java | 2 +- ...ctionaryBuildingGroupByColumnSelectorStrategy.java | 11 +++++++---- .../epinephelinae/column/DimensionIdCodec.java | 2 +- .../KeyMappingGroupByColumnSelectorStrategy.java | 8 ++++---- .../{MemoryEstimate.java => MemoryFootprint.java} | 10 +++++----- .../apache/druid/query/search/SearchQueryRunner.java | 2 +- .../types/TopNColumnAggregatesProcessorFactory.java | 2 +- .../apache/druid/segment/DimensionHandlerUtils.java | 4 ++-- .../column/ObjectStrategyComplexTypeStrategy.java | 7 ++----- .../segment/nested/NestedDataComplexTypeSerde.java | 1 - 15 files changed, 37 insertions(+), 38 deletions(-) rename processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/{MemoryEstimate.java => MemoryFootprint.java} (85%) diff --git a/.github/workflows/static-checks.yml b/.github/workflows/static-checks.yml index a374cf72ccfe..a87000ac07e0 100644 --- a/.github/workflows/static-checks.yml +++ b/.github/workflows/static-checks.yml @@ -163,17 +163,9 @@ jobs: ${MVN} install -q -ff -pl 'distribution' ${MAVEN_SKIP} ${MAVEN_SKIP_TESTS} - name: rewrite:dryRun - id: rewrite-dryRun run: | ${MVN} rewrite:dryRun ${MAVEN_SKIP} - - name: Upload open rewrite patch - if: ${{ failure() && steps.rewrite-dryRun.conclusion == 'failure' }} - uses: actions/upload-artifact@master - with: - name: Rewrite patch - path: ./target/rewrite/rewrite.patch - web-checks: strategy: fail-fast: false diff --git a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/CardinalityAggregatorColumnSelectorStrategyFactory.java b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/CardinalityAggregatorColumnSelectorStrategyFactory.java index 22f33f8b6b2f..ad22956efd41 100644 --- a/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/CardinalityAggregatorColumnSelectorStrategyFactory.java +++ b/processing/src/main/java/org/apache/druid/query/aggregation/cardinality/types/CardinalityAggregatorColumnSelectorStrategyFactory.java @@ -49,7 +49,7 @@ public CardinalityAggregatorColumnSelectorStrategy makeColumnSelectorStrategy( } @Override - public boolean supportsNestedArraysAndComplexTypes() + public boolean supportsComplexTypes() { return false; } diff --git a/processing/src/main/java/org/apache/druid/query/dimension/ColumnSelectorStrategyFactory.java b/processing/src/main/java/org/apache/druid/query/dimension/ColumnSelectorStrategyFactory.java index 59384f6d8a65..7e4db58a57df 100644 --- a/processing/src/main/java/org/apache/druid/query/dimension/ColumnSelectorStrategyFactory.java +++ b/processing/src/main/java/org/apache/druid/query/dimension/ColumnSelectorStrategyFactory.java @@ -26,6 +26,9 @@ public interface ColumnSelectorStrategyFactory 0, b -> 1, c -> 2 (value -> dictionaryId), then the dictionary would be laid out like: [a, b, c] */ private final List dictionary; @@ -131,6 +132,8 @@ private static class UniValueDimensionIdCodec implements DimensionIdCodec 0, b -> 1, c -> 2 (value -> dictionaryId), then the reverse dictionary would have the entries (a, 0), (b, 1), + * (c, 2) */ private final Object2IntMap reverseDictionary; @@ -149,7 +152,7 @@ public UniValueDimensionIdCodec( } @Override - public MemoryEstimate lookupId(Object dimension) + public MemoryFootprint lookupId(Object dimension) { int dictId = reverseDictionary.getInt(dimension); int footprintIncrease = 0; @@ -165,7 +168,7 @@ public MemoryEstimate lookupId(Object dimension) nullableTypeStrategy.estimateSizeBytes(dimension) ); } - return new MemoryEstimate<>(dictId, footprintIncrease); + return new MemoryFootprint<>(dictId, footprintIncrease); } @Override @@ -220,7 +223,7 @@ public StringArrayDimensionIdCodec( } @Override - public MemoryEstimate lookupId(Object dimension) + public MemoryFootprint lookupId(Object dimension) { // dimension IS non-null, by contract of this method Object[] stringArray = (Object[]) dimension; @@ -247,7 +250,7 @@ public MemoryEstimate lookupId(Object dimension) estimatedFootprint += DictionaryBuildingUtils.estimateEntryFootprint(dictionaryEncodedStringArray.size() * Integer.BYTES); } - return new MemoryEstimate<>(arrayDictId, estimatedFootprint); + return new MemoryFootprint<>(arrayDictId, estimatedFootprint); } @Override diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionIdCodec.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionIdCodec.java index f556c62063df..3f5a2074453b 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionIdCodec.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DimensionIdCodec.java @@ -41,7 +41,7 @@ public interface DimensionIdCodec /** * @return DictionaryId of the object at the given index and the memory increase associated with it */ - MemoryEstimate lookupId(DimensionType dimension); + MemoryFootprint lookupId(DimensionType dimension); DimensionType idToKey(int id); diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java index dcfdef865199..16a563ffb4b9 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java @@ -133,9 +133,9 @@ public int initColumnValues(ColumnValueSelector selector, int columnIndex, Objec valuess[columnIndex] = GROUP_BY_MISSING_VALUE; return 0; } else { - MemoryEstimate idAndMemoryEstimate = dimensionIdCodec.lookupId(value); - valuess[columnIndex] = idAndMemoryEstimate.value(); - return idAndMemoryEstimate.memoryIncrease(); + MemoryFootprint idAndMemoryFootprint = dimensionIdCodec.lookupId(value); + valuess[columnIndex] = idAndMemoryFootprint.value(); + return idAndMemoryFootprint.memoryIncrease(); } } @@ -185,7 +185,7 @@ public int writeToKeyBuffer(int keyBufferPosition, ColumnValueSelector selector, keyBuffer.putInt(keyBufferPosition, GROUP_BY_MISSING_VALUE); return 0; } else { - MemoryEstimate idAndMemoryIncrease = dimensionIdCodec.lookupId(value); + MemoryFootprint idAndMemoryIncrease = dimensionIdCodec.lookupId(value); keyBuffer.putInt(keyBufferPosition, idAndMemoryIncrease.value()); memoryIncrease = idAndMemoryIncrease.memoryIncrease(); } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MemoryEstimate.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MemoryFootprint.java similarity index 85% rename from processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MemoryEstimate.java rename to processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MemoryFootprint.java index da02ca143f9d..64303770d623 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MemoryEstimate.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/MemoryFootprint.java @@ -22,16 +22,16 @@ /** * Holder for a value and the memory increase in the internal dictionary associated with the increase */ -public class MemoryEstimate +public class MemoryFootprint { private final T value; - private final int memoryIncrease; + private final int footprintIncrease; // Reduced visibility - MemoryEstimate(T value, int memoryIncrease) + MemoryFootprint(T value, int footprintIncrease) { this.value = value; - this.memoryIncrease = memoryIncrease; + this.footprintIncrease = footprintIncrease; } public T value() @@ -41,6 +41,6 @@ public T value() public int memoryIncrease() { - return memoryIncrease; + return footprintIncrease; } } diff --git a/processing/src/main/java/org/apache/druid/query/search/SearchQueryRunner.java b/processing/src/main/java/org/apache/druid/query/search/SearchQueryRunner.java index fbfa4cb6b7fa..49d9a5ac3e24 100644 --- a/processing/src/main/java/org/apache/druid/query/search/SearchQueryRunner.java +++ b/processing/src/main/java/org/apache/druid/query/search/SearchQueryRunner.java @@ -87,7 +87,7 @@ public SearchColumnSelectorStrategy makeColumnSelectorStrategy( } @Override - public boolean supportsNestedArraysAndComplexTypes() + public boolean supportsComplexTypes() { return false; } diff --git a/processing/src/main/java/org/apache/druid/query/topn/types/TopNColumnAggregatesProcessorFactory.java b/processing/src/main/java/org/apache/druid/query/topn/types/TopNColumnAggregatesProcessorFactory.java index 472d58ca6605..e921e1231f83 100644 --- a/processing/src/main/java/org/apache/druid/query/topn/types/TopNColumnAggregatesProcessorFactory.java +++ b/processing/src/main/java/org/apache/druid/query/topn/types/TopNColumnAggregatesProcessorFactory.java @@ -81,7 +81,7 @@ public TopNColumnAggregatesProcessor makeColumnSelectorStrategy( } @Override - public boolean supportsNestedArraysAndComplexTypes() + public boolean supportsComplexTypes() { return false; } diff --git a/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java b/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java index 15c287bc73a4..da63413cb4ba 100644 --- a/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java +++ b/processing/src/main/java/org/apache/druid/segment/DimensionHandlerUtils.java @@ -203,7 +203,7 @@ public static ColumnSelectorPlus selector = getColumnValueSelectorFromDimensionSpec( dimSpec, columnSelectorFactory, - strategyFactory.supportsNestedArraysAndComplexTypes() + strategyFactory.supportsComplexTypes() ); Strategy strategy = makeStrategy( strategyFactory, @@ -293,7 +293,7 @@ private static Strategy makeStrategy( ColumnValueSelector selector ) { - capabilities = getEffectiveCapabilities(dimSpec, capabilities, strategyFactory.supportsNestedArraysAndComplexTypes()); + capabilities = getEffectiveCapabilities(dimSpec, capabilities, strategyFactory.supportsComplexTypes()); return strategyFactory.makeColumnSelectorStrategy(capabilities, selector, dimSpec.getDimension()); } diff --git a/processing/src/main/java/org/apache/druid/segment/column/ObjectStrategyComplexTypeStrategy.java b/processing/src/main/java/org/apache/druid/segment/column/ObjectStrategyComplexTypeStrategy.java index 52e4599586b9..267477e52319 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/ObjectStrategyComplexTypeStrategy.java +++ b/processing/src/main/java/org/apache/druid/segment/column/ObjectStrategyComplexTypeStrategy.java @@ -37,25 +37,22 @@ public class ObjectStrategyComplexTypeStrategy implements TypeStrategy { private final ObjectStrategy objectStrategy; private final TypeSignature typeSignature; - private final boolean groupable; @Nullable private final Hash.Strategy hashStrategy; public ObjectStrategyComplexTypeStrategy(ObjectStrategy objectStrategy, TypeSignature signature) { - this(objectStrategy, signature, false, null); + this(objectStrategy, signature, null); } public ObjectStrategyComplexTypeStrategy( ObjectStrategy objectStrategy, TypeSignature signature, - boolean groupable, @Nullable final Hash.Strategy hashStrategy ) { this.objectStrategy = objectStrategy; this.typeSignature = signature; - this.groupable = groupable; this.hashStrategy = hashStrategy; } @@ -116,7 +113,7 @@ public T fromBytes(byte[] value) @Override public boolean groupable() { - return groupable; + return hashStrategy != null; } @Override diff --git a/processing/src/main/java/org/apache/druid/segment/nested/NestedDataComplexTypeSerde.java b/processing/src/main/java/org/apache/druid/segment/nested/NestedDataComplexTypeSerde.java index 27df8fc2cb10..24d8cac35607 100644 --- a/processing/src/main/java/org/apache/druid/segment/nested/NestedDataComplexTypeSerde.java +++ b/processing/src/main/java/org/apache/druid/segment/nested/NestedDataComplexTypeSerde.java @@ -166,7 +166,6 @@ public > TypeStrategy getTypeStrategy() return new ObjectStrategyComplexTypeStrategy<>( getObjectStrategy(), ColumnType.ofComplex(TYPE_NAME), - true, new Hash.Strategy() { @Override From 2098f90345eaa94e19df45d7160aa9abd38fc2f6 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Mon, 22 Apr 2024 11:27:48 +0530 Subject: [PATCH 42/46] cleanup todos, review comments --- .../GroupByColumnSelectorStrategyFactory.java | 1 - ...BuildingGroupByColumnSelectorStrategy.java | 37 ++++++------------- ...yMappingGroupByColumnSelectorStrategy.java | 5 +-- .../nested/NestedDataComplexTypeSerde.java | 4 -- .../druid/segment/nested/StructuredData.java | 1 - 5 files changed, 13 insertions(+), 35 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java index cbc28f717cfc..53ae47d41196 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByColumnSelectorStrategyFactory.java @@ -53,7 +53,6 @@ public GroupByColumnSelectorStrategy makeColumnSelectorStrategy( throw DruidException.defensive("Unable to deduce type for the grouping dimension"); } try { - // TODO(laksh): Check if the .getNullableStrategy() works, and doesn't throw if (!capabilities.toColumnType().getNullableStrategy().groupable()) { // InvalidInput because the SQL planner would have already flagged these dimensions, therefore this will only happen // if native queries have been submitted. diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java index 0178520a9a88..9aa087f501d7 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java @@ -92,11 +92,8 @@ public static GroupByColumnSelectorStrategy forType(final ColumnType columnType) */ private static GroupByColumnSelectorStrategy forArrayAndComplexTypes(final ColumnType columnType) { - final List dictionary = DictionaryBuildingUtils.createDictionary(); - final Object2IntMap reverseDictionary = - DictionaryBuildingUtils.createReverseDictionary(columnType.getNullableStrategy()); return new DictionaryBuildingGroupByColumnSelectorStrategy<>( - new UniValueDimensionIdCodec(dictionary, reverseDictionary, columnType.getNullableStrategy()), + new UniValueDimensionIdCodec(columnType.getNullableStrategy()), columnType, columnType.getNullableStrategy(), null @@ -105,10 +102,8 @@ private static GroupByColumnSelectorStrategy forArrayAndComplexTypes(final Colum private static GroupByColumnSelectorStrategy forStringArrays() { - final BiMap elementBiDictionary = HashBiMap.create(); - final BiMap, Integer> arrayBiDictionary = HashBiMap.create(); return new DictionaryBuildingGroupByColumnSelectorStrategy<>( - new StringArrayDimensionIdCodec(elementBiDictionary, arrayBiDictionary), + new StringArrayDimensionIdCodec(), ColumnType.STRING_ARRAY, ColumnType.STRING_ARRAY.getNullableStrategy(), null @@ -140,14 +135,10 @@ private static class UniValueDimensionIdCodec implements DimensionIdCodec dictionary, - final Object2IntMap reverseDictionary, - final NullableTypeStrategy nullableTypeStrategy - ) + public UniValueDimensionIdCodec(final NullableTypeStrategy nullableTypeStrategy) { - this.dictionary = dictionary; - this.reverseDictionary = reverseDictionary; + this.dictionary = DictionaryBuildingUtils.createDictionary(); + this.reverseDictionary = DictionaryBuildingUtils.createReverseDictionary(nullableTypeStrategy); this.nullableTypeStrategy = nullableTypeStrategy; } @@ -204,23 +195,15 @@ public void reset() */ private static class StringArrayDimensionIdCodec implements DimensionIdCodec { + // TODO(laksh): Use dictionaryBuilding + reverseBuilding // contains string <-> id for each element of the multi value grouping column // for eg : [a,b,c] is the col value. dictionaryToInt will contain { a <-> 1, b <-> 2, c <-> 3} - private final BiMap elementBiDictionary; + private final BiMap elementBiDictionary = HashBiMap.create(); // stores each row as an integer array where the int represents the value in dictionaryToInt // for eg : [a,b,c] would be converted to [1,2,3] and assigned a integer value 1. // [1,2,3] <-> 1 - private final BiMap, Integer> arrayBiDictionary; - - public StringArrayDimensionIdCodec( - BiMap elementBiDictionary, - BiMap, Integer> arrayBiDictionary - ) - { - this.elementBiDictionary = elementBiDictionary; - this.arrayBiDictionary = arrayBiDictionary; - } + private final BiMap, Integer> arrayBiDictionary = HashBiMap.create(); @Override public MemoryFootprint lookupId(Object dimension) @@ -238,7 +221,9 @@ public MemoryFootprint lookupId(Object dimension) // We're not using the dictionary and reverseDictionary from DictionaryBuilding, but the BiMap is close enough // that we expect this footprint calculation to still be useful. estimatedFootprint += - DictionaryBuildingUtils.estimateEntryFootprint(elementCasted == null ? 0 : elementCasted.length() * Character.BYTES); + DictionaryBuildingUtils.estimateEntryFootprint(elementCasted == null + ? 0 + : elementCasted.length() * Character.BYTES); } dictionaryEncodedStringArray.add(elementDictId); } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java index 16a563ffb4b9..350118c95cb8 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/KeyMappingGroupByColumnSelectorStrategy.java @@ -38,8 +38,7 @@ * a fixed width, and is often backed by a dictionary representing the actual dimension object. It is used for arrays, * and complex types. *

- * The visibility of the class is limited, and the callers must use one of the two variants of the mapping strategy: - * 1. TODO(laksh): PrebuiltDictionaryGroupByColumnSelectorStrategy - (not available, because no one is using it) + * The visibility of the class is limited, and the callers must use the following variant of the mapping strategy: * 2. {@link DictionaryBuildingGroupByColumnSelectorStrategy} *

* {@code null} can be represented by either -1 or the position of null in the dictionary it was stored when it was @@ -57,7 +56,7 @@ * It only handles non-primitive types, because numeric primitives are handled by the {@link FixedWidthGroupByColumnSelectorStrategy} * and the string primitives are handled by the {@link KeyMappingMultiValueGroupByColumnSelectorStrategy} * - * @param > Class of the dimension + * @param Class of the dimension * @see DimensionIdCodec encoding decoding logic for converting value to dictionary */ @NotThreadSafe diff --git a/processing/src/main/java/org/apache/druid/segment/nested/NestedDataComplexTypeSerde.java b/processing/src/main/java/org/apache/druid/segment/nested/NestedDataComplexTypeSerde.java index 24d8cac35607..4d1bb347f218 100644 --- a/processing/src/main/java/org/apache/druid/segment/nested/NestedDataComplexTypeSerde.java +++ b/processing/src/main/java/org/apache/druid/segment/nested/NestedDataComplexTypeSerde.java @@ -171,16 +171,12 @@ public > TypeStrategy getTypeStrategy() @Override public int hashCode(Object o) { - // TODO(laksh): VET, Check if StructuredData.wrap(o).hashCode() makes sense, given that most of the objects inside - // are primitives or those that have implemented .hashCode correctly return StructuredData.wrap(o).equalityHash(); } @Override public boolean equals(Object a, Object b) { - // TODO(laksh): VET, .equals() implementation of structured data is not very good for our purpose. It - // resorts to the object equality return StructuredData.wrap(a).compareTo(StructuredData.wrap(b)) == 0; } } diff --git a/processing/src/main/java/org/apache/druid/segment/nested/StructuredData.java b/processing/src/main/java/org/apache/druid/segment/nested/StructuredData.java index fb5c9cc17902..9dfa79b57d95 100644 --- a/processing/src/main/java/org/apache/druid/segment/nested/StructuredData.java +++ b/processing/src/main/java/org/apache/druid/segment/nested/StructuredData.java @@ -187,7 +187,6 @@ public int hashCode() } // hashCode that relies on the object equality. Translates the hashcode to an integer as well - // TODO(laksh): better name public int equalityHash() { return Longs.hashCode(hash.getAsLong()); From 3fbadd96a79fc3f72c354af2f2a3c6ad5fe1d906 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Tue, 23 Apr 2024 17:21:29 +0530 Subject: [PATCH 43/46] sorting and limiting fixup --- .../druid/query/groupby/GroupByQuery.java | 26 ++++--------------- .../epinephelinae/RowBasedGrouperHelper.java | 23 +++------------- .../groupby/orderby/DefaultLimitSpec.java | 11 +++++++- 3 files changed, 19 insertions(+), 41 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQuery.java b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQuery.java index 4ca5b096ec0c..8f6dcff5e7c7 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQuery.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQuery.java @@ -43,6 +43,7 @@ import org.apache.druid.java.util.common.guava.Sequences; import org.apache.druid.query.BaseQuery; import org.apache.druid.query.DataSource; +import org.apache.druid.query.DimensionComparisonUtils; import org.apache.druid.query.Queries; import org.apache.druid.query.Query; import org.apache.druid.query.QueryDataSource; @@ -597,11 +598,7 @@ private Ordering getRowOrderingForPushDown( needsReverseList.add(false); final ColumnType type = dimensions.get(i).getOutputType(); dimensionTypes.add(type); - if (type.isNumeric()) { - comparators.add(StringComparators.NUMERIC); - } else { - comparators.add(StringComparators.LEXICOGRAPHIC); - } + comparators.add(StringComparators.NATURAL); } } @@ -780,23 +777,10 @@ private static int compareDimsForLimitPushDown( final Object lhsObj = lhs.get(fieldNumber); final Object rhsObj = rhs.get(fieldNumber); - if (dimensionType.isNumeric()) { - if (comparator.equals(StringComparators.NUMERIC)) { - dimCompare = DimensionHandlerUtils.compareObjectsAsType(lhsObj, rhsObj, dimensionType); - } else { - dimCompare = comparator.compare(String.valueOf(lhsObj), String.valueOf(rhsObj)); - } - } else if (dimensionType.equals(ColumnType.STRING_ARRAY)) { - final Object[] lhsArr = DimensionHandlerUtils.coerceToStringArray(lhsObj); - final Object[] rhsArr = DimensionHandlerUtils.coerceToStringArray(rhsObj); - dimCompare = ColumnType.STRING_ARRAY.getNullableStrategy().compare(lhsArr, rhsArr); - } else if (dimensionType.equals(ColumnType.LONG_ARRAY) - || dimensionType.equals(ColumnType.DOUBLE_ARRAY)) { - final Object[] lhsArr = DimensionHandlerUtils.convertToArray(lhsObj, dimensionType.getElementType()); - final Object[] rhsArr = DimensionHandlerUtils.convertToArray(rhsObj, dimensionType.getElementType()); - dimCompare = dimensionType.getNullableStrategy().compare(lhsArr, rhsArr); + if (DimensionComparisonUtils.isNaturalComparator(dimensionType.getType(), comparator)) { + dimCompare = DimensionHandlerUtils.compareObjectsAsType(lhsObj, rhsObj, dimensionType); } else { - dimCompare = comparator.compare((String) lhsObj, (String) rhsObj); + dimCompare = comparator.compare(String.valueOf(lhsObj), String.valueOf(rhsObj)); } if (dimCompare != 0) { diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java index b4607f11c5d5..491c28d41427 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/RowBasedGrouperHelper.java @@ -739,7 +739,6 @@ public Supplier makeInputRawSupplier(DimensionSelector selector) } } - // TODO(laksh): Figure out why this isn't getting triggered private static class InputRawSupplierColumnSelectorStrategyFactory implements ColumnSelectorStrategyFactory { @@ -763,24 +762,10 @@ public InputRawSupplierColumnSelectorStrategy makeColumnSelectorStrategy( return (InputRawSupplierColumnSelectorStrategy) columnSelector -> () -> columnSelector.isNull() ? null : columnSelector.getDouble(); case ARRAY: - switch (capabilities.getElementType().getType()) { - case STRING: - return (InputRawSupplierColumnSelectorStrategy) - columnSelector -> - () -> DimensionHandlerUtils.coerceToStringArray(columnSelector.getObject()); - case FLOAT: - case LONG: - case DOUBLE: - return (InputRawSupplierColumnSelectorStrategy) - columnSelector -> - () -> DimensionHandlerUtils.convertToArray(columnSelector.getObject(), - capabilities.getElementType()); - default: - throw new IAE( - "Cannot create query type helper from invalid type [%s]", - capabilities.asTypeString() - ); - } + case COMPLEX: + return (InputRawSupplierColumnSelectorStrategy) + columnSelector -> + () -> DimensionHandlerUtils.convertObjectToType(columnSelector.getObject(), capabilities.toColumnType()); default: throw new IAE("Cannot create query type helper from invalid type [%s]", capabilities.asTypeString()); } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/orderby/DefaultLimitSpec.java b/processing/src/main/java/org/apache/druid/query/groupby/orderby/DefaultLimitSpec.java index 6f7fab563810..2cc7b9d3643d 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/orderby/DefaultLimitSpec.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/orderby/DefaultLimitSpec.java @@ -478,6 +478,15 @@ private Ordering dimensionOrdering( throw new ISE("Cannot create comparator for array type %s.", columnType.toString()); } } + final Comparator comparatorToUse; + if (arrayComparator != null) { + comparatorToUse = arrayComparator; + } else { + comparatorToUse = DimensionComparisonUtils.isNaturalComparator(columnType.getType(), stringComparator) + ? columnType.getNullableStrategy() + : stringComparator; + } + return Ordering.from( Comparator.comparing( (ResultRow row) -> { @@ -487,7 +496,7 @@ private Ordering dimensionOrdering( return getDimensionValue(row, column); } }, - Comparator.nullsFirst(arrayComparator == null ? stringComparator : arrayComparator) + Comparator.nullsFirst(comparatorToUse) ) ); } From 5d5c8ae92ebfcc952a2b290c7e63ff2dca4e36f3 Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Wed, 24 Apr 2024 10:45:16 +0530 Subject: [PATCH 44/46] prevent round trip --- .../groupby/orderby/DefaultLimitSpec.java | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/orderby/DefaultLimitSpec.java b/processing/src/main/java/org/apache/druid/query/groupby/orderby/DefaultLimitSpec.java index 2cc7b9d3643d..3c54a9de737c 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/orderby/DefaultLimitSpec.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/orderby/DefaultLimitSpec.java @@ -27,11 +27,9 @@ import com.google.common.base.Functions; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; -import com.google.common.collect.Iterables; import com.google.common.collect.Ordering; import com.google.common.primitives.Longs; import org.apache.druid.common.config.NullHandling; -import org.apache.druid.data.input.Rows; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.guava.Sequence; @@ -52,7 +50,6 @@ import org.apache.druid.segment.column.TypeSignature; import org.apache.druid.segment.column.ValueType; -import javax.annotation.Nullable; import java.nio.ByteBuffer; import java.util.Arrays; import java.util.Collections; @@ -489,25 +486,12 @@ private Ordering dimensionOrdering( return Ordering.from( Comparator.comparing( - (ResultRow row) -> { - if (columnType.isArray()) { - return row.get(column); - } else { - return getDimensionValue(row, column); - } - }, + (ResultRow row) -> row.get(column), Comparator.nullsFirst(comparatorToUse) ) ); } - @Nullable - private static String getDimensionValue(ResultRow row, int column) - { - final List values = Rows.objectToStrings(row.get(column)); - return values.isEmpty() ? null : Iterables.getOnlyElement(values); - } - @Override public String toString() { From ad2fb04c3648e5e3e322639f0abb2fdc77ada17e Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Wed, 24 Apr 2024 10:53:25 +0530 Subject: [PATCH 45/46] prevent round trip 2 --- .../groupby/orderby/DefaultLimitSpec.java | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/orderby/DefaultLimitSpec.java b/processing/src/main/java/org/apache/druid/query/groupby/orderby/DefaultLimitSpec.java index 3c54a9de737c..a34d58b6d140 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/orderby/DefaultLimitSpec.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/orderby/DefaultLimitSpec.java @@ -27,9 +27,11 @@ import com.google.common.base.Functions; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; import com.google.common.collect.Ordering; import com.google.common.primitives.Longs; import org.apache.druid.common.config.NullHandling; +import org.apache.druid.data.input.Rows; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.guava.Sequence; @@ -50,6 +52,7 @@ import org.apache.druid.segment.column.TypeSignature; import org.apache.druid.segment.column.ValueType; +import javax.annotation.Nullable; import java.nio.ByteBuffer; import java.util.Arrays; import java.util.Collections; @@ -486,12 +489,33 @@ private Ordering dimensionOrdering( return Ordering.from( Comparator.comparing( - (ResultRow row) -> row.get(column), + (ResultRow row) -> { + if (columnType.isArray()) { + // Arrays have a specialized comparator, that applies the ordering per element. That will handle the casting + // and the comparison + return row.get(column); + } else if (DimensionComparisonUtils.isNaturalComparator(columnType.getType(), stringComparator)) { + // If the natural comparator is used, we can directly extract the dimension value, and the type strategy's comparison + // function will handle it, without casting + return row.get(column); + } else { + // If the comparator is not natural, we will be using the string comparator, and we need to cast the dimension to string + // before comparison + return getDimensionValue(row, column); + } + }, Comparator.nullsFirst(comparatorToUse) ) ); } + @Nullable + private static String getDimensionValue(ResultRow row, int column) + { + final List values = Rows.objectToStrings(row.get(column)); + return values.isEmpty() ? null : Iterables.getOnlyElement(values); + } + @Override public String toString() { From 7fe36b1d68b6b06f342f1408cb2be2247348863b Mon Sep 17 00:00:00 2001 From: Laksh Singla Date: Wed, 24 Apr 2024 13:45:15 +0530 Subject: [PATCH 46/46] tests, and fixup flake, preserve old incorrect behaviour --- .../druid/query/groupby/GroupByQuery.java | 25 ++- ...BuildingGroupByColumnSelectorStrategy.java | 1 - .../calcite/CalciteNestedDataQueryTest.java | 179 ++++++++++++++++++ 3 files changed, 202 insertions(+), 3 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQuery.java b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQuery.java index 8f6dcff5e7c7..cdcf9e3daf40 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQuery.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQuery.java @@ -759,6 +759,12 @@ private DateTime computeUniversalTimestamp() } } + /** + * Compares the dimensions for limit pushdown. + * + * Due to legacy reason, the provided StringComparator for the arrays isn't applied and must be changed once we + * get rid of the StringComparators for array types + */ private static int compareDimsForLimitPushDown( final IntList fields, final List needsReverseList, @@ -777,10 +783,25 @@ private static int compareDimsForLimitPushDown( final Object lhsObj = lhs.get(fieldNumber); final Object rhsObj = rhs.get(fieldNumber); - if (DimensionComparisonUtils.isNaturalComparator(dimensionType.getType(), comparator)) { + if (dimensionType.isNumeric()) { + if (DimensionComparisonUtils.isNaturalComparator(dimensionType.getType(), comparator)) { + dimCompare = DimensionHandlerUtils.compareObjectsAsType(lhsObj, rhsObj, dimensionType); + } else { + dimCompare = comparator.compare(String.valueOf(lhsObj), String.valueOf(rhsObj)); + } + } else if (dimensionType.equals(ColumnType.STRING_ARRAY)) { + final Object[] lhsArr = DimensionHandlerUtils.coerceToStringArray(lhsObj); + final Object[] rhsArr = DimensionHandlerUtils.coerceToStringArray(rhsObj); + dimCompare = ColumnType.STRING_ARRAY.getNullableStrategy().compare(lhsArr, rhsArr); + } else if (dimensionType.equals(ColumnType.LONG_ARRAY) + || dimensionType.equals(ColumnType.DOUBLE_ARRAY)) { + final Object[] lhsArr = DimensionHandlerUtils.convertToArray(lhsObj, dimensionType.getElementType()); + final Object[] rhsArr = DimensionHandlerUtils.convertToArray(rhsObj, dimensionType.getElementType()); + dimCompare = dimensionType.getNullableStrategy().compare(lhsArr, rhsArr); + } else if (DimensionComparisonUtils.isNaturalComparator(dimensionType.getType(), comparator)) { dimCompare = DimensionHandlerUtils.compareObjectsAsType(lhsObj, rhsObj, dimensionType); } else { - dimCompare = comparator.compare(String.valueOf(lhsObj), String.valueOf(rhsObj)); + dimCompare = comparator.compare((String) lhsObj, (String) rhsObj); } if (dimCompare != 0) { diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java index 9aa087f501d7..cf033eaa65d6 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java @@ -195,7 +195,6 @@ public void reset() */ private static class StringArrayDimensionIdCodec implements DimensionIdCodec { - // TODO(laksh): Use dictionaryBuilding + reverseBuilding // contains string <-> id for each element of the multi value grouping column // for eg : [a,b,c] is the col value. dictionaryToInt will contain { a <-> 1, b <-> 2, c <-> 3} private final BiMap elementBiDictionary = HashBiMap.create(); diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java index c9f59e469384..dceddbaff436 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java @@ -54,6 +54,8 @@ import org.apache.druid.query.filter.ExpressionDimFilter; import org.apache.druid.query.filter.LikeDimFilter; import org.apache.druid.query.groupby.GroupByQuery; +import org.apache.druid.query.groupby.orderby.DefaultLimitSpec; +import org.apache.druid.query.groupby.orderby.OrderByColumnSpec; import org.apache.druid.query.ordering.StringComparators; import org.apache.druid.query.scan.ScanQuery; import org.apache.druid.query.topn.DimensionTopNMetricSpec; @@ -567,6 +569,183 @@ public void testGroupByOnNestedColumn() ); } + @Test + public void testGroupByOnNestedColumnWithOrderBy() + { + cannotVectorize(); + testQuery( + "SELECT nester, SUM(strlen(string)) FROM druid.nested GROUP BY 1", + ImmutableList.of( + GroupByQuery.builder() + .setDataSource(DATA_SOURCE) + .setInterval(querySegmentSpec(Filtration.eternity())) + .setGranularity(Granularities.ALL) + .setVirtualColumns( + new ExpressionVirtualColumn("v0", "strlen(\"string\")", ColumnType.LONG, queryFramework().macroTable()) + ) + .setDimensions(dimensions(new DefaultDimensionSpec("nester", "d0", ColumnType.NESTED_DATA))) + .setAggregatorSpecs(aggregators(new LongSumAggregatorFactory("a0", "v0"))) + .setContext(QUERY_CONTEXT_DEFAULT) + .build() + ), + ImmutableList.of( + new Object[]{null, 9L}, + new Object[]{"\"hello\"", 3L}, + new Object[]{"2", 3L}, + new Object[]{"{\"array\":[\"a\",\"b\"],\"n\":{\"x\":\"hello\"}}", 3L}, + new Object[]{"{\"array\":[\"a\",\"b\"],\"n\":{\"x\":1}}", 3L} + ) + ); + } + + @Test + public void testGroupByOnNestedColumnWithOrderByAndLimit() + { + cannotVectorize(); + testQuery( + "SELECT nester, SUM(strlen(string)) FROM druid.nested GROUP BY 1 ORDER BY 1 LIMIT 100", + ImmutableList.of( + GroupByQuery.builder() + .setDataSource(DATA_SOURCE) + .setInterval(querySegmentSpec(Filtration.eternity())) + .setGranularity(Granularities.ALL) + .setVirtualColumns( + new ExpressionVirtualColumn( + "v0", + "strlen(\"string\")", + ColumnType.LONG, + queryFramework().macroTable() + ) + ) + .setDimensions(dimensions(new DefaultDimensionSpec("nester", "d0", ColumnType.NESTED_DATA))) + .setAggregatorSpecs(aggregators(new LongSumAggregatorFactory("a0", "v0"))) + .setLimitSpec(new DefaultLimitSpec( + ImmutableList.of(new OrderByColumnSpec( + "d0", + OrderByColumnSpec.Direction.ASCENDING, + StringComparators.NATURAL + )), + 100 + )) + .setContext(QUERY_CONTEXT_DEFAULT) + .build() + ), + ImmutableList.of( + new Object[]{null, 9L}, + new Object[]{"\"hello\"", 3L}, + new Object[]{"2", 3L}, + new Object[]{"{\"array\":[\"a\",\"b\"],\"n\":{\"x\":\"hello\"}}", 3L}, + new Object[]{"{\"array\":[\"a\",\"b\"],\"n\":{\"x\":1}}", 3L} + ) + ); + } + + @Test + public void testGroupByOnNestedColumnWithOrderByAndLimit2() + { + cannotVectorize(); + testQuery( + "SELECT nester, SUM(strlen(string)) FROM druid.nested GROUP BY 1 ORDER BY 1 LIMIT 2", + ImmutableList.of( + GroupByQuery.builder() + .setDataSource(DATA_SOURCE) + .setInterval(querySegmentSpec(Filtration.eternity())) + .setGranularity(Granularities.ALL) + .setVirtualColumns( + new ExpressionVirtualColumn( + "v0", + "strlen(\"string\")", + ColumnType.LONG, + queryFramework().macroTable() + ) + ) + .setDimensions(dimensions(new DefaultDimensionSpec("nester", "d0", ColumnType.NESTED_DATA))) + .setAggregatorSpecs(aggregators(new LongSumAggregatorFactory("a0", "v0"))) + .setLimitSpec(new DefaultLimitSpec( + ImmutableList.of(new OrderByColumnSpec( + "d0", + OrderByColumnSpec.Direction.ASCENDING, + StringComparators.NATURAL + )), + 2 + )) + .setContext(QUERY_CONTEXT_DEFAULT) + .build() + ), + ImmutableList.of( + new Object[]{null, 9L}, + new Object[]{"\"hello\"", 3L} + ) + ); + } + + @Test + public void testGroupByOnNestedColumnWithLimit() + { + cannotVectorize(); + testQuery( + "SELECT nester, SUM(strlen(string)) FROM druid.nested GROUP BY 1 LIMIT 100", + ImmutableList.of( + GroupByQuery.builder() + .setDataSource(DATA_SOURCE) + .setInterval(querySegmentSpec(Filtration.eternity())) + .setGranularity(Granularities.ALL) + .setVirtualColumns( + new ExpressionVirtualColumn( + "v0", + "strlen(\"string\")", + ColumnType.LONG, + queryFramework().macroTable() + ) + ) + .setDimensions(dimensions(new DefaultDimensionSpec("nester", "d0", ColumnType.NESTED_DATA))) + .setAggregatorSpecs(aggregators(new LongSumAggregatorFactory("a0", "v0"))) + .setLimitSpec(new DefaultLimitSpec(null, 100)) + .setContext(QUERY_CONTEXT_DEFAULT) + .build() + ), + ImmutableList.of( + new Object[]{null, 9L}, + new Object[]{"\"hello\"", 3L}, + new Object[]{"2", 3L}, + new Object[]{"{\"array\":[\"a\",\"b\"],\"n\":{\"x\":\"hello\"}}", 3L}, + new Object[]{"{\"array\":[\"a\",\"b\"],\"n\":{\"x\":1}}", 3L} + ) + ); + } + + @Test + public void testGroupByOnNestedColumnWithLimit2() + { + cannotVectorize(); + testQuery( + "SELECT nester, SUM(strlen(string)) FROM druid.nested GROUP BY 1 LIMIT 2", + ImmutableList.of( + GroupByQuery.builder() + .setDataSource(DATA_SOURCE) + .setInterval(querySegmentSpec(Filtration.eternity())) + .setGranularity(Granularities.ALL) + .setVirtualColumns( + new ExpressionVirtualColumn( + "v0", + "strlen(\"string\")", + ColumnType.LONG, + queryFramework().macroTable() + ) + ) + .setDimensions(dimensions(new DefaultDimensionSpec("nester", "d0", ColumnType.NESTED_DATA))) + .setAggregatorSpecs(aggregators(new LongSumAggregatorFactory("a0", "v0"))) + .setLimitSpec(new DefaultLimitSpec(null, 2)) + .setContext(QUERY_CONTEXT_DEFAULT) + .build() + ), + ImmutableList.of( + new Object[]{null, 9L}, + new Object[]{"\"hello\"", 3L} + ) + ); + } + @Test public void testGroupByRootPath() {