@@ -178,13 +178,20 @@ public String getFormatString()
// 26: group by string expr with non-expr agg
"SELECT CONCAT(string2, '-', long2), SUM(double1) FROM foo GROUP BY 1 ORDER BY 2",
// 27: group by string expr with expr agg
"SELECT CONCAT(string2, '-', long2), SUM(long1 * double4) FROM foo GROUP BY 1 ORDER BY 2"
"SELECT CONCAT(string2, '-', long2), SUM(long1 * double4) FROM foo GROUP BY 1 ORDER BY 2",
// 28: group by single input string low cardinality expr with expr agg
"SELECT CONCAT(string2, '-', 'foo'), SUM(long1 * long4) FROM foo GROUP BY 1 ORDER BY 2",
// 29: group by single input string high cardinality expr with expr agg
"SELECT CONCAT(string3, '-', 'foo'), SUM(long1 * long4) FROM foo GROUP BY 1 ORDER BY 2"
);

@Param({"5000000"})
private int rowsPerSegment;

@Param({"false", "force"})
@Param({
"false",
"force"
})
private String vectorize;

@Param({
@@ -217,7 +224,9 @@ public String getFormatString()
"24",
"25",
"26",
"27"
"27",
"28",
"29"
})
private String query;

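For orientation, here is a minimal JMH sketch of how `@Param` values like these are typically consumed; the class name, the trimmed-down `QUERIES` list, and the `querySql` body are illustrative assumptions, not the actual benchmark code.

```java
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.infra.Blackhole;

@State(Scope.Benchmark)
public class QueryBenchmarkSketch
{
  // Stand-in for the QUERIES list being extended in the diff above.
  private static final List<String> QUERIES = Arrays.asList(
      "SELECT CONCAT(string2, '-', 'foo'), SUM(long1 * long4) FROM foo GROUP BY 1 ORDER BY 2",
      "SELECT CONCAT(string3, '-', 'foo'), SUM(long1 * long4) FROM foo GROUP BY 1 ORDER BY 2"
  );

  @Param({"0", "1"})
  private String query;

  @Param({"false", "force"})
  private String vectorize;

  @Benchmark
  @BenchmarkMode(Mode.AverageTime)
  @OutputTimeUnit(TimeUnit.MILLISECONDS)
  public void querySql(Blackhole blackhole)
  {
    // The 'query' @Param is a string index into QUERIES, which is why the diff
    // adds "28" and "29" alongside the two new benchmark queries.
    final String sql = QUERIES.get(Integer.parseInt(query));
    // A real benchmark would plan and execute sql against a Druid segment,
    // honoring the 'vectorize' setting; consuming output keeps the JIT honest.
    blackhole.consume(sql + vectorize);
  }
}
```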
@@ -257,7 +257,7 @@ public boolean canStop()
}

/**
* Finalizes stopping the the LifecycleLock. This method must be called before exit from stop() on this object,
* Finalizes stopping the LifecycleLock. This method must be called before exit from stop() on this object,
* usually in a finally block. If you're using a restartable object, use {@link #exitStopAndReset()} instead.
*
* @throws IllegalMonitorStateException if {@link #canStop()} is not yet called on this LifecycleLock
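As a usage note, a minimal sketch of the `stop()` pattern this javadoc describes, assuming `canStop()` and `exitStop()` behave as documented here; the surrounding service class is hypothetical.

```java
import org.apache.druid.concurrent.LifecycleLock;

class MyRestartableService
{
  private final LifecycleLock lifecycleLock = new LifecycleLock();

  public void stop()
  {
    if (!lifecycleLock.canStop()) {
      throw new IllegalStateException("not started");
    }
    try {
      // release resources held by the service here
    }
    finally {
      // finalize stopping before exiting stop(), as the javadoc requires
      lifecycleLock.exitStop();
    }
  }
}
```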
@@ -647,7 +647,7 @@ public ExprEval apply(List<Expr> args, Expr.ObjectBinding bindings)
if (evals.isEmpty()) {
// The GREATEST/LEAST functions are not in the SQL standard. Emulate the behavior of postgres (return null if
// all expressions are null, otherwise skip null values) since it is used as a base for a wide number of
// databases. This also matches the behavior the the long/double greatest/least post aggregators. Some other
// databases. This also matches the behavior of the long/double greatest/least post aggregators. Some other
// databases (e.g., MySQL) return null if any expression is null.
// https://www.postgresql.org/docs/9.5/functions-conditional.html
// https://dev.mysql.com/doc/refman/8.0/en/comparison-operators.html#function_least
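For clarity, a standalone sketch of the PostgreSQL-style GREATEST semantics described in that comment — nulls are skipped, and null is returned only when every argument is null. This is illustrative Java, not Druid's actual implementation.

```java
import java.util.Arrays;
import java.util.List;

final class GreatestSketch
{
  static Double greatest(List<Double> args)
  {
    Double max = null;
    for (Double arg : args) {
      if (arg == null) {
        continue; // skip nulls, per postgres behavior
      }
      if (max == null || arg > max) {
        max = arg;
      }
    }
    return max; // null only if all arguments were null
  }

  public static void main(String[] args)
  {
    System.out.println(greatest(Arrays.asList(null, 1.0, 3.0))); // 3.0
    System.out.println(greatest(Arrays.asList((Double) null)));  // null
  }
}
```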
@@ -68,6 +68,11 @@ public List<String> getIdentifiers()
return args.stream().map(IdentifierExpr::toString).collect(Collectors.toList());
}

public List<String> stringifyIdentifiers()
{
return args.stream().map(IdentifierExpr::stringify).collect(Collectors.toList());
}

ImmutableList<IdentifierExpr> getIdentifierExprs()
{
return args;
@@ -99,7 +104,7 @@ public ExprEval eval(ObjectBinding bindings)
@Override
public String stringify()
{
return StringUtils.format("(%s) -> %s", ARG_JOINER.join(getIdentifiers()), expr.stringify());
return StringUtils.format("(%s) -> %s", ARG_JOINER.join(stringifyIdentifiers()), expr.stringify());
}

@Override
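A hedged round-trip sketch of why `stringify()` must produce re-parseable output, which is what switching the lambda arguments to `stringifyIdentifiers()` preserves. `Parser.parse` and `ExprMacroTable.nil()` are from Druid's expression module; the expression and column name below are illustrative.

```java
import org.apache.druid.math.expr.Expr;
import org.apache.druid.math.expr.ExprMacroTable;
import org.apache.druid.math.expr.Parser;

final class StringifyRoundTrip
{
  public static void main(String[] args)
  {
    // Parse a lambda-bearing expression, stringify it, and parse it again.
    final Expr expr = Parser.parse("map((x) -> x + 1, \"some_column\")", ExprMacroTable.nil());
    final String stringified = expr.stringify();
    // stringify() should yield an equivalent, re-parseable expression, which is
    // why lambda arguments must use their stringify form rather than toString.
    final Expr reparsed = Parser.parse(stringified, ExprMacroTable.nil());
    System.out.println(stringified);
    System.out.println(reparsed.stringify());
  }
}
```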
2 changes: 1 addition & 1 deletion docs/operations/security-overview.md
@@ -122,7 +122,7 @@ For more information, see [TLS support](tls-support.md) and [Simple SSLContext P

## Authentication and authorization

You can configure authentication and authorization to control access to the the Druid APIs. Then configure users, roles, and permissions, as described in the following sections. Make the configuration changes in the `common.runtime.properties` file on all Druid servers in the cluster.
You can configure authentication and authorization to control access to the Druid APIs. Then configure users, roles, and permissions, as described in the following sections. Make the configuration changes in the `common.runtime.properties` file on all Druid servers in the cluster.

Within Druid's operating context, authenticators control the way user identities are verified. Authorizers employ user roles to relate authenticated users to the datasources they are permitted to access. You can set the finest-grained permissions on a per-datasource basis.

2 changes: 1 addition & 1 deletion docs/querying/caching.md
@@ -63,7 +63,7 @@ For instance, whole-query caching is a good option when you have queries that in

- On Historicals, the default. Enable segment-level cache population on Historicals for larger production clusters to prevent Brokers from having to merge all query results. When you enable cache population on Historicals instead of Brokers, the Historicals merge their own local results and put less strain on the Brokers.

- On ingestion tasks in the Peon or Indexer service. Larger production clusters should enable segment-level cache population on task services only to prevent Brokers from having to merge all query results. When you enable cache population on task execution services instead of Brokers, the the task execution services to merge their own local results and put less strain on the Brokers.
- On ingestion tasks in the Peon or Indexer service. Larger production clusters should enable segment-level cache population on task services only to prevent Brokers from having to merge all query results. When you enable cache population on task execution services instead of Brokers, the task execution services merge their own local results and put less strain on the Brokers.

Task executor services only support caches that store data locally, such as the `caffeine` cache. This restriction exists because the cache stores results at the level of intermediate partial segments generated by the ingestion tasks. These intermediate partial segments may not be identical across task replicas. Therefore task executor services ignore remote cache types such as `memcached`.

4 changes: 2 additions & 2 deletions docs/querying/datasource.md
@@ -54,7 +54,7 @@ The table datasource is the most common type. This is the kind of datasource you
[data ingestion](../ingestion/index.md). They are split up into segments, distributed around the cluster,
and queried in parallel.

In [Druid SQL](sql.md#from), table datasources reside in the the `druid` schema. This is the default schema, so table
In [Druid SQL](sql.md#from), table datasources reside in the `druid` schema. This is the default schema, so table
datasources can be referenced as either `druid.dataSourceName` or simply `dataSourceName`.

In native queries, table datasources can be referenced using their names as strings (as in the example above), or by
@@ -92,7 +92,7 @@ SELECT k, v FROM lookup.countries
<!--END_DOCUSAURUS_CODE_TABS-->

Lookup datasources correspond to Druid's key-value [lookup](lookups.md) objects. In [Druid SQL](sql.md#from),
they reside in the the `lookup` schema. They are preloaded in memory on all servers, so they can be accessed rapidly.
they reside in the `lookup` schema. They are preloaded in memory on all servers, so they can be accessed rapidly.
They can be joined onto regular tables using the [join operator](#join).

Lookup datasources are key-value oriented and always have exactly two columns: `k` (the key) and `v` (the value), and
2 changes: 1 addition & 1 deletion docs/querying/sorting-orders.md
@@ -49,7 +49,7 @@ This sorting order will try to parse all string values as numbers. Unparseable v
When comparing two unparseable values (e.g., "hello" and "world"), this ordering will sort by comparing the unparsed strings lexicographically.

## Strlen
Sorts values by the their string lengths. When there is a tie, this comparator falls back to using the String compareTo method.
Sorts values by their string lengths. When there is a tie, this comparator falls back to using the String compareTo method.
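A hedged sketch of this comparator's behavior in plain Java (illustrative only, not Druid's actual comparator implementation):

```java
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

final class StrlenOrderSketch
{
  public static void main(String[] args)
  {
    // Sort by length first; fall back to natural (compareTo) order on ties.
    final Comparator<String> strlen =
        Comparator.comparingInt(String::length).thenComparing(Comparator.naturalOrder());
    final List<String> values = Arrays.asList("apple", "banana", "pear", "fig");
    values.sort(strlen);
    System.out.println(values); // [fig, pear, apple, banana]
  }
}
```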

## Version
Sorts values as versions, e.g.: "10.0 sorts after 9.0", "1.0.0-SNAPSHOT sorts after 1.0.0".
@@ -77,7 +77,7 @@ interface IntIterator extends org.roaringbitmap.IntIterator
int next();

/**
* Skips all the elements before the the specified element, so that
* Skips all the elements before the specified element, so that
* {@link #next()} gives the given element or, if it does not exist, the
* element immediately after according to the sorting provided by this
* set.
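A usage sketch of the skip semantics described above, written against RoaringBitmap's `PeekableIntIterator`, which documents the same `advanceIfNeeded` contract; the Druid-side interface here is assumed to mirror it.

```java
import org.roaringbitmap.PeekableIntIterator;
import org.roaringbitmap.RoaringBitmap;

final class AdvanceIfNeededSketch
{
  public static void main(String[] args)
  {
    final RoaringBitmap bitmap = RoaringBitmap.bitmapOf(3, 7, 12, 40);
    final PeekableIntIterator it = bitmap.getIntIterator();
    // Skip all elements before 10; next() then returns 12, the first element
    // at or after 10 in the set's sorted order.
    it.advanceIfNeeded(10);
    System.out.println(it.next()); // 12
  }
}
```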
@@ -26,7 +26,7 @@
import java.lang.reflect.Method;

/**
* This wrapper class is created to be able to access some of the the "protected" methods inside Hadoop's
* This wrapper class is created to be able to access some of the "protected" methods inside Hadoop's
* FileSystem class. Those are supposed to become public eventually or more appropriate alternatives would be
* provided.
* This is a hack and should be removed when no longer necessary.
@@ -1520,7 +1520,7 @@ public double sum(final float b)
*
* @param probabilities array of probabilities
*
* @return an array of length probabilities.length representing the the approximate sample quantiles
* @return an array of length probabilities.length representing the approximate sample quantiles
* corresponding to the given probabilities
*/

@@ -94,7 +94,7 @@ public class FixedBucketsHistogram
public static final byte SPARSE_ENCODING_MODE = 0x02;

/**
* Determines how the the histogram handles outliers.
* Determines how the histogram handles outliers.
*
* Ignore: do not track outliers at all
* Overflow: track outlier counts in upperOutlierCount and lowerOutlierCount.
@@ -123,8 +123,6 @@ public DruidOperatorTable createOperatorTable()
@Test
public void testQuantileOnFloatAndLongs() throws Exception
{
cannotVectorize();

final List<Object[]> expectedResults = ImmutableList.of(
new Object[]{
1.0299999713897705,
@@ -238,8 +236,6 @@ public void testQuantileOnFloatAndLongs() throws Exception
@Test
public void testQuantileOnCastedString() throws Exception
{
cannotVectorize();

testQuery(
"SELECT\n"
+ "APPROX_QUANTILE_FIXED_BUCKETS(CAST(dim1 AS DOUBLE), 0.01, 20, 0.0, 10.0),\n"
@@ -121,7 +121,6 @@ public DruidOperatorTable createOperatorTable()
@Test
public void testQuantileOnFloatAndLongs() throws Exception
{
cannotVectorize();
testQuery(
"SELECT\n"
+ "APPROX_QUANTILE(m1, 0.01),\n"
@@ -117,4 +117,25 @@ public GroupByVectorColumnSelector makeObjectProcessor(
}
return NilGroupByVectorColumnSelector.INSTANCE;
}

/**
* The group by engine vector processor is more relaxed than some of the other
* {@link VectorColumnProcessorFactory} implementations about choosing a dictionary encoded string selector over
* an object selector.
*
* Basically, if a valid dictionary exists, we will use it to group on dictionary ids (so that we can use
* {@link SingleValueStringGroupByVectorColumnSelector} whenever possible instead of
* {@link DictionaryBuildingSingleValueStringGroupByVectorColumnSelector}).
*
* We do this even for things like virtual columns that have a single string input, because it defers accessing
* any of the actual string values, which involves at minimum reading utf8 byte values and converting them to
* string form (if not already cached), and in the case of expressions, computing the expression output for the
* string input.
*/
@Override
public boolean useDictionaryEncodedSelector(ColumnCapabilities capabilities)
Contributor: Please add @Nullable for capabilities.

Member (author): hmm, it should never be null for this method though (nor any of the other VectorColumnProcessorFactory methods). ColumnProcessors will return a nil vector selector if the capabilities are null, since null capabilities in the vectorized engine means that the column doesn't exist.

{
Preconditions.checkArgument(capabilities != null, "Capabilities must not be null");
Preconditions.checkArgument(capabilities.getType() == ValueType.STRING, "Must only be called on a STRING column");
return capabilities.isDictionaryEncoded().isTrue();
Contributor: Just for the record (and to help myself remember later), this means that the groupBy vector engine will use the dictionary IDs to compute per-segment results, and decode them when merging those results, which is what the non-vectorized engine does today. When the column is dictionary encoded but not unique, this optimization might not always be good because there could be some sort of tradeoff depending on the column cardinality after expression evaluation. Even though I think this optimization is likely good in most cases, it could be worth investigating further later to understand the tradeoff better.

}
}
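To make the tradeoff in the comment above concrete, here is a self-contained toy showing what grouping on dictionary ids buys: rows are aggregated as ints, and strings are decoded once per group at "merge" time. Names are descriptive stand-ins, not Druid's actual grouper classes.

```java
import java.util.HashMap;
import java.util.Map;

final class DictionaryGroupingSketch
{
  public static void main(String[] args)
  {
    final String[] dictionary = {"apple", "banana", "cherry"}; // sorted, unique
    final int[] rowDictIds = {0, 1, 0, 2, 1, 0};               // per-row dictionary ids
    final long[] rowValues = {1, 2, 3, 4, 5, 6};               // values to aggregate

    // Aggregate keyed by int dictionary id; no string is touched in this loop.
    final Map<Integer, Long> sums = new HashMap<>();
    for (int row = 0; row < rowDictIds.length; row++) {
      sums.merge(rowDictIds[row], rowValues[row], Long::sum);
    }

    // Decode ids to strings only once per group, at merge time.
    sums.forEach((dictId, sum) -> System.out.println(dictionary[dictId] + " = " + sum));
  }
}
```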
@@ -42,6 +42,7 @@
import org.apache.druid.query.groupby.epinephelinae.HashVectorGrouper;
import org.apache.druid.query.groupby.epinephelinae.VectorGrouper;
import org.apache.druid.query.vector.VectorCursorGranularizer;
import org.apache.druid.segment.ColumnInspector;
import org.apache.druid.segment.ColumnProcessors;
import org.apache.druid.segment.StorageAdapter;
import org.apache.druid.segment.VirtualColumns;
@@ -59,7 +60,6 @@
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.function.Function;
import java.util.stream.Collectors;

public class VectorGroupByEngine
@@ -75,25 +75,29 @@ public static boolean canVectorize(
@Nullable final Filter filter
)
{
Function<String, ColumnCapabilities> capabilitiesFunction = name ->
query.getVirtualColumns().getColumnCapabilitiesWithFallback(adapter, name);
final ColumnInspector inspector = query.getVirtualColumns().wrapInspector(adapter);

return canVectorizeDimensions(capabilitiesFunction, query.getDimensions())
&& query.getDimensions().stream().allMatch(DimensionSpec::canVectorize)
&& query.getAggregatorSpecs().stream().allMatch(aggregatorFactory -> aggregatorFactory.canVectorize(adapter))
return adapter.canVectorize(filter, query.getVirtualColumns(), false)
&& canVectorizeDimensions(inspector, query.getDimensions())
&& VirtualColumns.shouldVectorize(query, query.getVirtualColumns(), adapter)
&& adapter.canVectorize(filter, query.getVirtualColumns(), false);
&& query.getAggregatorSpecs()
.stream()
.allMatch(aggregatorFactory -> aggregatorFactory.canVectorize(inspector));
}

public static boolean canVectorizeDimensions(
final Function<String, ColumnCapabilities> capabilitiesFunction,
final ColumnInspector inspector,
final List<DimensionSpec> dimensions
)
{
return dimensions
.stream()
.allMatch(
dimension -> {
if (!dimension.canVectorize()) {
return false;
}

if (dimension.mustDecorate()) {
// group by on multi value dimensions are not currently supported
// DimensionSpecs that decorate may turn singly-valued columns into multi-valued selectors.
@@ -102,7 +106,7 @@ public static boolean canVectorizeDimensions(
}

// Now check column capabilities.
final ColumnCapabilities columnCapabilities = capabilitiesFunction.apply(dimension.getDimension());
final ColumnCapabilities columnCapabilities = inspector.getColumnCapabilities(dimension.getDimension());
// null here currently means the column does not exist, nil columns can be vectorized
if (columnCapabilities == null) {
return true;
@@ -39,6 +39,7 @@
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.query.filter.Filter;
import org.apache.druid.query.vector.VectorCursorGranularizer;
import org.apache.druid.segment.ColumnInspector;
import org.apache.druid.segment.SegmentMissingException;
import org.apache.druid.segment.StorageAdapter;
import org.apache.druid.segment.VirtualColumns;
@@ -66,7 +67,7 @@ public class TimeseriesQueryEngine
@VisibleForTesting
public TimeseriesQueryEngine()
{
this.bufferPool = new StupidPool<>("dummy", () -> ByteBuffer.allocate(1000000));
this.bufferPool = new StupidPool<>("dummy", () -> ByteBuffer.allocate(10000000));
}

@Inject
@@ -94,10 +95,12 @@ public Sequence<Result<TimeseriesResultValue>> process(final TimeseriesQuery que
final Granularity gran = query.getGranularity();
final boolean descending = query.isDescending();

final ColumnInspector inspector = query.getVirtualColumns().wrapInspector(adapter);

final boolean doVectorize = QueryContexts.getVectorize(query).shouldVectorize(
query.getAggregatorSpecs().stream().allMatch(aggregatorFactory -> aggregatorFactory.canVectorize(adapter))
adapter.canVectorize(filter, query.getVirtualColumns(), descending)
&& VirtualColumns.shouldVectorize(query, query.getVirtualColumns(), adapter)
&& adapter.canVectorize(filter, query.getVirtualColumns(), descending)
&& query.getAggregatorSpecs().stream().allMatch(aggregatorFactory -> aggregatorFactory.canVectorize(inspector))
);

final Sequence<Result<TimeseriesResultValue>> result;
@@ -115,7 +115,7 @@ long scanAndAggregate(
void initAggregateStore();

/**
* Closes all on heap {@link Aggregator} associated withe the aggregates processor
* Closes all on-heap {@link Aggregator} instances associated with the aggregates processor
*/
void closeAggregators();
}
@@ -219,13 +219,27 @@ private static ColumnCapabilities computeDimensionSpecCapabilities(
} else if (dimensionSpec.getExtractionFn() != null) {
// DimensionSpec is applying an extractionFn but *not* decorating. We have some insight into how the
// extractionFn will behave, so let's use it.
final boolean dictionaryEncoded;
final boolean unique;
final boolean sorted;
if (columnCapabilities != null) {
dictionaryEncoded = columnCapabilities.isDictionaryEncoded().isTrue();
unique = columnCapabilities.areDictionaryValuesUnique().isTrue();
sorted = columnCapabilities.areDictionaryValuesSorted().isTrue();
} else {
dictionaryEncoded = false;
unique = false;
sorted = false;
}

return new ColumnCapabilitiesImpl()
.setType(ValueType.STRING)
.setDictionaryValuesSorted(dimensionSpec.getExtractionFn().preservesOrdering())
.setDictionaryValuesUnique(dimensionSpec.getExtractionFn().getExtractionType()
== ExtractionFn.ExtractionType.ONE_TO_ONE)
.setHasMultipleValues(dimensionSpec.mustDecorate() || mayBeMultiValue(columnCapabilities));
.setDictionaryEncoded(dictionaryEncoded)
.setDictionaryValuesSorted(sorted && dimensionSpec.getExtractionFn().preservesOrdering())
.setDictionaryValuesUnique(
unique && dimensionSpec.getExtractionFn().getExtractionType() == ExtractionFn.ExtractionType.ONE_TO_ONE
)
.setHasMultipleValues(mayBeMultiValue(columnCapabilities));
} else {
// No transformation. Pass through underlying types.
return columnCapabilities;
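To see why the new code ANDs the underlying dictionary capabilities with the extraction fn's properties rather than trusting either alone, a toy illustration (plain Java; class name and extraction functions are hypothetical):

```java
import java.util.Arrays;

final class ExtractionFnCapabilitiesSketch
{
  public static void main(String[] args)
  {
    // A dictionary that is unique and sorted before any extraction fn runs.
    final String[] dictionary = {"bar", "baz", "foo"};

    // First-character extraction: not one-to-one, so values stop being unique.
    System.out.println(Arrays.toString(
        Arrays.stream(dictionary).map(s -> s.substring(0, 1)).toArray() // [b, b, f]
    ));

    // Last-character extraction: not order-preserving, so values stop being sorted.
    System.out.println(Arrays.toString(
        Arrays.stream(dictionary).map(s -> s.substring(s.length() - 1)).toArray() // [r, z, o]
    ));
  }
}
```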
@@ -318,8 +332,8 @@ private static <T> T makeVectorProcessorInternal(

switch (capabilities.getType()) {
case STRING:
// if column is not uniquely dictionary encoded, use an object selector
if (capabilities.isDictionaryEncoded().isFalse() || capabilities.areDictionaryValuesUnique().isFalse()) {
// let the processor factory decide if it prefers to use an object selector or dictionary encoded selector
if (!processorFactory.useDictionaryEncodedSelector(capabilities)) {
return processorFactory.makeObjectProcessor(
capabilities,
objectSelectorFn.apply(selectorFactory)
Expand Down