From 4390df7f90f6681fc40f6378270be4e1f1ba6702 Mon Sep 17 00:00:00 2001
From: Pranav Bhole
Date: Thu, 12 Sep 2024 10:19:26 -0700
Subject: [PATCH 01/47] Update doc for allowedHeaders

---
 docs/configuration/index.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/configuration/index.md b/docs/configuration/index.md
index 57b88b55fc21..05547e78a984 100644
--- a/docs/configuration/index.md
+++ b/docs/configuration/index.md
@@ -616,9 +616,10 @@ the [HDFS input source](../ingestion/input-sources.md#hdfs-input-source).
 You can set the following property to specify permissible protocols for
 the [HTTP input source](../ingestion/input-sources.md#http-input-source).

-|Property|Possible values|Description|Default|
-|--------|---------------|-----------|-------|
-|`druid.ingestion.http.allowedProtocols`|List of protocols|Allowed protocols for the HTTP input source.|`["http", "https"]`|
+|Property| Possible values | Description |Default|
+|--------|-----------------|--------------------------------------------------------------------------------------------------------------------------|-------|
+|`druid.ingestion.http.allowedProtocols`| List of protocols | Allowed protocols for the HTTP input source. |`["http", "https"]`|
+|`druid.ingestion.http.allowedHeaders`| List of headers | List of allowed request headers for the HTTP input source. Default is empty list, which means all headers are allowed to pass in the ingestion spec. |`[]`|

 ### External data access security configuration

From b6992760ba4bc3bd50cc23691d838d3dcb6556a1 Mon Sep 17 00:00:00 2001
From: Pranav Bhole
Date: Thu, 12 Sep 2024 10:22:40 -0700
Subject: [PATCH 02/47] Update doc for allowedHeaders

---
 docs/configuration/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/configuration/index.md b/docs/configuration/index.md
index 05547e78a984..16f7ba284f2d 100644
--- a/docs/configuration/index.md
+++ b/docs/configuration/index.md
@@ -619,7 +619,7 @@ the [HTTP input source](../ingestion/input-sources.md#http-input-source).
 |Property| Possible values | Description |Default|
 |--------|-----------------|--------------------------------------------------------------------------------------------------------------------------|-------|
 |`druid.ingestion.http.allowedProtocols`| List of protocols | Allowed protocols for the HTTP input source. |`["http", "https"]`|
-|`druid.ingestion.http.allowedHeaders`| List of headers | List of allowed request headers for the HTTP input source. Default is empty list, which means all headers are allowed to pass in the ingestion spec. |`[]`|
+|`druid.ingestion.http.allowedHeaders`| List of headers | A list of permitted request headers for the HTTP input source. By default, the list is empty, which means all headers are allowed in the ingestion specification. |`[]`|

 ### External data access security configuration
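To make the two settings concrete before the behavior reversal in PATCH 03 below, here is a hedged sketch of how they combine. The URI, credentials, and header values are illustrative only; the constructor shapes mirror the `HttpInputSourceTest` changes later in this series:

```java
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Sets;
import org.apache.druid.data.input.impl.HttpInputSource;
import org.apache.druid.data.input.impl.HttpInputSourceConfig;
import org.apache.druid.data.input.impl.systemfield.SystemFields;
import org.apache.druid.metadata.DefaultPasswordProvider;

import java.net.URI;

public class AllowedHeadersExample
{
  public static HttpInputSource buildExample()
  {
    // Corresponds to druid.ingestion.http.allowedHeaders=["R-Cookie", "Content-Type"]
    // on the middle manager or peon.
    HttpInputSourceConfig config = new HttpInputSourceConfig(
        null,                                       // null: default allowed protocols (http, https)
        Sets.newHashSet("R-Cookie", "Content-Type") // the configured header allow-list
    );

    // Request headers supplied by the ingestion spec; both names are on the allow-list,
    // so validation passes.
    return new HttpInputSource(
        ImmutableList.of(URI.create("http://example.com/data.json")), // illustrative URI
        "myUser",
        new DefaultPasswordProvider("mySecret"),
        SystemFields.none(),
        ImmutableMap.of("R-Cookie", "test", "Content-Type", "application/json"),
        config
    );
  }
}
```

Note the semantics at this point in the series: under PATCH 01 and PATCH 02, an empty allow-list admits every header supplied in the ingestion spec. PATCH 03 below reverses that default, so an empty list rejects any supplied header.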
From 14d764bf2408ed165ef58bc4d847d1e3604ecfcf Mon Sep 17 00:00:00 2001
From: Pranav Bhole
Date: Thu, 12 Sep 2024 22:04:30 -0700
Subject: [PATCH 03/47] Reversing the behavior for setting
 druid.ingestion.http.allowedHeaders

---
 docs/configuration/index.md | 8 ++++----
 .../data/input/impl/HttpInputSource.java | 16 +++++++++-------
 .../data/input/impl/HttpInputSourceTest.java | 19 ++++++++-----------
 3 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/docs/configuration/index.md b/docs/configuration/index.md
index 16f7ba284f2d..5fd31366ad0f 100644
--- a/docs/configuration/index.md
+++ b/docs/configuration/index.md
@@ -616,10 +616,10 @@ the [HDFS input source](../ingestion/input-sources.md#hdfs-input-source).
 You can set the following property to specify permissible protocols for
 the [HTTP input source](../ingestion/input-sources.md#http-input-source).

-|Property| Possible values | Description |Default|
-|--------|-----------------|--------------------------------------------------------------------------------------------------------------------------|-------|
-|`druid.ingestion.http.allowedProtocols`| List of protocols | Allowed protocols for the HTTP input source. |`["http", "https"]`|
-|`druid.ingestion.http.allowedHeaders`| List of headers | A list of permitted request headers for the HTTP input source. By default, the list is empty, which means all headers are allowed in the ingestion specification. |`[]`|
+|Property| Possible values | Description |Default|
+|--------|-----------------|--------------------------------------------------------------------------------------------------------------------------|-------|
+|`druid.ingestion.http.allowedProtocols`| List of protocols | Allowed protocols for the HTTP input source. |`["http", "https"]`|
+|`druid.ingestion.http.allowedHeaders`| List of headers | A list of permitted request headers for the HTTP input source. By default, the list is empty, which means no headers are allowed in the ingestion specification.
|`[]`| ### External data access security configuration diff --git a/processing/src/main/java/org/apache/druid/data/input/impl/HttpInputSource.java b/processing/src/main/java/org/apache/druid/data/input/impl/HttpInputSource.java index 12f2316fb67d..fc092ee68776 100644 --- a/processing/src/main/java/org/apache/druid/data/input/impl/HttpInputSource.java +++ b/processing/src/main/java/org/apache/druid/data/input/impl/HttpInputSource.java @@ -100,13 +100,15 @@ public static void throwIfInvalidProtocols(HttpInputSourceConfig config, List requestHeaders) { - if (config.getAllowedHeaders().size() > 0) { - for (Map.Entry entry : requestHeaders.entrySet()) { - if (!config.getAllowedHeaders().contains(StringUtils.toLowerCase(entry.getKey()))) { - throw InvalidInput.exception("Got forbidden header %s, allowed headers are only %s ", - entry.getKey(), config.getAllowedHeaders() - ); - } + String message = null; + if (config.getAllowedHeaders().size() == 0 && requestHeaders.size() > 0) { + message = "You can set the property druid.ingestion.http.allowedHeaders in middle managers or peons to whitelist request headers"; + } + for (Map.Entry entry : requestHeaders.entrySet()) { + if (!config.getAllowedHeaders().contains(StringUtils.toLowerCase(entry.getKey()))) { + throw InvalidInput.exception("Got forbidden header %s, allowed headers are only %s. %s", + entry.getKey(), config.getAllowedHeaders(), message + ); } } } diff --git a/processing/src/test/java/org/apache/druid/data/input/impl/HttpInputSourceTest.java b/processing/src/test/java/org/apache/druid/data/input/impl/HttpInputSourceTest.java index 118b56838b6c..bb3f2390c4fa 100644 --- a/processing/src/test/java/org/apache/druid/data/input/impl/HttpInputSourceTest.java +++ b/processing/src/test/java/org/apache/druid/data/input/impl/HttpInputSourceTest.java @@ -30,7 +30,6 @@ import org.apache.druid.data.input.impl.systemfield.SystemField; import org.apache.druid.data.input.impl.systemfield.SystemFields; import org.apache.druid.error.DruidException; -import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.metadata.DefaultPasswordProvider; import org.junit.Assert; import org.junit.Rule; @@ -41,8 +40,7 @@ import java.net.URI; import java.util.Collections; import java.util.EnumSet; -import java.util.Set; -import java.util.stream.Collectors; +import java.util.HashSet; public class HttpInputSourceTest { @@ -152,12 +150,17 @@ public void testSystemFields() } @Test - public void testAllowedHeaders() + public void testEmptyAllowedHeaders() { HttpInputSourceConfig httpInputSourceConfig = new HttpInputSourceConfig( null, - Sets.newHashSet("R-cookie", "Content-type") + new HashSet<>() ); + expectedException.expect(DruidException.class); + expectedException.expectMessage( + "Got forbidden header r-Cookie, allowed headers are only []. 
" + + "You can set the property druid.ingestion.http.allowedHeaders in middle managers or peons to whitelist request headers"); + final HttpInputSource inputSource = new HttpInputSource( ImmutableList.of(URI.create("http://test.com/http-test")), "myName", @@ -166,12 +169,6 @@ public void testAllowedHeaders() ImmutableMap.of("r-Cookie", "test", "Content-Type", "application/json"), httpInputSourceConfig ); - Set expectedSet = inputSource.getRequestHeaders() - .keySet() - .stream() - .map(StringUtils::toLowerCase) - .collect(Collectors.toSet()); - Assert.assertEquals(expectedSet, httpInputSourceConfig.getAllowedHeaders()); } @Test From 683d034234ceb9472e81b9c77b36b5140e1b2f70 Mon Sep 17 00:00:00 2001 From: Pranav Date: Thu, 12 Sep 2024 22:46:31 -0700 Subject: [PATCH 04/47] Update processing/src/main/java/org/apache/druid/data/input/impl/HttpInputSource.java Co-authored-by: Abhishek Agarwal <1477457+abhishekagarwal87@users.noreply.github.com> --- .../java/org/apache/druid/data/input/impl/HttpInputSource.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processing/src/main/java/org/apache/druid/data/input/impl/HttpInputSource.java b/processing/src/main/java/org/apache/druid/data/input/impl/HttpInputSource.java index fc092ee68776..fcaa66a9d581 100644 --- a/processing/src/main/java/org/apache/druid/data/input/impl/HttpInputSource.java +++ b/processing/src/main/java/org/apache/druid/data/input/impl/HttpInputSource.java @@ -106,7 +106,7 @@ public static void throwIfForbiddenHeaders(HttpInputSourceConfig config, Map entry : requestHeaders.entrySet()) { if (!config.getAllowedHeaders().contains(StringUtils.toLowerCase(entry.getKey()))) { - throw InvalidInput.exception("Got forbidden header %s, allowed headers are only %s. %s", + throw InvalidInput.exception("Got forbidden header [%s], allowed headers are only [%s]. You can control the allowed headers by updating druid.ingestion.http.allowedHeaders", entry.getKey(), config.getAllowedHeaders(), message ); } From 46c4e3aff8ea46e0163095d994aadeb69872e028 Mon Sep 17 00:00:00 2001 From: Pranav Bhole Date: Tue, 17 Sep 2024 16:09:53 -0700 Subject: [PATCH 05/47] fix tests --- .../data/input/impl/HttpInputSource.java | 6 +- .../sql/calcite/IngestTableFunctionTest.java | 129 ++++++++++++++---- 2 files changed, 104 insertions(+), 31 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/data/input/impl/HttpInputSource.java b/processing/src/main/java/org/apache/druid/data/input/impl/HttpInputSource.java index fcaa66a9d581..0c4c9197e5fd 100644 --- a/processing/src/main/java/org/apache/druid/data/input/impl/HttpInputSource.java +++ b/processing/src/main/java/org/apache/druid/data/input/impl/HttpInputSource.java @@ -100,14 +100,10 @@ public static void throwIfInvalidProtocols(HttpInputSourceConfig config, List requestHeaders) { - String message = null; - if (config.getAllowedHeaders().size() == 0 && requestHeaders.size() > 0) { - message = "You can set the property druid.ingestion.http.allowedHeaders in middle managers or peons to whitelist request headers"; - } for (Map.Entry entry : requestHeaders.entrySet()) { if (!config.getAllowedHeaders().contains(StringUtils.toLowerCase(entry.getKey()))) { throw InvalidInput.exception("Got forbidden header [%s], allowed headers are only [%s]. 
You can control the allowed headers by updating druid.ingestion.http.allowedHeaders", - entry.getKey(), config.getAllowedHeaders(), message + entry.getKey(), config.getAllowedHeaders() ); } } diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/IngestTableFunctionTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/IngestTableFunctionTest.java index 7401477e6d79..2ddcc82d2ce0 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/IngestTableFunctionTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/IngestTableFunctionTest.java @@ -20,9 +20,14 @@ package org.apache.druid.sql.calcite; import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.Module; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.module.SimpleModule; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; +import com.google.inject.Binder; import org.apache.calcite.avatica.SqlType; import org.apache.druid.catalog.model.Columns; import org.apache.druid.data.input.impl.CsvInputFormat; @@ -31,20 +36,28 @@ import org.apache.druid.data.input.impl.JsonInputFormat; import org.apache.druid.data.input.impl.LocalInputSource; import org.apache.druid.data.input.impl.systemfield.SystemFields; +import org.apache.druid.guice.DruidInjectorBuilder; +import org.apache.druid.initialization.DruidModule; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.UOE; import org.apache.druid.metadata.DefaultPasswordProvider; +import org.apache.druid.metadata.input.InputSourceModule; import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.column.RowSignature; import org.apache.druid.server.security.Access; import org.apache.druid.server.security.AuthConfig; import org.apache.druid.server.security.ForbiddenException; import org.apache.druid.sql.calcite.external.ExternalDataSource; +import org.apache.druid.sql.calcite.external.ExternalOperatorConversion; import org.apache.druid.sql.calcite.external.Externals; +import org.apache.druid.sql.calcite.external.HttpOperatorConversion; +import org.apache.druid.sql.calcite.external.InlineOperatorConversion; +import org.apache.druid.sql.calcite.external.LocalOperatorConversion; import org.apache.druid.sql.calcite.filtration.Filtration; import org.apache.druid.sql.calcite.planner.Calcites; import org.apache.druid.sql.calcite.util.CalciteTests; +import org.apache.druid.sql.guice.SqlBindings; import org.apache.druid.sql.http.SqlParameter; import org.hamcrest.CoreMatchers; import org.junit.internal.matchers.ThrowableMessageMatcher; @@ -53,8 +66,10 @@ import java.io.File; import java.net.URI; import java.net.URISyntaxException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.List; import static org.hamcrest.MatcherAssert.assertThat; import static org.junit.jupiter.api.Assertions.assertThrows; @@ -69,18 +84,9 @@ * query ensure that the resulting MSQ task is identical regardless of the path * taken. 
*/ +@SqlTestFrameworkConfig.ComponentSupplier(IngestTableFunctionTest.ExportComponentSupplier.class) public class IngestTableFunctionTest extends CalciteIngestionDmlTest { - protected static URI toURI(String uri) - { - try { - return new URI(uri); - } - catch (URISyntaxException e) { - throw new ISE("Bad URI: %s", uri); - } - } - protected final ExternalDataSource httpDataSource = new ExternalDataSource( new HttpInputSource( Collections.singletonList(toURI("http://foo.com/bar.csv")), @@ -97,6 +103,30 @@ protected static URI toURI(String uri) .add("z", ColumnType.LONG) .build() ); + protected final ExternalDataSource localDataSource = new ExternalDataSource( + new LocalInputSource( + null, + null, + Arrays.asList(new File("/tmp/foo.csv"), new File("/tmp/bar.csv")), + SystemFields.none() + ), + new CsvInputFormat(ImmutableList.of("x", "y", "z"), null, false, false, 0), + RowSignature.builder() + .add("x", ColumnType.STRING) + .add("y", ColumnType.STRING) + .add("z", ColumnType.LONG) + .build() + ); + + protected static URI toURI(String uri) + { + try { + return new URI(uri); + } + catch (URISyntaxException e) { + throw new ISE("Bad URI: %s", uri); + } + } /** * Basic use of EXTERN @@ -262,7 +292,7 @@ public void testHttpFn2() new DefaultPasswordProvider("secret"), SystemFields.none(), ImmutableMap.of("Accept", "application/ndjson", "a", "b"), - new HttpInputSourceConfig(null, null) + new HttpInputSourceConfig(null, Sets.newHashSet("Accept", "a")) ), new CsvInputFormat(ImmutableList.of("timestamp", "isRobot"), null, false, false, 0), RowSignature.builder() @@ -549,21 +579,6 @@ public void testInlineFn() .verify(); } - protected final ExternalDataSource localDataSource = new ExternalDataSource( - new LocalInputSource( - null, - null, - Arrays.asList(new File("/tmp/foo.csv"), new File("/tmp/bar.csv")), - SystemFields.none() - ), - new CsvInputFormat(ImmutableList.of("x", "y", "z"), null, false, false, 0), - RowSignature.builder() - .add("x", ColumnType.STRING) - .add("y", ColumnType.STRING) - .add("z", ColumnType.LONG) - .build() - ); - /** * Basic use of LOCALFILES */ @@ -702,4 +717,66 @@ public void testLocalFnNotNull() .expectLogicalPlanFrom("localExtern") .verify(); } + + protected static class ExportComponentSupplier extends IngestionDmlComponentSupplier + { + public ExportComponentSupplier(TempDirProducer tempFolderProducer) + { + super(tempFolderProducer); + } + + @Override + public void configureGuice(DruidInjectorBuilder builder) + { + builder.addModule(new DruidModule() + { + + // Clone of MSQExternalDataSourceModule since it is not + // visible here. + @Override + public List getJacksonModules() + { + return Collections.singletonList( + new SimpleModule(getClass().getSimpleName()) + .registerSubtypes(ExternalDataSource.class) + ); + } + + @Override + public void configure(Binder binder) + { + // Adding the config to allow following 2 headers. + binder.bind(HttpInputSourceConfig.class) + .toInstance(new HttpInputSourceConfig(null, ImmutableSet.of("Accept", "a"))); + + } + }); + + builder.addModule(new DruidModule() + { + + @Override + public List getJacksonModules() + { + // We want this module to bring input sources along for the ride. + List modules = new ArrayList<>(new InputSourceModule().getJacksonModules()); + modules.add(new SimpleModule("test-module").registerSubtypes(TestFileInputSource.class)); + return modules; + } + + @Override + public void configure(Binder binder) + { + // Set up the EXTERN macro. 
+ SqlBindings.addOperatorConversion(binder, ExternalOperatorConversion.class); + + // Enable the extended table functions for testing even though these + // are not enabled in production in Druid 26. + SqlBindings.addOperatorConversion(binder, HttpOperatorConversion.class); + SqlBindings.addOperatorConversion(binder, InlineOperatorConversion.class); + SqlBindings.addOperatorConversion(binder, LocalOperatorConversion.class); + } + }); + } + } } From f841a5d19125531d427a104dbb68e08f862c0d8c Mon Sep 17 00:00:00 2001 From: Abhishek Radhakrishnan Date: Thu, 12 Sep 2024 13:30:28 -0400 Subject: [PATCH 06/47] Add support for selective loading of broadcast datasources in the task layer (#17027) Tasks control the loading of broadcast datasources via BroadcastDatasourceLoadingSpec getBroadcastDatasourceLoadingSpec(). By default, tasks download all broadcast datasources, unless there's an override as with kill and MSQ controller task. The CLIPeon command line option --loadBroadcastSegments is deprecated in favor of --loadBroadcastDatasourceMode. Broadcast datasources can be specified in SQL queries through JOIN and FROM clauses, or obtained from other sources such as lookups.To this effect, we have introduced a BroadcastDatasourceLoadingSpec. Finding the set of broadcast datasources during SQL planning will be done in a follow-up, which will apply only to MSQ tasks, so they load only required broadcast datasources. This PR primarily focuses on the skeletal changes around BroadcastDatasourceLoadingSpec and integrating it from the Task interface via CliPeon to SegmentBootstrapper. Currently, only kill tasks and MSQ controller tasks skip loading broadcast datasources. --- .../overlord/common/DruidK8sConstants.java | 1 + .../overlord/taskadapter/K8sTaskAdapter.java | 6 +- .../taskadapter/PodTemplateTaskAdapter.java | 4 + .../PodTemplateTaskAdapterTest.java | 43 ++++- .../src/test/resources/expectedNoopJob.yaml | 2 + .../test/resources/expectedNoopJobBase.yaml | 2 + .../resources/expectedNoopJobLongIds.yaml | 2 + .../resources/expectedNoopJobNoTaskJson.yaml | 2 + .../expectedNoopJobTlsEnabledBase.yaml | 2 + .../druid/msq/indexing/MSQControllerTask.java | 7 + .../msq/indexing/MSQControllerTaskTest.java | 17 ++ .../common/task/KillUnusedSegmentsTask.java | 7 + .../druid/indexing/common/task/Task.java | 13 +- .../indexing/overlord/ForkingTaskRunner.java | 6 +- .../task/KillUnusedSegmentsTaskTest.java | 11 ++ .../BroadcastDatasourceLoadingSpec.java | 170 ++++++++++++++++++ .../coordination/SegmentBootstrapper.java | 32 +++- .../metrics/DataSourceTaskIdHolder.java | 15 +- .../BroadcastDatasourceLoadingSpecTest.java | 166 +++++++++++++++++ .../SegmentBootstrapperCacheTest.java | 10 +- .../coordination/SegmentBootstrapperTest.java | 145 ++++++++++++++- .../java/org/apache/druid/cli/CliPeon.java | 31 +++- 22 files changed, 673 insertions(+), 21 deletions(-) create mode 100644 server/src/main/java/org/apache/druid/server/coordination/BroadcastDatasourceLoadingSpec.java create mode 100644 server/src/test/java/org/apache/druid/server/coordination/BroadcastDatasourceLoadingSpecTest.java diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/common/DruidK8sConstants.java b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/common/DruidK8sConstants.java index 568f8ed5a117..644a7f109b25 100644 --- a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/common/DruidK8sConstants.java +++ 
b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/common/DruidK8sConstants.java @@ -37,6 +37,7 @@ public class DruidK8sConstants public static final String TASK_JSON_ENV = "TASK_JSON"; public static final String TASK_DIR_ENV = "TASK_DIR"; public static final String TASK_ID_ENV = "TASK_ID"; + public static final String LOAD_BROADCAST_DATASOURCE_MODE_ENV = "LOAD_BROADCAST_DATASOURCE_MODE"; public static final String LOAD_BROADCAST_SEGMENTS_ENV = "LOAD_BROADCAST_SEGMENTS"; public static final String JAVA_OPTS = "JAVA_OPTS"; public static final String DRUID_HOST_ENV = "druid_host"; diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/taskadapter/K8sTaskAdapter.java b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/taskadapter/K8sTaskAdapter.java index c15698803d9f..cc689f925f4f 100644 --- a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/taskadapter/K8sTaskAdapter.java +++ b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/taskadapter/K8sTaskAdapter.java @@ -444,12 +444,16 @@ private List generateCommand(Task task) } // If the task type is queryable, we need to load broadcast segments on the peon, used for - // join queries + // join queries. This is replaced by --loadBroadcastDatasourceMode option, but is preserved here + // for backwards compatibility and can be removed in a future release. if (task.supportsQueries()) { command.add("--loadBroadcastSegments"); command.add("true"); } + command.add("--loadBroadcastDatasourceMode"); + command.add(task.getBroadcastDatasourceLoadingSpec().getMode().toString()); + command.add("--taskId"); command.add(task.getId()); log.info( diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateTaskAdapter.java b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateTaskAdapter.java index e8aaf1bbab35..321fe3fcb3e8 100644 --- a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateTaskAdapter.java +++ b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateTaskAdapter.java @@ -280,6 +280,10 @@ private Collection getEnv(Task task) throws IOException .withName(DruidK8sConstants.TASK_ID_ENV) .withValue(task.getId()) .build(), + new EnvVarBuilder() + .withName(DruidK8sConstants.LOAD_BROADCAST_DATASOURCE_MODE_ENV) + .withValue(task.getBroadcastDatasourceLoadingSpec().getMode().toString()) + .build(), new EnvVarBuilder() .withName(DruidK8sConstants.LOAD_BROADCAST_SEGMENTS_ENV) .withValue(Boolean.toString(task.supportsQueries())) diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateTaskAdapterTest.java b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateTaskAdapterTest.java index ac2aaa705581..b25f23a25ddc 100644 --- a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateTaskAdapterTest.java +++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateTaskAdapterTest.java @@ -46,6 +46,7 @@ import 
org.apache.druid.k8s.overlord.execution.Selector; import org.apache.druid.k8s.overlord.execution.SelectorBasedPodTemplateSelectStrategy; import org.apache.druid.server.DruidNode; +import org.apache.druid.server.coordination.BroadcastDatasourceLoadingSpec; import org.apache.druid.tasklogs.TaskLogs; import org.easymock.EasyMock; import org.junit.Assert; @@ -537,6 +538,7 @@ public void test_fromTask_taskSupportsQueries() throws IOException EasyMock.expect(task.getId()).andReturn("id").anyTimes(); EasyMock.expect(task.getGroupId()).andReturn("groupid").anyTimes(); EasyMock.expect(task.getDataSource()).andReturn("datasource").anyTimes(); + EasyMock.expect(task.getBroadcastDatasourceLoadingSpec()).andReturn(BroadcastDatasourceLoadingSpec.ALL).anyTimes(); EasyMock.replay(task); Job actual = adapter.fromTask(task); @@ -550,7 +552,46 @@ public void test_fromTask_taskSupportsQueries() throws IOException } @Test - public void test_fromTask_withIndexKafkaPodTemplateInRuntimeProperites() throws IOException + public void test_fromTask_withBroadcastDatasourceLoadingModeAll() throws IOException + { + Path templatePath = Files.createFile(tempDir.resolve("noop.yaml")); + mapper.writeValue(templatePath.toFile(), podTemplateSpec); + + Properties props = new Properties(); + props.setProperty("druid.indexer.runner.k8s.podTemplate.base", templatePath.toString()); + props.setProperty("druid.indexer.runner.k8s.podTemplate.queryable", templatePath.toString()); + + PodTemplateTaskAdapter adapter = new PodTemplateTaskAdapter( + taskRunnerConfig, + taskConfig, + node, + mapper, + props, + taskLogs, + dynamicConfigRef + ); + + Task task = EasyMock.mock(Task.class); + EasyMock.expect(task.supportsQueries()).andReturn(true); + EasyMock.expect(task.getType()).andReturn("queryable").anyTimes(); + EasyMock.expect(task.getId()).andReturn("id").anyTimes(); + EasyMock.expect(task.getGroupId()).andReturn("groupid").anyTimes(); + EasyMock.expect(task.getDataSource()).andReturn("datasource").anyTimes(); + EasyMock.expect(task.getBroadcastDatasourceLoadingSpec()).andReturn(BroadcastDatasourceLoadingSpec.ALL).anyTimes(); + + EasyMock.replay(task); + Job actual = adapter.fromTask(task); + EasyMock.verify(task); + + Assertions.assertEquals(BroadcastDatasourceLoadingSpec.Mode.ALL.toString(), actual.getSpec().getTemplate() + .getSpec().getContainers() + .get(0).getEnv().stream() + .filter(env -> env.getName().equals(DruidK8sConstants.LOAD_BROADCAST_DATASOURCE_MODE_ENV)) + .collect(Collectors.toList()).get(0).getValue()); + } + + @Test + public void test_fromTask_withIndexKafkaPodTemplateInRuntimeProperties() throws IOException { Path baseTemplatePath = Files.createFile(tempDir.resolve("base.yaml")); mapper.writeValue(baseTemplatePath.toFile(), podTemplateSpec); diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJob.yaml b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJob.yaml index ddae7c0567f2..ac539c5da154 100644 --- a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJob.yaml +++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJob.yaml @@ -45,6 +45,8 @@ spec: value: "/tmp" - name: "TASK_ID" value: "id" + - name: "LOAD_BROADCAST_DATASOURCE_MODE" + value: "ALL" - name: "LOAD_BROADCAST_SEGMENTS" value: "false" - name: "TASK_JSON" diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobBase.yaml 
b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobBase.yaml index 532c3dd53e82..f7c2ff958bbc 100644 --- a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobBase.yaml +++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobBase.yaml @@ -45,6 +45,8 @@ spec: value: "/tmp" - name: "TASK_ID" value: "id" + - name: "LOAD_BROADCAST_DATASOURCE_MODE" + value: "ALL" - name: "LOAD_BROADCAST_SEGMENTS" value: "false" - name: "TASK_JSON" diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobLongIds.yaml b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobLongIds.yaml index d6c316dcdde8..3a3af1528b56 100644 --- a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobLongIds.yaml +++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobLongIds.yaml @@ -44,6 +44,8 @@ spec: value: "/tmp" - name: "TASK_ID" value: "api-issued_kill_wikipedia3_omjobnbc_1000-01-01T00:00:00.000Z_2023-05-14T00:00:00.000Z_2023-05-15T17:03:01.220Z" + - name: "LOAD_BROADCAST_DATASOURCE_MODE" + value: "ALL" - name: "LOAD_BROADCAST_SEGMENTS" value: "false" - name: "TASK_JSON" diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobNoTaskJson.yaml b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobNoTaskJson.yaml index 90ae99709598..ec7f9a062481 100644 --- a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobNoTaskJson.yaml +++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobNoTaskJson.yaml @@ -43,6 +43,8 @@ spec: value: "/tmp" - name: "TASK_ID" value: "id" + - name: "LOAD_BROADCAST_DATASOURCE_MODE" + value: "ALL" - name: "LOAD_BROADCAST_SEGMENTS" value: "false" image: one diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobTlsEnabledBase.yaml b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobTlsEnabledBase.yaml index 0e52beac9e32..84457fb3175c 100644 --- a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobTlsEnabledBase.yaml +++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobTlsEnabledBase.yaml @@ -44,6 +44,8 @@ spec: value: "/tmp" - name: "TASK_ID" value: "id" + - name: "LOAD_BROADCAST_DATASOURCE_MODE" + value: "ALL" - name: "LOAD_BROADCAST_SEGMENTS" value: "false" - name: "TASK_JSON" diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/MSQControllerTask.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/MSQControllerTask.java index c3f6feaab245..4ddc8274b9d0 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/MSQControllerTask.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/MSQControllerTask.java @@ -59,6 +59,7 @@ import org.apache.druid.rpc.StandardRetryPolicy; import org.apache.druid.rpc.indexing.OverlordClient; import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.server.coordination.BroadcastDatasourceLoadingSpec; import org.apache.druid.server.lookup.cache.LookupLoadingSpec; import org.apache.druid.server.security.Resource; import org.apache.druid.server.security.ResourceAction; @@ -374,4 +375,10 @@ public LookupLoadingSpec 
getLookupLoadingSpec() { return LookupLoadingSpec.NONE; } + + @Override + public BroadcastDatasourceLoadingSpec getBroadcastDatasourceLoadingSpec() + { + return BroadcastDatasourceLoadingSpec.NONE; + } } diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/MSQControllerTaskTest.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/MSQControllerTaskTest.java index 76586c1e1081..8d974285fb57 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/MSQControllerTaskTest.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/MSQControllerTaskTest.java @@ -35,6 +35,7 @@ import org.apache.druid.query.Druids; import org.apache.druid.query.scan.ScanQuery; import org.apache.druid.query.spec.MultipleIntervalSegmentSpec; +import org.apache.druid.server.coordination.BroadcastDatasourceLoadingSpec; import org.apache.druid.server.lookup.cache.LookupLoadingSpec; import org.apache.druid.sql.calcite.planner.ColumnMapping; import org.apache.druid.sql.calcite.planner.ColumnMappings; @@ -104,6 +105,22 @@ public void testGetDefaultLookupLoadingSpec() Assert.assertEquals(LookupLoadingSpec.NONE, controllerTask.getLookupLoadingSpec()); } + @Test + public void testGetDefaultBroadcastDatasourceLoadingSpec() + { + MSQControllerTask controllerTask = new MSQControllerTask( + null, + MSQ_SPEC, + null, + null, + null, + null, + null, + null + ); + Assert.assertEquals(BroadcastDatasourceLoadingSpec.NONE, controllerTask.getBroadcastDatasourceLoadingSpec()); + } + @Test public void testGetLookupLoadingSpecUsingLookupLoadingInfoInContext() { diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/KillUnusedSegmentsTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/KillUnusedSegmentsTask.java index e1f6d2915eea..06082a988d98 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/KillUnusedSegmentsTask.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/KillUnusedSegmentsTask.java @@ -47,6 +47,7 @@ import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.logger.Logger; +import org.apache.druid.server.coordination.BroadcastDatasourceLoadingSpec; import org.apache.druid.server.lookup.cache.LookupLoadingSpec; import org.apache.druid.server.security.ResourceAction; import org.apache.druid.timeline.DataSegment; @@ -412,6 +413,12 @@ public LookupLoadingSpec getLookupLoadingSpec() return LookupLoadingSpec.NONE; } + @Override + public BroadcastDatasourceLoadingSpec getBroadcastDatasourceLoadingSpec() + { + return BroadcastDatasourceLoadingSpec.NONE; + } + @Override public boolean isReady(TaskActionClient taskActionClient) throws Exception { diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/Task.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/Task.java index 003b39e606b0..cacdc47f520a 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/Task.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/Task.java @@ -41,13 +41,13 @@ import org.apache.druid.java.util.common.UOE; import org.apache.druid.query.Query; import org.apache.druid.query.QueryRunner; +import org.apache.druid.server.coordination.BroadcastDatasourceLoadingSpec; import org.apache.druid.server.lookup.cache.LookupLoadingSpec; import 
org.apache.druid.server.security.Resource; import org.apache.druid.server.security.ResourceAction; import org.apache.druid.server.security.ResourceType; import javax.annotation.Nonnull; -import javax.annotation.Nullable; import java.util.Map; import java.util.Optional; import java.util.Set; @@ -329,9 +329,18 @@ static TaskInfo toTaskIdentifierInfo(TaskInfo + *
<li>{@link BroadcastDatasourceLoadingSpec#mode}: This mode defines whether broadcastDatasources need to be
+ * loaded for the given task, or not. It can take 3 values: </li>
+ * <ul>
+ * <li> ALL: Load all the broadcast datasources.</li>
+ * <li> NONE: Load no broadcast datasources. </li>
+ * <li> ONLY_REQUIRED: Load only the broadcast datasources defined in broadcastDatasourcesToLoad </li>
+ * </ul>
+ * <li>{@link BroadcastDatasourceLoadingSpec#broadcastDatasourcesToLoad}: Defines the broadcastDatasources to load when the broadcastDatasourceLoadingMode is set to ONLY_REQUIRED.</li>
+ * </ol>
+ */
+public class BroadcastDatasourceLoadingSpec
+{
+  public static final String CTX_BROADCAST_DATASOURCE_LOADING_MODE = "broadcastDatasourceLoadingMode";
+  public static final String CTX_BROADCAST_DATASOURCES_TO_LOAD = "broadcastDatasourcesToLoad";
+
+  public enum Mode
+  {
+    ALL, NONE, ONLY_REQUIRED
+  }
+
+  private final Mode mode;
+  @Nullable
+  private final ImmutableSet<String> broadcastDatasourcesToLoad;
+
+  public static final BroadcastDatasourceLoadingSpec ALL = new BroadcastDatasourceLoadingSpec(Mode.ALL, null);
+  public static final BroadcastDatasourceLoadingSpec NONE = new BroadcastDatasourceLoadingSpec(Mode.NONE, null);
+
+  private BroadcastDatasourceLoadingSpec(Mode mode, @Nullable Set<String> broadcastDatasourcesToLoad)
+  {
+    this.mode = mode;
+    this.broadcastDatasourcesToLoad = broadcastDatasourcesToLoad == null ? null : ImmutableSet.copyOf(broadcastDatasourcesToLoad);
+  }
+
+  /**
+   * Creates a BroadcastDatasourceLoadingSpec which loads only the broadcast datasources present in the given set.
+   */
+  public static BroadcastDatasourceLoadingSpec loadOnly(Set<String> broadcastDatasourcesToLoad)
+  {
+    if (broadcastDatasourcesToLoad == null) {
+      throw InvalidInput.exception("Expected non-null set of broadcast datasources to load.");
+    }
+    return new BroadcastDatasourceLoadingSpec(Mode.ONLY_REQUIRED, broadcastDatasourcesToLoad);
+  }
+
+  public Mode getMode()
+  {
+    return mode;
+  }
+
+  /**
+   * @return A non-null immutable set of broadcast datasource names when {@link BroadcastDatasourceLoadingSpec#mode} is ONLY_REQUIRED, null otherwise.
+   */
+  public ImmutableSet<String> getBroadcastDatasourcesToLoad()
+  {
+    return broadcastDatasourcesToLoad;
+  }
+
+  public static BroadcastDatasourceLoadingSpec createFromContext(Map<String, Object> context, BroadcastDatasourceLoadingSpec defaultSpec)
+  {
+    if (context == null) {
+      return defaultSpec;
+    }
+
+    final Object broadcastDatasourceModeValue = context.get(CTX_BROADCAST_DATASOURCE_LOADING_MODE);
+    if (broadcastDatasourceModeValue == null) {
+      return defaultSpec;
+    }
+
+    final BroadcastDatasourceLoadingSpec.Mode broadcastDatasourceLoadingMode;
+    try {
+      broadcastDatasourceLoadingMode = BroadcastDatasourceLoadingSpec.Mode.valueOf(broadcastDatasourceModeValue.toString());
+    }
+    catch (IllegalArgumentException e) {
+      throw InvalidInput.exception(
+          "Invalid value of %s[%s]. Allowed values are %s",
+          CTX_BROADCAST_DATASOURCE_LOADING_MODE, broadcastDatasourceModeValue.toString(),
+          Arrays.asList(BroadcastDatasourceLoadingSpec.Mode.values())
+      );
+    }
+
+    if (broadcastDatasourceLoadingMode == Mode.NONE) {
+      return NONE;
+    } else if (broadcastDatasourceLoadingMode == Mode.ALL) {
+      return ALL;
+    } else if (broadcastDatasourceLoadingMode == Mode.ONLY_REQUIRED) {
+      final Collection<String> broadcastDatasourcesToLoad;
+      try {
+        broadcastDatasourcesToLoad = (Collection<String>) context.get(CTX_BROADCAST_DATASOURCES_TO_LOAD);
+      }
+      catch (ClassCastException e) {
+        throw InvalidInput.exception(
+            "Invalid value of %s[%s]. Please provide a comma-separated list of broadcast datasource names."
+ + " For example: [\"datasourceName1\", \"datasourceName2\"]", + CTX_BROADCAST_DATASOURCES_TO_LOAD, context.get(CTX_BROADCAST_DATASOURCES_TO_LOAD) + ); + } + + if (broadcastDatasourcesToLoad == null || broadcastDatasourcesToLoad.isEmpty()) { + throw InvalidInput.exception("Set of broadcast datasources to load cannot be %s for mode[ONLY_REQUIRED].", broadcastDatasourcesToLoad); + } + return BroadcastDatasourceLoadingSpec.loadOnly(new HashSet<>(broadcastDatasourcesToLoad)); + } else { + return defaultSpec; + } + } + + @Override + public String toString() + { + return "BroadcastDatasourceLoadingSpec{" + + "mode=" + mode + + ", broadcastDatasourcesToLoad=" + broadcastDatasourcesToLoad + + '}'; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + BroadcastDatasourceLoadingSpec that = (BroadcastDatasourceLoadingSpec) o; + return mode == that.mode && Objects.equals(broadcastDatasourcesToLoad, that.broadcastDatasourcesToLoad); + } + + @Override + public int hashCode() + { + return Objects.hash(mode, broadcastDatasourcesToLoad); + } +} diff --git a/server/src/main/java/org/apache/druid/server/coordination/SegmentBootstrapper.java b/server/src/main/java/org/apache/druid/server/coordination/SegmentBootstrapper.java index c5b71fbcddcf..7eec82e80b1c 100644 --- a/server/src/main/java/org/apache/druid/server/coordination/SegmentBootstrapper.java +++ b/server/src/main/java/org/apache/druid/server/coordination/SegmentBootstrapper.java @@ -39,12 +39,14 @@ import org.apache.druid.segment.loading.SegmentLoaderConfig; import org.apache.druid.segment.loading.SegmentLoadingException; import org.apache.druid.server.SegmentManager; +import org.apache.druid.server.metrics.DataSourceTaskIdHolder; import org.apache.druid.timeline.DataSegment; import javax.annotation.Nullable; import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.Set; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; @@ -80,6 +82,8 @@ public class SegmentBootstrapper private static final EmittingLogger log = new EmittingLogger(SegmentBootstrapper.class); + private final DataSourceTaskIdHolder datasourceHolder; + @Inject public SegmentBootstrapper( SegmentLoadDropHandler loadDropHandler, @@ -89,7 +93,8 @@ public SegmentBootstrapper( SegmentManager segmentManager, ServerTypeConfig serverTypeConfig, CoordinatorClient coordinatorClient, - ServiceEmitter emitter + ServiceEmitter emitter, + DataSourceTaskIdHolder datasourceHolder ) { this.loadDropHandler = loadDropHandler; @@ -100,6 +105,7 @@ public SegmentBootstrapper( this.serverTypeConfig = serverTypeConfig; this.coordinatorClient = coordinatorClient; this.emitter = emitter; + this.datasourceHolder = datasourceHolder; } @LifecycleStart @@ -261,10 +267,18 @@ private void loadSegmentsOnStartup() throws IOException /** * @return a list of bootstrap segments. When bootstrap segments cannot be found, an empty list is returned. + * The bootstrap segments returned are filtered by the broadcast datasources indicated by {@link DataSourceTaskIdHolder#getBroadcastDatasourceLoadingSpec()} + * if applicable. 
*/ private List getBootstrapSegments() { - log.info("Fetching bootstrap segments from the coordinator."); + final BroadcastDatasourceLoadingSpec.Mode mode = datasourceHolder.getBroadcastDatasourceLoadingSpec().getMode(); + if (mode == BroadcastDatasourceLoadingSpec.Mode.NONE) { + log.info("Skipping fetch of bootstrap segments."); + return ImmutableList.of(); + } + + log.info("Fetching bootstrap segments from the coordinator with BroadcastDatasourceLoadingSpec mode[%s].", mode); final Stopwatch stopwatch = Stopwatch.createStarted(); List bootstrapSegments = new ArrayList<>(); @@ -272,7 +286,18 @@ private List getBootstrapSegments() try { final BootstrapSegmentsResponse response = FutureUtils.getUnchecked(coordinatorClient.fetchBootstrapSegments(), true); - bootstrapSegments = ImmutableList.copyOf(response.getIterator()); + if (mode == BroadcastDatasourceLoadingSpec.Mode.ONLY_REQUIRED) { + final Set broadcastDatasourcesToLoad = datasourceHolder.getBroadcastDatasourceLoadingSpec().getBroadcastDatasourcesToLoad(); + final List filteredBroadcast = new ArrayList<>(); + response.getIterator().forEachRemaining(segment -> { + if (broadcastDatasourcesToLoad.contains(segment.getDataSource())) { + filteredBroadcast.add(segment); + } + }); + bootstrapSegments = filteredBroadcast; + } else { + bootstrapSegments = ImmutableList.copyOf(response.getIterator()); + } } catch (Exception e) { log.warn("Error fetching bootstrap segments from the coordinator: [%s]. ", e.getMessage()); @@ -284,7 +309,6 @@ private List getBootstrapSegments() emitter.emit(new ServiceMetricEvent.Builder().setMetric("segment/bootstrap/count", bootstrapSegments.size())); log.info("Fetched [%d] bootstrap segments in [%d]ms.", bootstrapSegments.size(), fetchRunMillis); } - return bootstrapSegments; } diff --git a/server/src/main/java/org/apache/druid/server/metrics/DataSourceTaskIdHolder.java b/server/src/main/java/org/apache/druid/server/metrics/DataSourceTaskIdHolder.java index 6d2dafd31a55..87002a5157f8 100644 --- a/server/src/main/java/org/apache/druid/server/metrics/DataSourceTaskIdHolder.java +++ b/server/src/main/java/org/apache/druid/server/metrics/DataSourceTaskIdHolder.java @@ -21,15 +21,16 @@ import com.google.inject.Inject; import com.google.inject.name.Named; +import org.apache.druid.server.coordination.BroadcastDatasourceLoadingSpec; import org.apache.druid.server.lookup.cache.LookupLoadingSpec; -import javax.annotation.Nullable; - public class DataSourceTaskIdHolder { public static final String DATA_SOURCE_BINDING = "druidDataSource"; public static final String TASK_ID_BINDING = "druidTaskId"; public static final String LOOKUPS_TO_LOAD_FOR_TASK = "lookupsToLoadForTask"; + public static final String BROADCAST_DATASOURCES_TO_LOAD_FOR_TASK = "broadcastDatasourcesToLoadForTask"; + @Named(DATA_SOURCE_BINDING) @Inject(optional = true) String dataSource = null; @@ -37,11 +38,14 @@ public class DataSourceTaskIdHolder @Inject(optional = true) String taskId = null; - @Nullable @Named(LOOKUPS_TO_LOAD_FOR_TASK) @Inject(optional = true) LookupLoadingSpec lookupLoadingSpec = LookupLoadingSpec.ALL; + @Named(BROADCAST_DATASOURCES_TO_LOAD_FOR_TASK) + @Inject(optional = true) + BroadcastDatasourceLoadingSpec broadcastDatasourceLoadingSpec = BroadcastDatasourceLoadingSpec.ALL; + public String getDataSource() { return dataSource; @@ -56,4 +60,9 @@ public LookupLoadingSpec getLookupLoadingSpec() { return lookupLoadingSpec; } + + public BroadcastDatasourceLoadingSpec getBroadcastDatasourceLoadingSpec() + { + return 
broadcastDatasourceLoadingSpec; + } } diff --git a/server/src/test/java/org/apache/druid/server/coordination/BroadcastDatasourceLoadingSpecTest.java b/server/src/test/java/org/apache/druid/server/coordination/BroadcastDatasourceLoadingSpecTest.java new file mode 100644 index 000000000000..ddec0901965d --- /dev/null +++ b/server/src/test/java/org/apache/druid/server/coordination/BroadcastDatasourceLoadingSpecTest.java @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.server.coordination; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import junitparams.JUnitParamsRunner; +import junitparams.Parameters; +import org.apache.druid.error.DruidException; +import org.apache.druid.java.util.common.StringUtils; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; + +import java.util.Arrays; +import java.util.Set; + +@RunWith(JUnitParamsRunner.class) +public class BroadcastDatasourceLoadingSpecTest +{ + @Test + public void testLoadingAllBroadcastDatasources() + { + final BroadcastDatasourceLoadingSpec spec = BroadcastDatasourceLoadingSpec.ALL; + Assert.assertEquals(BroadcastDatasourceLoadingSpec.Mode.ALL, spec.getMode()); + Assert.assertNull(spec.getBroadcastDatasourcesToLoad()); + } + + @Test + public void testLoadingNoLookups() + { + final BroadcastDatasourceLoadingSpec spec = BroadcastDatasourceLoadingSpec.NONE; + Assert.assertEquals(BroadcastDatasourceLoadingSpec.Mode.NONE, spec.getMode()); + Assert.assertNull(spec.getBroadcastDatasourcesToLoad()); + } + + @Test + public void testLoadingOnlyRequiredLookups() + { + final Set broadcastDatasourcesToLoad = ImmutableSet.of("ds1", "ds2"); + final BroadcastDatasourceLoadingSpec spec = BroadcastDatasourceLoadingSpec.loadOnly(ImmutableSet.of("ds1", "ds2")); + Assert.assertEquals(BroadcastDatasourceLoadingSpec.Mode.ONLY_REQUIRED, spec.getMode()); + Assert.assertEquals(broadcastDatasourcesToLoad, spec.getBroadcastDatasourcesToLoad()); + } + + @Test + public void testLoadingOnlyRequiredLookupsWithNullList() + { + DruidException exception = Assert.assertThrows(DruidException.class, () -> BroadcastDatasourceLoadingSpec.loadOnly(null)); + Assert.assertEquals("Expected non-null set of broadcast datasources to load.", exception.getMessage()); + } + + @Test + public void testCreateBroadcastLoadingSpecFromNullContext() + { + // Default spec is returned in the case of context=null. 
+ Assert.assertEquals( + BroadcastDatasourceLoadingSpec.NONE, + BroadcastDatasourceLoadingSpec.createFromContext( + null, + BroadcastDatasourceLoadingSpec.NONE + ) + ); + + Assert.assertEquals( + BroadcastDatasourceLoadingSpec.ALL, + BroadcastDatasourceLoadingSpec.createFromContext( + null, + BroadcastDatasourceLoadingSpec.ALL + ) + ); + } + + @Test + public void testCreateBroadcastLoadingSpecFromContext() + { + // Only required lookups are returned in the case of context having the lookup keys. + Assert.assertEquals( + BroadcastDatasourceLoadingSpec.loadOnly(ImmutableSet.of("ds1", "ds2")), + BroadcastDatasourceLoadingSpec.createFromContext( + ImmutableMap.of( + BroadcastDatasourceLoadingSpec.CTX_BROADCAST_DATASOURCES_TO_LOAD, Arrays.asList("ds1", "ds2"), + BroadcastDatasourceLoadingSpec.CTX_BROADCAST_DATASOURCE_LOADING_MODE, BroadcastDatasourceLoadingSpec.Mode.ONLY_REQUIRED + ), + BroadcastDatasourceLoadingSpec.ALL + ) + ); + + // No lookups are returned in the case of context having mode=NONE, irrespective of the default spec. + Assert.assertEquals( + BroadcastDatasourceLoadingSpec.NONE, + BroadcastDatasourceLoadingSpec.createFromContext( + ImmutableMap.of( + BroadcastDatasourceLoadingSpec.CTX_BROADCAST_DATASOURCE_LOADING_MODE, BroadcastDatasourceLoadingSpec.Mode.NONE), + BroadcastDatasourceLoadingSpec.ALL + ) + ); + + // All lookups are returned in the case of context having mode=ALL, irrespective of the default spec. + Assert.assertEquals( + BroadcastDatasourceLoadingSpec.ALL, + BroadcastDatasourceLoadingSpec.createFromContext( + ImmutableMap.of(BroadcastDatasourceLoadingSpec.CTX_BROADCAST_DATASOURCE_LOADING_MODE, BroadcastDatasourceLoadingSpec.Mode.ALL), + BroadcastDatasourceLoadingSpec.NONE + ) + ); + } + + @Test + @Parameters( + { + "NONE1", + "A", + "Random mode", + "all", + "only required", + "none" + } + ) + public void testSpecFromInvalidModeInContext(final String mode) + { + final DruidException exception = Assert.assertThrows(DruidException.class, () -> BroadcastDatasourceLoadingSpec.createFromContext( + ImmutableMap.of(BroadcastDatasourceLoadingSpec.CTX_BROADCAST_DATASOURCE_LOADING_MODE, mode), BroadcastDatasourceLoadingSpec.ALL)); + Assert.assertEquals(StringUtils.format("Invalid value of %s[%s]. Allowed values are [ALL, NONE, ONLY_REQUIRED]", + BroadcastDatasourceLoadingSpec.CTX_BROADCAST_DATASOURCE_LOADING_MODE, mode), exception.getMessage()); + } + + + @Test + @Parameters( + { + "foo bar", + "foo]" + } + ) + public void testSpecFromInvalidBroadcastDatasourcesInContext(final Object lookupsToLoad) + { + final DruidException exception = Assert.assertThrows(DruidException.class, () -> + BroadcastDatasourceLoadingSpec.createFromContext( + ImmutableMap.of( + BroadcastDatasourceLoadingSpec.CTX_BROADCAST_DATASOURCES_TO_LOAD, lookupsToLoad, + BroadcastDatasourceLoadingSpec.CTX_BROADCAST_DATASOURCE_LOADING_MODE, BroadcastDatasourceLoadingSpec.Mode.ONLY_REQUIRED), + BroadcastDatasourceLoadingSpec.ALL) + ); + Assert.assertEquals(StringUtils.format("Invalid value of %s[%s]. Please provide a comma-separated list of " + + "broadcast datasource names. 
For example: [\"datasourceName1\", \"datasourceName2\"]", + BroadcastDatasourceLoadingSpec.CTX_BROADCAST_DATASOURCES_TO_LOAD, lookupsToLoad), exception.getMessage()); + } +} diff --git a/server/src/test/java/org/apache/druid/server/coordination/SegmentBootstrapperCacheTest.java b/server/src/test/java/org/apache/druid/server/coordination/SegmentBootstrapperCacheTest.java index 7629a6b875c8..187725317a21 100644 --- a/server/src/test/java/org/apache/druid/server/coordination/SegmentBootstrapperCacheTest.java +++ b/server/src/test/java/org/apache/druid/server/coordination/SegmentBootstrapperCacheTest.java @@ -36,6 +36,7 @@ import org.apache.druid.segment.loading.StorageLocationConfig; import org.apache.druid.server.SegmentManager; import org.apache.druid.server.TestSegmentUtils; +import org.apache.druid.server.metrics.DataSourceTaskIdHolder; import org.apache.druid.timeline.DataSegment; import org.junit.Assert; import org.junit.Before; @@ -137,7 +138,8 @@ public void testLoadStartStopWithEmptyLocations() throws IOException segmentManager, new ServerTypeConfig(ServerType.HISTORICAL), coordinatorClient, - emitter + emitter, + new DataSourceTaskIdHolder() ); bootstrapper.start(); @@ -164,7 +166,8 @@ public void testLoadStartStop() throws IOException segmentManager, new ServerTypeConfig(ServerType.HISTORICAL), coordinatorClient, - emitter + emitter, + new DataSourceTaskIdHolder() ); bootstrapper.start(); @@ -204,7 +207,8 @@ public void testLoadLocalCache() throws IOException, SegmentLoadingException segmentManager, new ServerTypeConfig(ServerType.HISTORICAL), coordinatorClient, - emitter + emitter, + new DataSourceTaskIdHolder() ); bootstrapper.start(); diff --git a/server/src/test/java/org/apache/druid/server/coordination/SegmentBootstrapperTest.java b/server/src/test/java/org/apache/druid/server/coordination/SegmentBootstrapperTest.java index c41763f18245..fe1424e27005 100644 --- a/server/src/test/java/org/apache/druid/server/coordination/SegmentBootstrapperTest.java +++ b/server/src/test/java/org/apache/druid/server/coordination/SegmentBootstrapperTest.java @@ -20,13 +20,23 @@ package org.apache.druid.server.coordination; import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.google.inject.Guice; +import com.google.inject.Injector; +import com.google.inject.Key; +import com.google.inject.Scopes; +import com.google.inject.name.Names; +import org.apache.druid.guice.LazySingleton; +import org.apache.druid.guice.LifecycleModule; import org.apache.druid.guice.ServerTypeConfig; +import org.apache.druid.jackson.JacksonModule; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.emitter.EmittingLogger; import org.apache.druid.java.util.metrics.StubServiceEmitter; import org.apache.druid.segment.loading.SegmentLoaderConfig; import org.apache.druid.segment.loading.StorageLocationConfig; import org.apache.druid.server.SegmentManager; +import org.apache.druid.server.metrics.DataSourceTaskIdHolder; import org.apache.druid.timeline.DataSegment; import org.junit.Assert; import org.junit.Before; @@ -125,7 +135,8 @@ public void testStartStop() throws Exception segmentManager, new ServerTypeConfig(ServerType.HISTORICAL), coordinatorClient, - serviceEmitter + serviceEmitter, + new DataSourceTaskIdHolder() ); Assert.assertTrue(segmentManager.getDataSourceCounts().isEmpty()); @@ -184,7 +195,8 @@ public void testLoadCachedSegments() throws Exception segmentManager, new ServerTypeConfig(ServerType.HISTORICAL), coordinatorClient, - 
serviceEmitter + serviceEmitter, + new DataSourceTaskIdHolder() ); Assert.assertTrue(segmentManager.getDataSourceCounts().isEmpty()); @@ -240,7 +252,8 @@ public void testLoadBootstrapSegments() throws Exception segmentManager, new ServerTypeConfig(ServerType.HISTORICAL), coordinatorClient, - serviceEmitter + serviceEmitter, + new DataSourceTaskIdHolder() ); Assert.assertTrue(segmentManager.getDataSourceCounts().isEmpty()); @@ -267,6 +280,129 @@ public void testLoadBootstrapSegments() throws Exception bootstrapper.stop(); } + @Test + public void testLoadNoBootstrapSegments() throws Exception + { + final Set segments = new HashSet<>(); + for (int i = 0; i < COUNT; ++i) { + segments.add(makeSegment("test" + i, "1", Intervals.of("P1d/2011-04-01"))); + segments.add(makeSegment("test" + i, "1", Intervals.of("P1d/2011-04-02"))); + segments.add(makeSegment("test_two" + i, "1", Intervals.of("P1d/2011-04-01"))); + segments.add(makeSegment("test_two" + i, "1", Intervals.of("P1d/2011-04-02"))); + } + + Injector injector = Guice.createInjector( + new JacksonModule(), + new LifecycleModule(), + binder -> { + binder.bindScope(LazySingleton.class, Scopes.SINGLETON); + final BroadcastDatasourceLoadingSpec broadcastMode = BroadcastDatasourceLoadingSpec.NONE; + binder.bind(Key.get(BroadcastDatasourceLoadingSpec.class, Names.named(DataSourceTaskIdHolder.BROADCAST_DATASOURCES_TO_LOAD_FOR_TASK))) + .toInstance(broadcastMode); + } + ); + + final TestCoordinatorClient coordinatorClient = new TestCoordinatorClient(segments); + final TestSegmentCacheManager cacheManager = new TestSegmentCacheManager(); + final SegmentManager segmentManager = new SegmentManager(cacheManager); + final SegmentLoadDropHandler handler = new SegmentLoadDropHandler( + segmentLoaderConfig, + segmentAnnouncer, + segmentManager + ); + final SegmentBootstrapper bootstrapper = new SegmentBootstrapper( + handler, + segmentLoaderConfig, + segmentAnnouncer, + serverAnnouncer, + segmentManager, + new ServerTypeConfig(ServerType.HISTORICAL), + coordinatorClient, + serviceEmitter, + injector.getInstance(DataSourceTaskIdHolder.class) + ); + + Assert.assertTrue(segmentManager.getDataSourceCounts().isEmpty()); + + bootstrapper.start(); + + Assert.assertEquals(1, serverAnnouncer.getObservedCount()); + Assert.assertTrue(segmentManager.getDataSourceCounts().isEmpty()); + + final ImmutableList expectedBootstrapSegments = ImmutableList.of(); + + Assert.assertEquals(expectedBootstrapSegments, segmentAnnouncer.getObservedSegments()); + + Assert.assertEquals(expectedBootstrapSegments, cacheManager.getObservedBootstrapSegments()); + Assert.assertEquals(expectedBootstrapSegments, cacheManager.getObservedBootstrapSegmentsLoadedIntoPageCache()); + + bootstrapper.stop(); + } + + @Test + public void testLoadOnlyRequiredBootstrapSegments() throws Exception + { + final Set segments = new HashSet<>(); + final DataSegment ds1Segment1 = makeSegment("test1", "1", Intervals.of("P1D/2011-04-01")); + final DataSegment ds1Segment2 = makeSegment("test1", "1", Intervals.of("P1D/2012-04-01")); + final DataSegment ds2Segment1 = makeSegment("test2", "1", Intervals.of("P1d/2011-04-01")); + final DataSegment ds2Segment2 = makeSegment("test2", "1", Intervals.of("P1d/2012-04-01")); + segments.add(ds1Segment1); + segments.add(ds1Segment2); + segments.add(ds2Segment1); + segments.add(ds2Segment2); + + Injector injector = Guice.createInjector( + new JacksonModule(), + new LifecycleModule(), + binder -> { + binder.bindScope(LazySingleton.class, Scopes.SINGLETON); + final 
BroadcastDatasourceLoadingSpec broadcastMode = BroadcastDatasourceLoadingSpec.loadOnly(ImmutableSet.of("test1")); + binder.bind(Key.get(BroadcastDatasourceLoadingSpec.class, Names.named(DataSourceTaskIdHolder.BROADCAST_DATASOURCES_TO_LOAD_FOR_TASK))) + .toInstance(broadcastMode); + } + ); + + final TestCoordinatorClient coordinatorClient = new TestCoordinatorClient(segments); + final TestSegmentCacheManager cacheManager = new TestSegmentCacheManager(); + final SegmentManager segmentManager = new SegmentManager(cacheManager); + final SegmentLoadDropHandler handler = new SegmentLoadDropHandler( + segmentLoaderConfig, + segmentAnnouncer, + segmentManager + ); + final SegmentBootstrapper bootstrapper = new SegmentBootstrapper( + handler, + segmentLoaderConfig, + segmentAnnouncer, + serverAnnouncer, + segmentManager, + new ServerTypeConfig(ServerType.HISTORICAL), + coordinatorClient, + serviceEmitter, + injector.getInstance(DataSourceTaskIdHolder.class) + ); + + Assert.assertTrue(segmentManager.getDataSourceCounts().isEmpty()); + + bootstrapper.start(); + + Assert.assertEquals(1, serverAnnouncer.getObservedCount()); + Assert.assertFalse(segmentManager.getDataSourceCounts().isEmpty()); + Assert.assertEquals(ImmutableSet.of("test1"), segmentManager.getDataSourceNames()); + + final ImmutableList expectedBootstrapSegments = ImmutableList.of(ds1Segment2, ds1Segment1); + + Assert.assertEquals(expectedBootstrapSegments, segmentAnnouncer.getObservedSegments()); + + Assert.assertEquals(expectedBootstrapSegments, cacheManager.getObservedBootstrapSegments()); + Assert.assertEquals(expectedBootstrapSegments, cacheManager.getObservedBootstrapSegmentsLoadedIntoPageCache()); + serviceEmitter.verifyValue("segment/bootstrap/count", expectedBootstrapSegments.size()); + serviceEmitter.verifyEmitted("segment/bootstrap/time", 1); + + bootstrapper.stop(); + } + @Test public void testLoadBootstrapSegmentsWhenExceptionThrown() throws Exception { @@ -285,7 +421,8 @@ public void testLoadBootstrapSegmentsWhenExceptionThrown() throws Exception segmentManager, new ServerTypeConfig(ServerType.HISTORICAL), coordinatorClient, - serviceEmitter + serviceEmitter, + new DataSourceTaskIdHolder() ); Assert.assertTrue(segmentManager.getDataSourceCounts().isEmpty()); diff --git a/services/src/main/java/org/apache/druid/cli/CliPeon.java b/services/src/main/java/org/apache/druid/cli/CliPeon.java index 61a8ab7374e4..15374625d301 100644 --- a/services/src/main/java/org/apache/druid/cli/CliPeon.java +++ b/services/src/main/java/org/apache/druid/cli/CliPeon.java @@ -123,6 +123,7 @@ import org.apache.druid.server.DruidNode; import org.apache.druid.server.ResponseContextConfig; import org.apache.druid.server.SegmentManager; +import org.apache.druid.server.coordination.BroadcastDatasourceLoadingSpec; import org.apache.druid.server.coordination.SegmentBootstrapper; import org.apache.druid.server.coordination.ServerType; import org.apache.druid.server.coordination.ZkCoordinator; @@ -176,12 +177,26 @@ public class CliPeon extends GuiceRunnable private boolean isZkEnabled = true; /** + *
    This option is deprecated, see {@link #loadBroadcastDatasourcesMode} option.
    + * * If set to "true", the peon will bind classes necessary for loading broadcast segments. This is used for * queryable tasks, such as streaming ingestion tasks. + * */ - @Option(name = "--loadBroadcastSegments", title = "loadBroadcastSegments", description = "Enable loading of broadcast segments") + @Deprecated + @Option(name = "--loadBroadcastSegments", title = "loadBroadcastSegments", + description = "Enable loading of broadcast segments. This option is deprecated and will be removed in a" + + " future release. Use --loadBroadcastDatasourceMode instead.") public String loadBroadcastSegments = "false"; + /** + * Broadcast datasource loading mode. The peon will bind classes necessary required for loading broadcast segments if + * the mode is {@link BroadcastDatasourceLoadingSpec.Mode#ALL} or {@link BroadcastDatasourceLoadingSpec.Mode#ONLY_REQUIRED}. + */ + @Option(name = "--loadBroadcastDatasourceMode", title = "loadBroadcastDatasourceMode", + description = "Specify the broadcast datasource loading mode for the peon. Supported values are ALL, NONE, ONLY_REQUIRED.") + public String loadBroadcastDatasourcesMode = BroadcastDatasourceLoadingSpec.Mode.ALL.toString(); + @Option(name = "--taskId", title = "taskId", description = "TaskId for fetching task.json remotely") public String taskId = ""; @@ -274,7 +289,11 @@ public void configure(Binder binder) binder.bind(ServerTypeConfig.class).toInstance(new ServerTypeConfig(ServerType.fromString(serverType))); LifecycleModule.register(binder, Server.class); - if ("true".equals(loadBroadcastSegments)) { + final BroadcastDatasourceLoadingSpec.Mode mode = + BroadcastDatasourceLoadingSpec.Mode.valueOf(loadBroadcastDatasourcesMode); + if ("true".equals(loadBroadcastSegments) + || mode == BroadcastDatasourceLoadingSpec.Mode.ALL + || mode == BroadcastDatasourceLoadingSpec.Mode.ONLY_REQUIRED) { binder.install(new BroadcastSegmentLoadingModule()); } } @@ -340,6 +359,14 @@ public LookupLoadingSpec getLookupsToLoad(final Task task) { return task.getLookupLoadingSpec(); } + + @Provides + @LazySingleton + @Named(DataSourceTaskIdHolder.BROADCAST_DATASOURCES_TO_LOAD_FOR_TASK) + public BroadcastDatasourceLoadingSpec getBroadcastDatasourcesToLoad(final Task task) + { + return task.getBroadcastDatasourceLoadingSpec(); + } }, new QueryablePeonModule(), new IndexingServiceInputSourceModule(), From 7a14900ea1943c5b645bf9f90ecb792f543853f6 Mon Sep 17 00:00:00 2001 From: Abhishek Radhakrishnan Date: Fri, 13 Sep 2024 00:17:28 -0400 Subject: [PATCH 07/47] Provide `chmod` command for `-XX:OnOutOfMemoryError` from shell script (#17054) A command line arg -XX:OnOutOfMemoryError='chmod 644 ${project.parent.basedir}/target/*.hprof' was added to collect heap dumps: #17029 This arg is causing problems when running tests from Intellij. Intellij doesn't seem to likechmod 644, but this command works as expected in mvn. So as a workaround, add the chmod 644 ${BASE_DIR/target/*.hprof' command in a shell script that can then be executed when OnOutOfMemoryError happens to make Intellij happy. --- dev/chmod-heap-dumps.sh | 19 +++++++++++++++++++ pom.xml | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100755 dev/chmod-heap-dumps.sh diff --git a/dev/chmod-heap-dumps.sh b/dev/chmod-heap-dumps.sh new file mode 100755 index 000000000000..dbb9582224b1 --- /dev/null +++ b/dev/chmod-heap-dumps.sh @@ -0,0 +1,19 @@ +#!/bin/bash -eux + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +BASE_DIR=$(git rev-parse --show-toplevel) +chmod 644 ${BASE_DIR}/target/*.hprof diff --git a/pom.xml b/pom.xml index 47a6bd5cef30..893e4cb45f9c 100644 --- a/pom.xml +++ b/pom.xml @@ -1778,7 +1778,7 @@ -Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager -Daws.region=us-east-1 -Ddruid.test.stupidPool.poison=true - -XX:OnOutOfMemoryError='chmod 644 ${project.parent.basedir}/target/*.hprof' + -XX:OnOutOfMemoryError=${project.parent.basedir}/dev/chmod-heap-dumps.sh -XX:HeapDumpPath=${project.parent.basedir}/target -Ddruid.indexing.doubleStorage=double From a85c930042022ff0586b42186df2a5f4b627d699 Mon Sep 17 00:00:00 2001 From: Akshat Jain Date: Fri, 13 Sep 2024 11:35:45 +0530 Subject: [PATCH 08/47] Add window function drill tests for array_concat_agg for empty over scenarios (#17026) * Add window function drill tests for array_concat_agg for empty over scenarios * Cleanup sqlNativeIncompatible() as it's not needed now * Address review comment --- .../druid/sql/calcite/BaseCalciteQueryTest.java | 5 ----- .../druid/sql/calcite/DrillWindowQueryTest.java | 17 ++++++++++++++--- .../array_concat_agg/empty_over_1.e | 1 + .../array_concat_agg/empty_over_1.q | 4 ++++ .../array_concat_agg/only_sorting_column_1.e | 10 ++++++++++ .../array_concat_agg/only_sorting_column_1.q | 4 ++++ 6 files changed, 33 insertions(+), 8 deletions(-) create mode 100644 sql/src/test/resources/drill/window/queries/druid_queries/array_concat_agg/empty_over_1.e create mode 100644 sql/src/test/resources/drill/window/queries/druid_queries/array_concat_agg/empty_over_1.q create mode 100644 sql/src/test/resources/drill/window/queries/druid_queries/array_concat_agg/only_sorting_column_1.e create mode 100644 sql/src/test/resources/drill/window/queries/druid_queries/array_concat_agg/only_sorting_column_1.q diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/BaseCalciteQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/BaseCalciteQueryTest.java index fcaaf0448247..676bf8b4dd42 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/BaseCalciteQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/BaseCalciteQueryTest.java @@ -1217,11 +1217,6 @@ protected void skipVectorize() skipVectorize = true; } - protected void sqlNativeIncompatible() - { - assumeTrue(testBuilder().config.isRunningMSQ(), "test case is not SQL native compatible"); - } - protected void msqIncompatible() { assumeFalse(testBuilder().config.isRunningMSQ(), "test case is not MSQ compatible"); diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/DrillWindowQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/DrillWindowQueryTest.java index 0cc30c49c34d..01bab3f99de0 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/DrillWindowQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/DrillWindowQueryTest.java 
@@ -7731,13 +7731,10 @@ public void test_over_clause_with_only_partitioning_multiple_over_multiple_parti windowQueryTest(); } - // This test gives the following error on sql-native engine: - // Column[w0] of type[class org.apache.druid.query.rowsandcols.column.ColumnAccessorBasedColumn] cannot be sorted. @DrillTest("druid_queries/empty_and_non_empty_over/wikipedia_query_1") @Test public void test_empty_and_non_empty_over_wikipedia_query_1() { - sqlNativeIncompatible(); windowQueryTest(); } @@ -7803,4 +7800,18 @@ public void test_array_concat_agg_with_multiple_partition_columns_1() { windowQueryTest(); } + + @DrillTest("druid_queries/array_concat_agg/only_sorting_column_1") + @Test + public void test_array_concat_agg_with_only_sorting_column_1() + { + windowQueryTest(); + } + + @DrillTest("druid_queries/array_concat_agg/empty_over_1") + @Test + public void test_array_concat_agg_with_empty_over_1() + { + windowQueryTest(); + } } diff --git a/sql/src/test/resources/drill/window/queries/druid_queries/array_concat_agg/empty_over_1.e b/sql/src/test/resources/drill/window/queries/druid_queries/array_concat_agg/empty_over_1.e new file mode 100644 index 000000000000..486bf8740fe1 --- /dev/null +++ b/sql/src/test/resources/drill/window/queries/druid_queries/array_concat_agg/empty_over_1.e @@ -0,0 +1 @@ +Guatemala ["Guatemala"] diff --git a/sql/src/test/resources/drill/window/queries/druid_queries/array_concat_agg/empty_over_1.q b/sql/src/test/resources/drill/window/queries/druid_queries/array_concat_agg/empty_over_1.q new file mode 100644 index 000000000000..5448fb8d9bef --- /dev/null +++ b/sql/src/test/resources/drill/window/queries/druid_queries/array_concat_agg/empty_over_1.q @@ -0,0 +1,4 @@ +select +countryName, array_concat_agg(ARRAY[countryName], 10000) over () as c1 +from wikipedia where countryName='Guatemala' +group by countryName diff --git a/sql/src/test/resources/drill/window/queries/druid_queries/array_concat_agg/only_sorting_column_1.e b/sql/src/test/resources/drill/window/queries/druid_queries/array_concat_agg/only_sorting_column_1.e new file mode 100644 index 000000000000..8757e543a4f7 --- /dev/null +++ b/sql/src/test/resources/drill/window/queries/druid_queries/array_concat_agg/only_sorting_column_1.e @@ -0,0 +1,10 @@ +Austria null #de.wikipedia ["N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria"] +Austria Horsching #de.wikipedia ["N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria"] +Austria Vienna #de.wikipedia ["N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria"] +Austria Vienna #es.wikipedia ["N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria"] +Austria Vienna #tr.wikipedia ["N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria"] +Republic of Korea null #ko.wikipedia ["N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria","N/A","Republic of Korea","N/A","Republic of Korea","N/A","Republic of Korea","N/A","Republic of Korea","N/A","Republic of Korea"] +Republic of Korea Jeonju #ko.wikipedia ["N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria","N/A","Republic of Korea","N/A","Republic of Korea","N/A","Republic of Korea","N/A","Republic of Korea","N/A","Republic of Korea"] +Republic of Korea Seoul #ko.wikipedia ["N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria","N/A","Republic of Korea","N/A","Republic of Korea","N/A","Republic of Korea","N/A","Republic of 
Korea","N/A","Republic of Korea"] +Republic of Korea null #en.wikipedia ["N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria","N/A","Republic of Korea","N/A","Republic of Korea","N/A","Republic of Korea","N/A","Republic of Korea","N/A","Republic of Korea"] +Republic of Korea null #ja.wikipedia ["N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria","N/A","Austria","N/A","Republic of Korea","N/A","Republic of Korea","N/A","Republic of Korea","N/A","Republic of Korea","N/A","Republic of Korea"] diff --git a/sql/src/test/resources/drill/window/queries/druid_queries/array_concat_agg/only_sorting_column_1.q b/sql/src/test/resources/drill/window/queries/druid_queries/array_concat_agg/only_sorting_column_1.q new file mode 100644 index 000000000000..54bb76b1bd26 --- /dev/null +++ b/sql/src/test/resources/drill/window/queries/druid_queries/array_concat_agg/only_sorting_column_1.q @@ -0,0 +1,4 @@ +select countryName, cityName, channel, array_concat_agg(ARRAY['N/A', countryName], 10000) over (order by countryName) as c +from wikipedia +where countryName in ('Austria', 'Republic of Korea') and (cityName in ('Horsching', 'Vienna', 'Seoul', 'Jeonju') or cityName is null) +group by countryName, cityName, channel From 0d68de446153aabb823b96d96613fb98b52dc2c3 Mon Sep 17 00:00:00 2001 From: Rishabh Singh <6513075+findingrish@users.noreply.github.com> Date: Fri, 13 Sep 2024 11:47:11 +0530 Subject: [PATCH 09/47] Skip tombstone segment refresh in metadata cache (#17025) This PR #16890 introduced a change to skip adding tombstone segments to the cache. It turns out that as a side effect tombstone segments appear unavailable in the console. This happens because availability of a segment in Broker is determined from the metadata cache. The fix is to keep the segment in the metadata cache but skip them from refresh. This doesn't affect any functionality as metadata query for tombstone returns empty causing continuous refresh of those segments. --- .../AbstractSegmentMetadataCache.java | 24 +-- .../CoordinatorSegmentMetadataCache.java | 50 ++++-- .../CoordinatorSegmentMetadataCacheTest.java | 155 ++++++++++++------ .../BrokerSegmentMetadataCacheTest.java | 141 ++++++++++------ 4 files changed, 241 insertions(+), 129 deletions(-) diff --git a/server/src/main/java/org/apache/druid/segment/metadata/AbstractSegmentMetadataCache.java b/server/src/main/java/org/apache/druid/segment/metadata/AbstractSegmentMetadataCache.java index d918ec5e3f29..99d965ec643e 100644 --- a/server/src/main/java/org/apache/druid/segment/metadata/AbstractSegmentMetadataCache.java +++ b/server/src/main/java/org/apache/druid/segment/metadata/AbstractSegmentMetadataCache.java @@ -102,6 +102,13 @@ *
    * This class has an abstract method {@link #refresh(Set, Set)} which the child class must override * with the logic to build and cache table schema. + *
    + * Note on handling tombstone segments: + * These segments lack data or column information. + * Additionally, segment metadata queries, which are not yet implemented for tombstone segments + * (see: https://github.com/apache/druid/pull/12137) do not provide metadata for tombstones, + * leading to indefinite refresh attempts for these segments. + * Therefore, these segments are never added to the set of segments being refreshed. * * @param The type of information associated with the data source, which must extend {@link DataSourceInformation}. */ @@ -478,13 +485,6 @@ public int getTotalSegments() @VisibleForTesting public void addSegment(final DruidServerMetadata server, final DataSegment segment) { - // Skip adding tombstone segment to the cache. These segments lack data or column information. - // Additionally, segment metadata queries, which are not yet implemented for tombstone segments - // (see: https://github.com/apache/druid/pull/12137) do not provide metadata for tombstones, - // leading to indefinite refresh attempts for these segments. - if (segment.isTombstone()) { - return; - } // Get lock first so that we won't wait in ConcurrentMap.compute(). synchronized (lock) { // someday we could hypothetically remove broker special casing, whenever BrokerServerView supports tracking @@ -511,7 +511,11 @@ public void addSegment(final DruidServerMetadata server, final DataSegment segme segmentMetadata = AvailableSegmentMetadata .builder(segment, isRealtime, ImmutableSet.of(server), null, DEFAULT_NUM_ROWS) // Added without needing a refresh .build(); - markSegmentAsNeedRefresh(segment.getId()); + if (segment.isTombstone()) { + log.debug("Skipping refresh for tombstone segment."); + } else { + markSegmentAsNeedRefresh(segment.getId()); + } if (!server.isSegmentReplicationTarget()) { log.debug("Added new mutable segment [%s].", segment.getId()); markSegmentAsMutable(segment.getId()); @@ -557,10 +561,6 @@ public void addSegment(final DruidServerMetadata server, final DataSegment segme @VisibleForTesting public void removeSegment(final DataSegment segment) { - // tombstone segments are not present in the cache - if (segment.isTombstone()) { - return; - } // Get lock first so that we won't wait in ConcurrentMap.compute(). 
synchronized (lock) { log.debug("Segment [%s] is gone.", segment.getId()); diff --git a/server/src/main/java/org/apache/druid/segment/metadata/CoordinatorSegmentMetadataCache.java b/server/src/main/java/org/apache/druid/segment/metadata/CoordinatorSegmentMetadataCache.java index 321c33fa1dbf..24489e60acdc 100644 --- a/server/src/main/java/org/apache/druid/segment/metadata/CoordinatorSegmentMetadataCache.java +++ b/server/src/main/java/org/apache/druid/segment/metadata/CoordinatorSegmentMetadataCache.java @@ -374,9 +374,7 @@ public Iterator iterateSegmentMetadata() .withNumRows(metadata.get().getNumRows()) .build(); } else { - // mark it for refresh, however, this case shouldn't arise by design - markSegmentAsNeedRefresh(segmentId); - log.debug("SchemaMetadata for segmentId[%s] is absent.", segmentId); + markSegmentForRefreshIfNeeded(availableSegmentMetadata.getSegment()); return availableSegmentMetadata; } } @@ -403,9 +401,7 @@ public AvailableSegmentMetadata getAvailableSegmentMetadata(String datasource, S .withNumRows(metadata.get().getNumRows()) .build(); } else { - // mark it for refresh, however, this case shouldn't arise by design - markSegmentAsNeedRefresh(segmentId); - log.debug("SchemaMetadata for segmentId [%s] is absent.", segmentId); + markSegmentForRefreshIfNeeded(availableSegmentMetadata.getSegment()); } return availableSegmentMetadata; } @@ -686,22 +682,14 @@ public RowSignature buildDataSourceRowSignature(final String dataSource) final Map columnTypes = new LinkedHashMap<>(); if (segmentsMap != null && !segmentsMap.isEmpty()) { - for (SegmentId segmentId : segmentsMap.keySet()) { + for (Map.Entry entry : segmentsMap.entrySet()) { + SegmentId segmentId = entry.getKey(); Optional optionalSchema = segmentSchemaCache.getSchemaForSegment(segmentId); if (optionalSchema.isPresent()) { RowSignature rowSignature = optionalSchema.get().getSchemaPayload().getRowSignature(); mergeRowSignature(columnTypes, rowSignature); } else { - log.debug("SchemaMetadata for segmentId [%s] is absent.", segmentId); - - ImmutableDruidDataSource druidDataSource = - sqlSegmentsMetadataManager.getImmutableDataSourceWithUsedSegments(segmentId.getDataSource()); - - if (druidDataSource != null && druidDataSource.getSegment(segmentId) != null) { - // mark it for refresh only if it is used - // however, this case shouldn't arise by design - markSegmentAsNeedRefresh(segmentId); - } + markSegmentForRefreshIfNeeded(entry.getValue().getSegment()); } } } else { @@ -876,4 +864,32 @@ Optional mergeOrCreateRowSignature( return Optional.empty(); } } + + /** + * A segment schema can go missing. To ensure smooth functioning, segment is marked for refresh. + * It need not be refreshed in the following scenarios: + * - Tombstone segment, since they do not have any schema. + * - Unused segment which hasn't been yet removed from the cache. + * Any other scenario needs investigation. 
+ */ + private void markSegmentForRefreshIfNeeded(DataSegment segment) + { + SegmentId id = segment.getId(); + + log.debug("SchemaMetadata for segmentId [%s] is absent.", id); + + if (segment.isTombstone()) { + log.debug("Skipping refresh for tombstone segment [%s].", id); + return; + } + + ImmutableDruidDataSource druidDataSource = + sqlSegmentsMetadataManager.getImmutableDataSourceWithUsedSegments(segment.getDataSource()); + + if (druidDataSource != null && druidDataSource.getSegment(id) != null) { + markSegmentAsNeedRefresh(id); + } else { + log.debug("Skipping refresh for unused segment [%s].", id); + } + } } diff --git a/server/src/test/java/org/apache/druid/segment/metadata/CoordinatorSegmentMetadataCacheTest.java b/server/src/test/java/org/apache/druid/segment/metadata/CoordinatorSegmentMetadataCacheTest.java index 0c099cb551cb..22b0890e855e 100644 --- a/server/src/test/java/org/apache/druid/segment/metadata/CoordinatorSegmentMetadataCacheTest.java +++ b/server/src/test/java/org/apache/druid/segment/metadata/CoordinatorSegmentMetadataCacheTest.java @@ -32,6 +32,7 @@ import org.apache.druid.client.ImmutableDruidDataSource; import org.apache.druid.client.InternalQueryConfig; import org.apache.druid.data.input.InputRow; +import org.apache.druid.java.util.common.DateTimes; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.Pair; import org.apache.druid.java.util.common.StringUtils; @@ -2220,74 +2221,109 @@ protected void coldDatasourceSchemaExec() } @Test - public void testTombstoneSegmentIsNotAdded() throws InterruptedException + public void testTombstoneSegmentIsNotRefreshed() throws IOException { - String datasource = "newSegmentAddTest"; - CountDownLatch addSegmentLatch = new CountDownLatch(1); + String brokerInternalQueryConfigJson = "{\"context\": { \"priority\": 5} }"; + + TestHelper.makeJsonMapper(); + InternalQueryConfig internalQueryConfig = MAPPER.readValue( + MAPPER.writeValueAsString( + MAPPER.readValue(brokerInternalQueryConfigJson, InternalQueryConfig.class) + ), + InternalQueryConfig.class + ); + + QueryLifecycleFactory factoryMock = EasyMock.createMock(QueryLifecycleFactory.class); + QueryLifecycle lifecycleMock = EasyMock.createMock(QueryLifecycle.class); CoordinatorSegmentMetadataCache schema = new CoordinatorSegmentMetadataCache( - getQueryLifecycleFactory(walker), + factoryMock, serverView, SEGMENT_CACHE_CONFIG_DEFAULT, new NoopEscalator(), - new InternalQueryConfig(), + internalQueryConfig, new NoopServiceEmitter(), segmentSchemaCache, backFillQueue, sqlSegmentsMetadataManager, segmentsMetadataManagerConfigSupplier - ) - { - @Override - public void addSegment(final DruidServerMetadata server, final DataSegment segment) - { - super.addSegment(server, segment); - if (datasource.equals(segment.getDataSource())) { - addSegmentLatch.countDown(); - } - } - }; + ); - schema.onLeaderStart(); - schema.awaitInitialization(); + Map queryContext = ImmutableMap.of( + QueryContexts.PRIORITY_KEY, 5, + QueryContexts.BROKER_PARALLEL_MERGE_KEY, false + ); - DataSegment segment = new DataSegment( - datasource, - Intervals.of("2001/2002"), - "1", - Collections.emptyMap(), - Collections.emptyList(), - Collections.emptyList(), - TombstoneShardSpec.INSTANCE, - null, + DataSegment segment = newSegment("test", 0); + DataSegment tombstone = DataSegment.builder() + .dataSource("test") + .interval(Intervals.of("2012-01-01/2012-01-02")) + .version(DateTimes.of("2012-01-01T11:22:33.444Z").toString()) + .shardSpec(new TombstoneShardSpec()) + 
.loadSpec(Collections.singletonMap( + "type", + DataSegment.TOMBSTONE_LOADSPEC_TYPE + )) + .size(0) + .build(); + + final DruidServer historicalServer = druidServers.stream() + .filter(s -> s.getType().equals(ServerType.HISTORICAL)) + .findAny() + .orElse(null); + + Assert.assertNotNull(historicalServer); + final DruidServerMetadata historicalServerMetadata = historicalServer.getMetadata(); + + schema.addSegment(historicalServerMetadata, segment); + schema.addSegment(historicalServerMetadata, tombstone); + Assert.assertFalse(schema.getSegmentsNeedingRefresh().contains(tombstone.getId())); + + List segmentIterable = ImmutableList.of(segment.getId(), tombstone.getId()); + + SegmentMetadataQuery expectedMetadataQuery = new SegmentMetadataQuery( + new TableDataSource(segment.getDataSource()), + new MultipleSpecificSegmentSpec( + segmentIterable.stream() + .filter(id -> !id.equals(tombstone.getId())) + .map(SegmentId::toDescriptor) + .collect(Collectors.toList()) + ), + new AllColumnIncluderator(), + false, + queryContext, + EnumSet.of(SegmentMetadataQuery.AnalysisType.AGGREGATORS), + false, null, - 0 + null ); - Assert.assertEquals(6, schema.getTotalSegments()); + EasyMock.expect(factoryMock.factorize()).andReturn(lifecycleMock).once(); + EasyMock.expect(lifecycleMock.runSimple(expectedMetadataQuery, AllowAllAuthenticator.ALLOW_ALL_RESULT, Access.OK)) + .andReturn(QueryResponse.withEmptyContext(Sequences.empty())).once(); - serverView.addSegment(segment, ServerType.HISTORICAL); - Assert.assertTrue(addSegmentLatch.await(1, TimeUnit.SECONDS)); - Assert.assertEquals(0, addSegmentLatch.getCount()); + EasyMock.replay(factoryMock, lifecycleMock); - Assert.assertEquals(6, schema.getTotalSegments()); - List metadatas = schema - .getSegmentMetadataSnapshot() - .values() - .stream() - .filter(metadata -> datasource.equals(metadata.getSegment().getDataSource())) - .collect(Collectors.toList()); - Assert.assertEquals(0, metadatas.size()); + schema.refresh(Collections.singleton(segment.getId()), Collections.singleton("test")); - serverView.removeSegment(segment, ServerType.HISTORICAL); - Assert.assertEquals(6, schema.getTotalSegments()); - metadatas = schema - .getSegmentMetadataSnapshot() - .values() - .stream() - .filter(metadata -> datasource.equals(metadata.getSegment().getDataSource())) - .collect(Collectors.toList()); - Assert.assertEquals(0, metadatas.size()); + // verify that metadata query is not issued for tombstone segment + EasyMock.verify(factoryMock, lifecycleMock); + + // Verify that datasource schema building logic doesn't mark the tombstone segment for refresh + Assert.assertFalse(schema.getSegmentsNeedingRefresh().contains(tombstone.getId())); + + AvailableSegmentMetadata availableSegmentMetadata = schema.getAvailableSegmentMetadata("test", tombstone.getId()); + Assert.assertNotNull(availableSegmentMetadata); + // fetching metadata for tombstone segment shouldn't mark it for refresh + Assert.assertFalse(schema.getSegmentsNeedingRefresh().contains(tombstone.getId())); + + Set metadatas = new HashSet<>(); + schema.iterateSegmentMetadata().forEachRemaining(metadatas::add); + + Assert.assertEquals(1, metadatas.stream().filter(metadata -> metadata.getSegment().isTombstone()).count()); + + // iterating over entire metadata doesn't cause tombstone to be marked for refresh + Assert.assertFalse(schema.getSegmentsNeedingRefresh().contains(tombstone.getId())); } @Test @@ -2384,6 +2420,27 @@ public void refresh(Set segmentsToRefresh, Set dataSourcesToR 
Assert.assertTrue(schema.getSegmentsNeedingRefresh().contains(segments.get(1).getId())); Assert.assertFalse(schema.getSegmentsNeedingRefresh().contains(segments.get(2).getId())); + + AvailableSegmentMetadata availableSegmentMetadata = + schema.getAvailableSegmentMetadata(dataSource, segments.get(0).getId()); + + Assert.assertNotNull(availableSegmentMetadata); + // fetching metadata for unused segment shouldn't mark it for refresh + Assert.assertFalse(schema.getSegmentsNeedingRefresh().contains(segments.get(0).getId())); + + Set metadatas = new HashSet<>(); + schema.iterateSegmentMetadata().forEachRemaining(metadatas::add); + + Assert.assertEquals( + 1, + metadatas.stream() + .filter( + metadata -> + metadata.getSegment().getId().equals(segments.get(0).getId())).count() + ); + + // iterating over entire metadata doesn't cause unsed segment to be marked for refresh + Assert.assertFalse(schema.getSegmentsNeedingRefresh().contains(segments.get(0).getId())); } private void verifyFooDSSchema(CoordinatorSegmentMetadataCache schema, int columns) diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/schema/BrokerSegmentMetadataCacheTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/schema/BrokerSegmentMetadataCacheTest.java index d9b24ed011dc..b613c602f633 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/schema/BrokerSegmentMetadataCacheTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/schema/BrokerSegmentMetadataCacheTest.java @@ -37,6 +37,7 @@ import org.apache.druid.client.coordinator.CoordinatorClient; import org.apache.druid.client.coordinator.NoopCoordinatorClient; import org.apache.druid.data.input.InputRow; +import org.apache.druid.java.util.common.DateTimes; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.Pair; import org.apache.druid.java.util.common.guava.Sequences; @@ -1139,71 +1140,109 @@ public void testNoDatasourceSchemaWhenNoSegmentMetadata() throws InterruptedExce } @Test - public void testTombstoneSegmentIsNotAdded() throws InterruptedException + public void testTombstoneSegmentIsNotRefreshed() throws IOException { - String datasource = "newSegmentAddTest"; - CountDownLatch addSegmentLatch = new CountDownLatch(1); + String brokerInternalQueryConfigJson = "{\"context\": { \"priority\": 5} }"; + + TestHelper.makeJsonMapper(); + InternalQueryConfig internalQueryConfig = MAPPER.readValue( + MAPPER.writeValueAsString( + MAPPER.readValue(brokerInternalQueryConfigJson, InternalQueryConfig.class) + ), + InternalQueryConfig.class + ); + + QueryLifecycleFactory factoryMock = EasyMock.createMock(QueryLifecycleFactory.class); + QueryLifecycle lifecycleMock = EasyMock.createMock(QueryLifecycle.class); + BrokerSegmentMetadataCache schema = new BrokerSegmentMetadataCache( - CalciteTests.createMockQueryLifecycleFactory(walker, conglomerate), + factoryMock, serverView, - BrokerSegmentMetadataCacheConfig.create(), + SEGMENT_CACHE_CONFIG_DEFAULT, new NoopEscalator(), - new InternalQueryConfig(), + internalQueryConfig, new NoopServiceEmitter(), new PhysicalDatasourceMetadataFactory(globalTableJoinable, segmentManager), new NoopCoordinatorClient(), CentralizedDatasourceSchemaConfig.create() - ) - { - @Override - public void addSegment(final DruidServerMetadata server, final DataSegment segment) - { - super.addSegment(server, segment); - if (datasource.equals(segment.getDataSource())) { - addSegmentLatch.countDown(); - } - } - }; + ); - schema.start(); - schema.awaitInitialization(); + Map queryContext = ImmutableMap.of( 
+ QueryContexts.PRIORITY_KEY, 5, + QueryContexts.BROKER_PARALLEL_MERGE_KEY, false + ); - DataSegment segment = new DataSegment( - datasource, - Intervals.of("2001/2002"), - "1", - Collections.emptyMap(), - Collections.emptyList(), - Collections.emptyList(), - TombstoneShardSpec.INSTANCE, - null, + DataSegment segment = newSegment("test", 0); + DataSegment tombstone = DataSegment.builder() + .dataSource("test") + .interval(Intervals.of("2012-01-01/2012-01-02")) + .version(DateTimes.of("2012-01-01T11:22:33.444Z").toString()) + .shardSpec(new TombstoneShardSpec()) + .loadSpec(Collections.singletonMap( + "type", + DataSegment.TOMBSTONE_LOADSPEC_TYPE + )) + .size(0) + .build(); + + final ImmutableDruidServer historicalServer = druidServers.stream() + .filter(s -> s.getType().equals(ServerType.HISTORICAL)) + .findAny() + .orElse(null); + + Assert.assertNotNull(historicalServer); + final DruidServerMetadata historicalServerMetadata = historicalServer.getMetadata(); + + schema.addSegment(historicalServerMetadata, segment); + schema.addSegment(historicalServerMetadata, tombstone); + Assert.assertFalse(schema.getSegmentsNeedingRefresh().contains(tombstone.getId())); + + List segmentIterable = ImmutableList.of(segment.getId(), tombstone.getId()); + + SegmentMetadataQuery expectedMetadataQuery = new SegmentMetadataQuery( + new TableDataSource(segment.getDataSource()), + new MultipleSpecificSegmentSpec( + segmentIterable.stream() + .filter(id -> !id.equals(tombstone.getId())) + .map(SegmentId::toDescriptor) + .collect(Collectors.toList()) + ), + new AllColumnIncluderator(), + false, + queryContext, + EnumSet.noneOf(SegmentMetadataQuery.AnalysisType.class), + false, null, - 0 + null ); - Assert.assertEquals(6, schema.getTotalSegments()); + EasyMock.expect(factoryMock.factorize()).andReturn(lifecycleMock).once(); + EasyMock.expect(lifecycleMock.runSimple(expectedMetadataQuery, AllowAllAuthenticator.ALLOW_ALL_RESULT, Access.OK)) + .andReturn(QueryResponse.withEmptyContext(Sequences.empty())); - serverView.addSegment(segment, ServerType.HISTORICAL); - Assert.assertTrue(addSegmentLatch.await(1, TimeUnit.SECONDS)); - Assert.assertEquals(0, addSegmentLatch.getCount()); + EasyMock.replay(factoryMock, lifecycleMock); - Assert.assertEquals(6, schema.getTotalSegments()); - List metadatas = schema - .getSegmentMetadataSnapshot() - .values() - .stream() - .filter(metadata -> datasource.equals(metadata.getSegment().getDataSource())) - .collect(Collectors.toList()); - Assert.assertEquals(0, metadatas.size()); - - serverView.removeSegment(segment, ServerType.HISTORICAL); - Assert.assertEquals(6, schema.getTotalSegments()); - metadatas = schema - .getSegmentMetadataSnapshot() - .values() - .stream() - .filter(metadata -> datasource.equals(metadata.getSegment().getDataSource())) - .collect(Collectors.toList()); - Assert.assertEquals(0, metadatas.size()); + Set segmentsToRefresh = new HashSet<>(); + segmentsToRefresh.add(segment.getId()); + schema.refresh(segmentsToRefresh, Collections.singleton("test")); + + // verify that metadata is not issued for tombstone segment + EasyMock.verify(factoryMock, lifecycleMock); + + // Verify that datasource schema building logic doesn't mark the tombstone segment for refresh + Assert.assertFalse(schema.getSegmentsNeedingRefresh().contains(tombstone.getId())); + + AvailableSegmentMetadata availableSegmentMetadata = schema.getAvailableSegmentMetadata("test", tombstone.getId()); + Assert.assertNotNull(availableSegmentMetadata); + // fetching metadata for tombstone segment shouldn't mark 
it for refresh + Assert.assertFalse(schema.getSegmentsNeedingRefresh().contains(tombstone.getId())); + + Set metadatas = new HashSet<>(); + schema.iterateSegmentMetadata().forEachRemaining(metadatas::add); + + Assert.assertEquals(1, metadatas.stream().filter(metadata -> metadata.getSegment().isTombstone()).count()); + + // iterating over entire metadata doesn't cause tombstone to be marked for refresh + Assert.assertFalse(schema.getSegmentsNeedingRefresh().contains(tombstone.getId())); } } From 01f1aa19aa9ee0a41cd04ef0dcc4b6e5d3f7ef50 Mon Sep 17 00:00:00 2001 From: Abhishek Radhakrishnan Date: Fri, 13 Sep 2024 05:24:07 -0400 Subject: [PATCH 10/47] Bump up -Xmx2500m from 2GB and keep MaxDirectMemorySize as 2500m as well. (#17056) --- pom.xml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 893e4cb45f9c..de7543867610 100644 --- a/pom.xml +++ b/pom.xml @@ -1765,9 +1765,7 @@ @{jacocoArgLine} ${jdk.strong.encapsulation.argLine} ${jdk.security.manager.allow.argLine} - - -Xmx2048m + -Xmx2500m -XX:MaxDirectMemorySize=2500m -XX:+ExitOnOutOfMemoryError -XX:+HeapDumpOnOutOfMemoryError From 8d3c4793a89fc91b41eeaa02cba8d4fa007eec88 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Fri, 13 Sep 2024 12:52:35 -0700 Subject: [PATCH 11/47] add CursorHolder.isPreAggregated method to allow cursors on pre-aggregated data (#17058) changes: * CursorHolder.isPreAggregated method indicates that a cursor has pre-aggregated data for all AggregatorFactory specified in a CursorBuildSpec. If true, engines should rewrite the query to use AggregatorFactory.getCombiningAggreggator, and column selector factories will provide selectors with the aggregator interediate type for the aggregator factory name * Added groupby, timeseries, and topN support for CursorHolder.isPreAggregated * Added synthetic test since no CursorHolder implementations support isPreAggregated at this point in time --- .../query/aggregation/AggregatorUtil.java | 9 + .../druid/query/groupby/GroupingEngine.java | 4 + .../query/timeseries/TimeseriesQuery.java | 5 + .../timeseries/TimeseriesQueryEngine.java | 6 +- .../druid/query/topn/TopNQueryEngine.java | 7 +- .../apache/druid/segment/CursorHolder.java | 17 ++ .../druid/segment/CursorHolderPreaggTest.java | 267 ++++++++++++++++++ 7 files changed, 312 insertions(+), 3 deletions(-) create mode 100644 processing/src/test/java/org/apache/druid/segment/CursorHolderPreaggTest.java diff --git a/processing/src/main/java/org/apache/druid/query/aggregation/AggregatorUtil.java b/processing/src/main/java/org/apache/druid/query/aggregation/AggregatorUtil.java index c4c9a7875ef0..4d12327c8965 100755 --- a/processing/src/main/java/org/apache/druid/query/aggregation/AggregatorUtil.java +++ b/processing/src/main/java/org/apache/druid/query/aggregation/AggregatorUtil.java @@ -454,4 +454,13 @@ public static boolean shouldUseObjectColumnAggregatorWrapper( } return false; } + + public static List getCombiningAggregators(List aggs) + { + List combining = new ArrayList<>(aggs.size()); + for (AggregatorFactory agg : aggs) { + combining.add(agg.getCombiningFactory()); + } + return combining; + } } diff --git a/processing/src/main/java/org/apache/druid/query/groupby/GroupingEngine.java b/processing/src/main/java/org/apache/druid/query/groupby/GroupingEngine.java index 0962e5400e28..ce63050a7e61 100644 --- a/processing/src/main/java/org/apache/druid/query/groupby/GroupingEngine.java +++ b/processing/src/main/java/org/apache/druid/query/groupby/GroupingEngine.java @@ -61,6 +61,7 @@ import 
org.apache.druid.query.ResourceLimitExceededException; import org.apache.druid.query.ResultMergeQueryRunner; import org.apache.druid.query.aggregation.AggregatorFactory; +import org.apache.druid.query.aggregation.AggregatorUtil; import org.apache.druid.query.aggregation.PostAggregator; import org.apache.druid.query.context.ResponseContext; import org.apache.druid.query.dimension.DefaultDimensionSpec; @@ -508,6 +509,9 @@ public Sequence process( final CursorBuildSpec buildSpec = makeCursorBuildSpec(query, groupByQueryMetrics); final CursorHolder cursorHolder = closer.register(cursorFactory.makeCursorHolder(buildSpec)); + if (cursorHolder.isPreAggregated()) { + query = query.withAggregatorSpecs(AggregatorUtil.getCombiningAggregators(query.getAggregatorSpecs())); + } final ColumnInspector inspector = query.getVirtualColumns().wrapInspector(cursorFactory); // group by specific vectorization check diff --git a/processing/src/main/java/org/apache/druid/query/timeseries/TimeseriesQuery.java b/processing/src/main/java/org/apache/druid/query/timeseries/TimeseriesQuery.java index 6e2cb62adcf9..88d488f85b9c 100644 --- a/processing/src/main/java/org/apache/druid/query/timeseries/TimeseriesQuery.java +++ b/processing/src/main/java/org/apache/druid/query/timeseries/TimeseriesQuery.java @@ -243,6 +243,11 @@ public TimeseriesQuery withDimFilter(DimFilter dimFilter) return Druids.TimeseriesQueryBuilder.copy(this).filters(dimFilter).build(); } + public TimeseriesQuery withAggregatorSpecs(List aggregatorSpecs) + { + return Druids.TimeseriesQueryBuilder.copy(this).aggregators(aggregatorSpecs).build(); + } + public TimeseriesQuery withPostAggregatorSpecs(final List postAggregatorSpecs) { return Druids.TimeseriesQueryBuilder.copy(this).postAggregators(postAggregatorSpecs).build(); diff --git a/processing/src/main/java/org/apache/druid/query/timeseries/TimeseriesQueryEngine.java b/processing/src/main/java/org/apache/druid/query/timeseries/TimeseriesQueryEngine.java index dd5a8cb2b58c..dbec221248e9 100644 --- a/processing/src/main/java/org/apache/druid/query/timeseries/TimeseriesQueryEngine.java +++ b/processing/src/main/java/org/apache/druid/query/timeseries/TimeseriesQueryEngine.java @@ -38,6 +38,7 @@ import org.apache.druid.query.aggregation.Aggregator; import org.apache.druid.query.aggregation.AggregatorAdapters; import org.apache.druid.query.aggregation.AggregatorFactory; +import org.apache.druid.query.aggregation.AggregatorUtil; import org.apache.druid.query.vector.VectorCursorGranularizer; import org.apache.druid.segment.ColumnSelectorFactory; import org.apache.druid.segment.Cursor; @@ -86,7 +87,7 @@ public TimeseriesQueryEngine( * scoped down to a single interval before calling this method. 
*/ public Sequence> process( - final TimeseriesQuery query, + TimeseriesQuery query, final CursorFactory cursorFactory, @Nullable TimeBoundaryInspector timeBoundaryInspector, @Nullable final TimeseriesQueryMetrics timeseriesQueryMetrics @@ -102,6 +103,9 @@ public Sequence> process( final Granularity gran = query.getGranularity(); final CursorHolder cursorHolder = cursorFactory.makeCursorHolder(makeCursorBuildSpec(query, timeseriesQueryMetrics)); + if (cursorHolder.isPreAggregated()) { + query = query.withAggregatorSpecs(AggregatorUtil.getCombiningAggregators(query.getAggregatorSpecs())); + } try { final Sequence> result; diff --git a/processing/src/main/java/org/apache/druid/query/topn/TopNQueryEngine.java b/processing/src/main/java/org/apache/druid/query/topn/TopNQueryEngine.java index 442e04552f17..d10d26242e3f 100644 --- a/processing/src/main/java/org/apache/druid/query/topn/TopNQueryEngine.java +++ b/processing/src/main/java/org/apache/druid/query/topn/TopNQueryEngine.java @@ -30,6 +30,7 @@ import org.apache.druid.query.QueryMetrics; import org.apache.druid.query.Result; import org.apache.druid.query.aggregation.AggregatorFactory; +import org.apache.druid.query.aggregation.AggregatorUtil; import org.apache.druid.query.extraction.ExtractionFn; import org.apache.druid.query.topn.types.TopNColumnAggregatesProcessor; import org.apache.druid.query.topn.types.TopNColumnAggregatesProcessorFactory; @@ -73,7 +74,7 @@ public TopNQueryEngine(NonBlockingPool bufferPool) * update {@link TopNResultValue} */ public Sequence> query( - final TopNQuery query, + TopNQuery query, final Segment segment, @Nullable final TopNQueryMetrics queryMetrics ) @@ -87,6 +88,9 @@ public Sequence> query( final CursorBuildSpec buildSpec = makeCursorBuildSpec(query, queryMetrics); final CursorHolder cursorHolder = cursorFactory.makeCursorHolder(buildSpec); + if (cursorHolder.isPreAggregated()) { + query = query.withAggregatorSpecs(AggregatorUtil.getCombiningAggregators(query.getAggregatorSpecs())); + } final Cursor cursor = cursorHolder.asCursor(); if (cursor == null) { return Sequences.withBaggage(Sequences.empty(), cursorHolder); @@ -127,7 +131,6 @@ public Sequence> query( return Sequences.withBaggage(Sequences.empty(), cursorHolder); } - if (queryMetrics != null) { queryMetrics.cursor(cursor); } diff --git a/processing/src/main/java/org/apache/druid/segment/CursorHolder.java b/processing/src/main/java/org/apache/druid/segment/CursorHolder.java index a70fd8757e13..79bf2b4e557a 100644 --- a/processing/src/main/java/org/apache/druid/segment/CursorHolder.java +++ b/processing/src/main/java/org/apache/druid/segment/CursorHolder.java @@ -22,6 +22,7 @@ import org.apache.druid.java.util.common.UOE; import org.apache.druid.query.Order; import org.apache.druid.query.OrderBy; +import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.column.ColumnHolder; import org.apache.druid.segment.vector.VectorCursor; @@ -58,6 +59,22 @@ default boolean canVectorize() return false; } + /** + * Returns true if the {@link Cursor} or {@link VectorCursor} contains pre-aggregated columns for all + * {@link AggregatorFactory} specified in {@link CursorBuildSpec#getAggregators()}. + *
    + * If this method returns true, {@link ColumnSelectorFactory} and + * {@link org.apache.druid.segment.vector.VectorColumnSelectorFactory} created from {@link Cursor} and + * {@link VectorCursor} respectively will provide selectors for {@link AggregatorFactory#getName()}, and engines + * should rewrite the query using {@link AggregatorFactory#getCombiningFactory()}, since the values returned from + * these selectors will be of type {@link AggregatorFactory#getIntermediateType()}, so the cursor becomes a "fold" + * operation rather than a "build" operation. + */ + default boolean isPreAggregated() + { + return false; + } + /** * Returns cursor ordering, which may or may not match {@link CursorBuildSpec#getPreferredOrdering()}. If returns * an empty list then the cursor has no defined ordering. diff --git a/processing/src/test/java/org/apache/druid/segment/CursorHolderPreaggTest.java b/processing/src/test/java/org/apache/druid/segment/CursorHolderPreaggTest.java new file mode 100644 index 000000000000..82bba60821ce --- /dev/null +++ b/processing/src/test/java/org/apache/druid/segment/CursorHolderPreaggTest.java @@ -0,0 +1,267 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.druid.segment; + +import com.google.common.collect.ImmutableList; +import org.apache.druid.collections.CloseableDefaultBlockingPool; +import org.apache.druid.collections.CloseableStupidPool; +import org.apache.druid.java.util.common.Intervals; +import org.apache.druid.java.util.common.Pair; +import org.apache.druid.java.util.common.granularity.Granularities; +import org.apache.druid.java.util.common.guava.Sequence; +import org.apache.druid.query.DruidProcessingConfig; +import org.apache.druid.query.Druids; +import org.apache.druid.query.IterableRowsCursorHelper; +import org.apache.druid.query.Result; +import org.apache.druid.query.aggregation.CountAggregatorFactory; +import org.apache.druid.query.groupby.GroupByQuery; +import org.apache.druid.query.groupby.GroupByQueryConfig; +import org.apache.druid.query.groupby.GroupByResourcesReservationPool; +import org.apache.druid.query.groupby.GroupingEngine; +import org.apache.druid.query.groupby.ResultRow; +import org.apache.druid.query.timeseries.TimeseriesQuery; +import org.apache.druid.query.timeseries.TimeseriesQueryEngine; +import org.apache.druid.query.timeseries.TimeseriesResultValue; +import org.apache.druid.query.topn.TopNQuery; +import org.apache.druid.query.topn.TopNQueryBuilder; +import org.apache.druid.query.topn.TopNQueryEngine; +import org.apache.druid.query.topn.TopNResultValue; +import org.apache.druid.segment.column.ColumnCapabilities; +import org.apache.druid.segment.column.ColumnType; +import org.apache.druid.segment.column.RowSignature; +import org.apache.druid.testing.InitializedNullHandlingTest; +import org.apache.druid.timeline.SegmentId; +import org.apache.druid.utils.CloseableUtils; +import org.joda.time.Interval; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; + +import javax.annotation.Nullable; +import java.io.Closeable; +import java.nio.ByteBuffer; +import java.util.List; + +public class CursorHolderPreaggTest extends InitializedNullHandlingTest +{ + private GroupingEngine groupingEngine; + private TopNQueryEngine topNQueryEngine; + private TimeseriesQueryEngine timeseriesQueryEngine; + + private CursorFactory cursorFactory; + private Segment segment; + + @Rule + public final CloserRule closer = new CloserRule(false); + + @Before + public void setup() + { + final CloseableStupidPool pool = closer.closeLater( + new CloseableStupidPool<>( + "CursorHolderPreaggTest-bufferPool", + () -> ByteBuffer.allocate(50000) + ) + ); + topNQueryEngine = new TopNQueryEngine(pool); + timeseriesQueryEngine = new TimeseriesQueryEngine(pool); + groupingEngine = new GroupingEngine( + new DruidProcessingConfig(), + GroupByQueryConfig::new, + pool, + new GroupByResourcesReservationPool( + closer.closeLater( + new CloseableDefaultBlockingPool<>( + () -> ByteBuffer.allocate(50000), + 4 + ) + ), + new GroupByQueryConfig() + ), + TestHelper.makeJsonMapper(), + TestHelper.makeSmileMapper(), + (query, future) -> { + } + ); + + this.cursorFactory = new CursorFactory() + { + private final RowSignature rowSignature = RowSignature.builder() + .add("a", ColumnType.STRING) + .add("b", ColumnType.STRING) + .add("cnt", ColumnType.LONG) + .build(); + + private final Pair cursorAndCloser = IterableRowsCursorHelper.getCursorFromIterable( + ImmutableList.of( + new Object[]{"a", "aa", 5L}, + new Object[]{"a", "aa", 6L}, + new Object[]{"b", "bb", 7L} + ), + rowSignature + ); + + @Override + public CursorHolder makeCursorHolder(CursorBuildSpec spec) + { + return new CursorHolder() + 
{ + @Nullable + @Override + public Cursor asCursor() + { + return cursorAndCloser.lhs; + } + + @Override + public boolean isPreAggregated() + { + return true; + } + + @Override + public void close() + { + CloseableUtils.closeAndWrapExceptions(cursorAndCloser.rhs); + } + }; + } + + @Override + public RowSignature getRowSignature() + { + return rowSignature; + } + + @Override + @Nullable + public ColumnCapabilities getColumnCapabilities(String column) + { + return rowSignature.getColumnCapabilities(column); + } + }; + + segment = new Segment() + { + @Override + public SegmentId getId() + { + return SegmentId.dummy("test"); + } + + @Override + public Interval getDataInterval() + { + return Intervals.ETERNITY; + } + + @Nullable + @Override + public QueryableIndex asQueryableIndex() + { + return null; + } + + @Override + public CursorFactory asCursorFactory() + { + return cursorFactory; + } + + @Override + public void close() + { + + } + }; + } + + @Test + public void testTopn() + { + final TopNQuery topNQuery = new TopNQueryBuilder().dataSource("test") + .granularity(Granularities.ALL) + .intervals(ImmutableList.of(Intervals.ETERNITY)) + .dimension("a") + .aggregators(new CountAggregatorFactory("cnt")) + .metric("cnt") + .threshold(10) + .build(); + Sequence> results = topNQueryEngine.query( + topNQuery, + segment, + null + ); + + List> rows = results.toList(); + Assert.assertEquals(1, rows.size()); + // the cnt column is treated as pre-aggregated, so the values of the rows are summed + Assert.assertEquals(2, rows.get(0).getValue().getValue().size()); + Assert.assertEquals(11L, rows.get(0).getValue().getValue().get(0).getLongMetric("cnt").longValue()); + Assert.assertEquals(7L, rows.get(0).getValue().getValue().get(1).getLongMetric("cnt").longValue()); + } + + @Test + public void testGroupBy() + { + final GroupByQuery query = GroupByQuery.builder() + .setDataSource("test") + .setGranularity(Granularities.ALL) + .setInterval(Intervals.ETERNITY) + .addDimension("a") + .addDimension("b") + .addAggregator(new CountAggregatorFactory("cnt")) + .build(); + + Sequence results = groupingEngine.process( + query, + cursorFactory, + null, + null + ); + List rows = results.toList(); + Assert.assertEquals(2, rows.size()); + // the cnt column is treated as pre-aggregated, so the values of the rows are summed + Assert.assertArrayEquals(new Object[]{"a", "aa", 11L}, rows.get(0).getArray()); + Assert.assertArrayEquals(new Object[]{"b", "bb", 7L}, rows.get(1).getArray()); + } + + @Test + public void testTimeseries() + { + TimeseriesQuery timeseriesQuery = Druids.newTimeseriesQueryBuilder() + .dataSource("test") + .intervals(ImmutableList.of(Intervals.ETERNITY)) + .granularity(Granularities.ALL) + .aggregators(new CountAggregatorFactory("cnt")) + .build(); + Sequence> results = timeseriesQueryEngine.process( + timeseriesQuery, + cursorFactory, + null, + null + ); + List> rows = results.toList(); + Assert.assertEquals(1, rows.size()); + // the cnt column is treated as pre-aggregated, so the values of the rows are summed + Assert.assertEquals(18L, (long) rows.get(0).getValue().getLongMetric("cnt")); + } +} From 2ce2a4a03eb0403588213ed57e481cc4730ec351 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Fri, 13 Sep 2024 15:47:51 -0700 Subject: [PATCH 12/47] Add "includeAllCounters()" to WorkerContext. (#17047) This removes the need to read it from the query context. 
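Concretely, the call site in WorkerImpl moves from a per-work-order query context lookup to a method on the worker context. A minimal before/after sketch, assembled from the hunks below (surrounding code elided):

    // Before: resolved from the query context on every new work order.
    final boolean includeAllCounters = MultiStageQueryContext.getIncludeAllCounters(queryContext);

    // After: the WorkerContext implementation supplies the setting; IndexerWorkerContext
    // returns its stored flag, and MSQTestWorkerContext simply returns true.
    final boolean includeAllCounters = context.includeAllCounters();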
--- .../main/java/org/apache/druid/msq/exec/WorkerContext.java | 6 ++++++ .../src/main/java/org/apache/druid/msq/exec/WorkerImpl.java | 2 +- .../org/apache/druid/msq/indexing/IndexerWorkerContext.java | 6 ++++++ .../org/apache/druid/msq/test/MSQTestWorkerContext.java | 6 ++++++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerContext.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerContext.java index 666115d774cf..95a4ce7c7ba5 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerContext.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerContext.java @@ -27,6 +27,7 @@ import org.apache.druid.msq.kernel.FrameProcessorFactory; import org.apache.druid.msq.kernel.QueryDefinition; import org.apache.druid.msq.kernel.WorkOrder; +import org.apache.druid.msq.util.MultiStageQueryContext; import org.apache.druid.server.DruidNode; import java.io.File; @@ -98,4 +99,9 @@ public interface WorkerContext DruidNode selfNode(); DataServerQueryHandlerFactory dataServerQueryHandlerFactory(); + + /** + * Whether to include all counters in reports. See {@link MultiStageQueryContext#CTX_INCLUDE_ALL_COUNTERS} for detail. + */ + boolean includeAllCounters(); } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerImpl.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerImpl.java index 92664feeabbb..f28d1be5e614 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerImpl.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerImpl.java @@ -407,7 +407,7 @@ private void handleNewWorkOrder( kernel.startReading(); final QueryContext queryContext = task != null ? 
QueryContext.of(task.getContext()) : QueryContext.empty(); - final boolean includeAllCounters = MultiStageQueryContext.getIncludeAllCounters(queryContext); + final boolean includeAllCounters = context.includeAllCounters(); final RunWorkOrder runWorkOrder = new RunWorkOrder( task.getControllerTaskId(), workOrder, diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerWorkerContext.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerWorkerContext.java index 0b3063ef48ba..c36b8e291db9 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerWorkerContext.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerWorkerContext.java @@ -312,6 +312,12 @@ public DataServerQueryHandlerFactory dataServerQueryHandlerFactory() return dataServerQueryHandlerFactory; } + @Override + public boolean includeAllCounters() + { + return includeAllCounters; + } + private synchronized ServiceLocator makeControllerLocator(final String controllerId) { if (controllerLocator == null) { diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestWorkerContext.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestWorkerContext.java index 082429a9d7b1..1b92f468fced 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestWorkerContext.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestWorkerContext.java @@ -159,6 +159,12 @@ public DataServerQueryHandlerFactory dataServerQueryHandlerFactory() return injector.getInstance(DataServerQueryHandlerFactory.class); } + @Override + public boolean includeAllCounters() + { + return true; + } + class FrameContextImpl implements FrameContext { private final File tempDir; From 6b38459934d7d3f05e229ec813bbfb7fa3225bb3 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Fri, 13 Sep 2024 15:59:41 -0700 Subject: [PATCH 13/47] MSQ: Fix two issues with phase transitions. (#17053) 1) ControllerQueryKernel: Update readyToReadResults to acknowledge that sorting stages can go directly from READING_INPUT to RESULTS_READY. 2) WorkerStageKernel: Ignore RESULTS_COMPLETE if work is already finished, which can happen if the transition to FINISHED comes early due to a downstream LIMIT. --- .../druid/msq/kernel/controller/ControllerQueryKernel.java | 7 +++---- .../apache/druid/msq/kernel/worker/WorkerStageKernel.java | 6 ++++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/controller/ControllerQueryKernel.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/controller/ControllerQueryKernel.java index 05e0f722ccd4..b01091f9ad7a 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/controller/ControllerQueryKernel.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/controller/ControllerQueryKernel.java @@ -676,11 +676,10 @@ private boolean readyToReadResults(final StageId stageId, final ControllerStageP { if (stageOutputChannelModes.get(stageId) == OutputChannelMode.MEMORY) { if (getStageDefinition(stageId).doesSortDuringShuffle()) { - // Stages that sort during shuffle go through a READING_INPUT phase followed by a POST_READING phase - // (once all input is read). 
These stages start producing output once POST_READING starts. - return newPhase == ControllerStagePhase.POST_READING; + // Sorting stages start producing output when they finish reading their input. + return newPhase.isDoneReadingInput(); } else { - // Can read results immediately. + // Non-sorting stages start producing output immediately. return newPhase == ControllerStagePhase.NEW; } } else { diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/worker/WorkerStageKernel.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/worker/WorkerStageKernel.java index b838092ca714..992b90c02859 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/worker/WorkerStageKernel.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/worker/WorkerStageKernel.java @@ -182,6 +182,12 @@ public void setResultsComplete(Object resultObject) throw new NullPointerException("resultObject must not be null"); } + if (phase.isTerminal()) { + // Ignore RESULTS_COMPLETE if work is already finished. This can happen if we transition to FINISHED early + // due to a downstream stage including a limit. + return; + } + transitionTo(WorkerStagePhase.RESULTS_COMPLETE); this.resultObject = resultObject; } From 66cb6e8d1b9253cf50c3cd78f0d8a5a1cb3fc01e Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Fri, 13 Sep 2024 16:01:18 -0700 Subject: [PATCH 14/47] Add "targetPartitionsPerWorker" setting for MSQ. (#17048) As we move towards multi-threaded MSQ workers, it helps for parallelism to generate more than one partition per worker. That way, we can fully utilize all worker threads throughout all stages. The default value is the number of processing threads. Currently, this is hard-coded to 1 for peons, but that is expected to change in the future. --- .../druid/msq/exec/ControllerContext.java | 8 +++++ .../apache/druid/msq/exec/ControllerImpl.java | 21 +++++++++----- .../indexing/IndexerControllerContext.java | 11 ++++++- .../druid/msq/querykit/DataSourcePlan.java | 29 +++++++++++++++++-- .../druid/msq/querykit/MultiQueryKit.java | 2 ++ .../apache/druid/msq/querykit/QueryKit.java | 2 ++ .../msq/querykit/WindowOperatorQueryKit.java | 12 +++++--- .../msq/querykit/groupby/GroupByQueryKit.java | 9 ++++-- .../druid/msq/querykit/scan/ScanQueryKit.java | 2 ++ .../msq/util/MultiStageQueryContext.java | 14 +++++++++ .../msq/test/MSQTestControllerContext.java | 6 ++++ 11 files changed, 98 insertions(+), 18 deletions(-) diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerContext.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerContext.java index 40b114511c28..bc449d141203 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerContext.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerContext.java @@ -30,6 +30,8 @@ import org.apache.druid.msq.input.table.TableInputSpec; import org.apache.druid.msq.kernel.QueryDefinition; import org.apache.druid.msq.kernel.controller.ControllerQueryKernelConfig; +import org.apache.druid.msq.querykit.QueryKit; +import org.apache.druid.msq.util.MultiStageQueryContext; import org.apache.druid.server.DruidNode; /** @@ -100,4 +102,10 @@ WorkerManager newWorkerManager( * Client for communicating with workers. 
*/ WorkerClient newWorkerClient(); + + /** + * Default target partitions per worker for {@link QueryKit#makeQueryDefinition}. Can be overridden using + * {@link MultiStageQueryContext#CTX_TARGET_PARTITIONS_PER_WORKER}. + */ + int defaultTargetPartitionsPerWorker(); } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java index 1ab7460156dc..4b63d85cda7b 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java @@ -563,11 +563,16 @@ private QueryDefinition initializeQueryDefAndState(final Closer closer) this.netClient = new ExceptionWrappingWorkerClient(context.newWorkerClient()); closer.register(netClient); + final QueryContext queryContext = querySpec.getQuery().context(); final QueryDefinition queryDef = makeQueryDefinition( queryId(), makeQueryControllerToolKit(), querySpec, context.jsonMapper(), + MultiStageQueryContext.getTargetPartitionsPerWorkerWithDefault( + queryContext, + context.defaultTargetPartitionsPerWorker() + ), resultsContext ); @@ -612,7 +617,7 @@ private QueryDefinition initializeQueryDefAndState(final Closer closer) ); } - final long maxParseExceptions = MultiStageQueryContext.getMaxParseExceptions(querySpec.getQuery().context()); + final long maxParseExceptions = MultiStageQueryContext.getMaxParseExceptions(queryContext); this.faultsExceededChecker = new FaultsExceededChecker( ImmutableMap.of(CannotParseExternalDataFault.CODE, maxParseExceptions) ); @@ -624,7 +629,7 @@ private QueryDefinition initializeQueryDefAndState(final Closer closer) stageDefinition.getId().getStageNumber(), finalizeClusterStatisticsMergeMode( stageDefinition, - MultiStageQueryContext.getClusterStatisticsMergeMode(querySpec.getQuery().context()) + MultiStageQueryContext.getClusterStatisticsMergeMode(queryContext) ) ) ); @@ -1718,17 +1723,18 @@ private static QueryDefinition makeQueryDefinition( @SuppressWarnings("rawtypes") final QueryKit toolKit, final MSQSpec querySpec, final ObjectMapper jsonMapper, + final int targetPartitionsPerWorker, final ResultsContext resultsContext ) { final MSQTuningConfig tuningConfig = querySpec.getTuningConfig(); final ColumnMappings columnMappings = querySpec.getColumnMappings(); final Query queryToPlan; - final ShuffleSpecFactory shuffleSpecFactory; + final ShuffleSpecFactory resultShuffleSpecFactory; if (MSQControllerTask.isIngestion(querySpec)) { - shuffleSpecFactory = querySpec.getDestination() - .getShuffleSpecFactory(tuningConfig.getRowsPerSegment()); + resultShuffleSpecFactory = querySpec.getDestination() + .getShuffleSpecFactory(tuningConfig.getRowsPerSegment()); if (!columnMappings.hasUniqueOutputColumnNames()) { // We do not expect to hit this case in production, because the SQL validator checks that column names @@ -1752,7 +1758,7 @@ private static QueryDefinition makeQueryDefinition( queryToPlan = querySpec.getQuery(); } } else { - shuffleSpecFactory = + resultShuffleSpecFactory = querySpec.getDestination() .getShuffleSpecFactory(MultiStageQueryContext.getRowsPerPage(querySpec.getQuery().context())); queryToPlan = querySpec.getQuery(); @@ -1765,8 +1771,9 @@ private static QueryDefinition makeQueryDefinition( queryId, queryToPlan, toolKit, - shuffleSpecFactory, + resultShuffleSpecFactory, tuningConfig.getMaxNumWorkers(), + targetPartitionsPerWorker, 0 ); } diff 
--git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerControllerContext.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerControllerContext.java index 1037aa6c2af0..e60f1c5c9622 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerControllerContext.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerControllerContext.java @@ -74,6 +74,7 @@ public class IndexerControllerContext implements ControllerContext private final ServiceClientFactory clientFactory; private final OverlordClient overlordClient; private final ServiceMetricEvent.Builder metricBuilder; + private final MemoryIntrospector memoryIntrospector; public IndexerControllerContext( final MSQControllerTask task, @@ -89,6 +90,7 @@ public IndexerControllerContext( this.clientFactory = clientFactory; this.overlordClient = overlordClient; this.metricBuilder = new ServiceMetricEvent.Builder(); + this.memoryIntrospector = injector.getInstance(MemoryIntrospector.class); IndexTaskUtils.setTaskDimensions(metricBuilder, task); } @@ -98,7 +100,6 @@ public ControllerQueryKernelConfig queryKernelConfig( final QueryDefinition queryDef ) { - final MemoryIntrospector memoryIntrospector = injector.getInstance(MemoryIntrospector.class); final ControllerMemoryParameters memoryParameters = ControllerMemoryParameters.createProductionInstance( memoryIntrospector, @@ -200,6 +201,14 @@ public WorkerManager newWorkerManager( ); } + @Override + public int defaultTargetPartitionsPerWorker() + { + // Assume tasks are symmetric: workers have the same number of processors available as a controller. + // Create one partition per processor per task, for maximum parallelism. + return memoryIntrospector.numProcessorsInJvm(); + } + /** * Helper method for {@link #queryKernelConfig(MSQSpec, QueryDefinition)}. Also used in tests. 
*/ diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/DataSourcePlan.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/DataSourcePlan.java index e6ddb4d723dc..15fe6263ed83 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/DataSourcePlan.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/DataSourcePlan.java @@ -135,6 +135,7 @@ public class DataSourcePlan * @param maxWorkerCount maximum number of workers for subqueries * @param minStageNumber starting stage number for subqueries * @param broadcast whether the plan should broadcast data for this datasource + * @param targetPartitionsPerWorker preferred number of partitions per worker for subqueries */ @SuppressWarnings("rawtypes") public static DataSourcePlan forDataSource( @@ -146,6 +147,7 @@ public static DataSourcePlan forDataSource( @Nullable DimFilter filter, @Nullable Set filterFields, final int maxWorkerCount, + final int targetPartitionsPerWorker, final int minStageNumber, final boolean broadcast ) @@ -186,6 +188,7 @@ public static DataSourcePlan forDataSource( (FilteredDataSource) dataSource, querySegmentSpec, maxWorkerCount, + targetPartitionsPerWorker, minStageNumber, broadcast ); @@ -197,6 +200,7 @@ public static DataSourcePlan forDataSource( (UnnestDataSource) dataSource, querySegmentSpec, maxWorkerCount, + targetPartitionsPerWorker, minStageNumber, broadcast ); @@ -207,6 +211,7 @@ public static DataSourcePlan forDataSource( queryId, (QueryDataSource) dataSource, maxWorkerCount, + targetPartitionsPerWorker, minStageNumber, broadcast, queryContext @@ -221,6 +226,7 @@ public static DataSourcePlan forDataSource( filter, filterFields, maxWorkerCount, + targetPartitionsPerWorker, minStageNumber, broadcast ); @@ -242,6 +248,7 @@ public static DataSourcePlan forDataSource( filter, filterFields, maxWorkerCount, + targetPartitionsPerWorker, minStageNumber, broadcast ); @@ -253,6 +260,7 @@ public static DataSourcePlan forDataSource( (JoinDataSource) dataSource, querySegmentSpec, maxWorkerCount, + targetPartitionsPerWorker, minStageNumber, broadcast ); @@ -418,6 +426,7 @@ private static DataSourcePlan forQuery( final String queryId, final QueryDataSource dataSource, final int maxWorkerCount, + final int targetPartitionsPerWorker, final int minStageNumber, final boolean broadcast, @Nullable final QueryContext parentContext @@ -429,8 +438,9 @@ private static DataSourcePlan forQuery( // outermost query, and setting it for the subquery makes us erroneously add bucketing where it doesn't belong. 
dataSource.getQuery().withOverriddenContext(CONTEXT_MAP_NO_SEGMENT_GRANULARITY), queryKit, - ShuffleSpecFactories.globalSortWithMaxPartitionCount(maxWorkerCount), + ShuffleSpecFactories.globalSortWithMaxPartitionCount(maxWorkerCount * targetPartitionsPerWorker), maxWorkerCount, + targetPartitionsPerWorker, minStageNumber ); @@ -451,6 +461,7 @@ private static DataSourcePlan forFilteredDataSource( final FilteredDataSource dataSource, final QuerySegmentSpec querySegmentSpec, final int maxWorkerCount, + final int targetPartitionsPerWorker, final int minStageNumber, final boolean broadcast ) @@ -464,6 +475,7 @@ private static DataSourcePlan forFilteredDataSource( null, null, maxWorkerCount, + targetPartitionsPerWorker, minStageNumber, broadcast ); @@ -491,6 +503,7 @@ private static DataSourcePlan forUnnest( final UnnestDataSource dataSource, final QuerySegmentSpec querySegmentSpec, final int maxWorkerCount, + final int targetPartitionsPerWorker, final int minStageNumber, final boolean broadcast ) @@ -505,6 +518,7 @@ private static DataSourcePlan forUnnest( null, null, maxWorkerCount, + targetPartitionsPerWorker, minStageNumber, broadcast ); @@ -537,6 +551,7 @@ private static DataSourcePlan forUnion( @Nullable DimFilter filter, @Nullable Set filterFields, final int maxWorkerCount, + final int targetPartitionsPerWorker, final int minStageNumber, final boolean broadcast ) @@ -559,6 +574,7 @@ private static DataSourcePlan forUnion( filter, filterFields, maxWorkerCount, + targetPartitionsPerWorker, Math.max(minStageNumber, subqueryDefBuilder.getNextStageNumber()), broadcast ); @@ -590,6 +606,7 @@ private static DataSourcePlan forBroadcastHashJoin( @Nullable final DimFilter filter, @Nullable final Set filterFields, final int maxWorkerCount, + final int targetPartitionsPerWorker, final int minStageNumber, final boolean broadcast ) @@ -606,6 +623,7 @@ private static DataSourcePlan forBroadcastHashJoin( filter, filter == null ? null : DimFilterUtils.onlyBaseFields(filterFields, analysis), maxWorkerCount, + targetPartitionsPerWorker, Math.max(minStageNumber, subQueryDefBuilder.getNextStageNumber()), broadcast ); @@ -626,6 +644,7 @@ private static DataSourcePlan forBroadcastHashJoin( null, // Don't push down query filters for right-hand side: needs some work to ensure it works properly. null, maxWorkerCount, + targetPartitionsPerWorker, Math.max(minStageNumber, subQueryDefBuilder.getNextStageNumber()), true // Always broadcast right-hand side of the join. 
); @@ -660,6 +679,7 @@ private static DataSourcePlan forSortMergeJoin( final JoinDataSource dataSource, final QuerySegmentSpec querySegmentSpec, final int maxWorkerCount, + final int targetPartitionsPerWorker, final int minStageNumber, final boolean broadcast ) @@ -682,6 +702,7 @@ private static DataSourcePlan forSortMergeJoin( queryId, (QueryDataSource) dataSource.getLeft(), maxWorkerCount, + targetPartitionsPerWorker, Math.max(minStageNumber, subQueryDefBuilder.getNextStageNumber()), false, null @@ -696,6 +717,7 @@ private static DataSourcePlan forSortMergeJoin( queryId, (QueryDataSource) dataSource.getRight(), maxWorkerCount, + targetPartitionsPerWorker, Math.max(minStageNumber, subQueryDefBuilder.getNextStageNumber()), false, null @@ -707,8 +729,9 @@ private static DataSourcePlan forSortMergeJoin( ((StageInputSpec) Iterables.getOnlyElement(leftPlan.getInputSpecs())).getStageNumber() ); + final int hashPartitionCount = maxWorkerCount * targetPartitionsPerWorker; final List leftPartitionKey = partitionKeys.get(0); - leftBuilder.shuffleSpec(new HashShuffleSpec(new ClusterBy(leftPartitionKey, 0), maxWorkerCount)); + leftBuilder.shuffleSpec(new HashShuffleSpec(new ClusterBy(leftPartitionKey, 0), hashPartitionCount)); leftBuilder.signature(QueryKitUtils.sortableSignature(leftBuilder.getSignature(), leftPartitionKey)); // Build up the right stage. @@ -717,7 +740,7 @@ private static DataSourcePlan forSortMergeJoin( ); final List rightPartitionKey = partitionKeys.get(1); - rightBuilder.shuffleSpec(new HashShuffleSpec(new ClusterBy(rightPartitionKey, 0), maxWorkerCount)); + rightBuilder.shuffleSpec(new HashShuffleSpec(new ClusterBy(rightPartitionKey, 0), hashPartitionCount)); rightBuilder.signature(QueryKitUtils.sortableSignature(rightBuilder.getSignature(), rightPartitionKey)); // Compute join signature. diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/MultiQueryKit.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/MultiQueryKit.java index a795f6496053..37f453f6c060 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/MultiQueryKit.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/MultiQueryKit.java @@ -46,6 +46,7 @@ public QueryDefinition makeQueryDefinition( QueryKit> toolKitForSubQueries, ShuffleSpecFactory resultShuffleSpecFactory, int maxWorkerCount, + int targetPartitionsPerWorker, int minStageNumber ) { @@ -59,6 +60,7 @@ public QueryDefinition makeQueryDefinition( this, resultShuffleSpecFactory, maxWorkerCount, + targetPartitionsPerWorker, minStageNumber ); } else { diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/QueryKit.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/QueryKit.java index b259022bba5b..2bc0ad0725a8 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/QueryKit.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/QueryKit.java @@ -40,6 +40,7 @@ public interface QueryKit> * @param minStageNumber lowest stage number to use for any generated stages. Useful if the resulting * {@link QueryDefinition} is going to be added to an existing * {@link org.apache.druid.msq.kernel.QueryDefinitionBuilder}. 
+ * @param targetPartitionsPerWorker preferred number of partitions per worker for subqueries */ QueryDefinition makeQueryDefinition( String queryId, @@ -47,6 +48,7 @@ QueryDefinition makeQueryDefinition( QueryKit> toolKitForSubQueries, ShuffleSpecFactory resultShuffleSpecFactory, int maxWorkerCount, + int targetPartitionsPerWorker, int minStageNumber ); } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/WindowOperatorQueryKit.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/WindowOperatorQueryKit.java index b3686359d2a4..b1af153fafde 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/WindowOperatorQueryKit.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/WindowOperatorQueryKit.java @@ -68,6 +68,7 @@ public QueryDefinition makeQueryDefinition( QueryKit> queryKit, ShuffleSpecFactory resultShuffleSpecFactory, int maxWorkerCount, + int targetPartitionsPerWorker, int minStageNumber ) { @@ -97,11 +98,13 @@ public QueryDefinition makeQueryDefinition( originalQuery.getFilter(), null, maxWorkerCount, + targetPartitionsPerWorker, minStageNumber, false ); - ShuffleSpec nextShuffleSpec = findShuffleSpecForNextWindow(operatorList.get(0), maxWorkerCount); + ShuffleSpec nextShuffleSpec = + findShuffleSpecForNextWindow(operatorList.get(0), maxWorkerCount * targetPartitionsPerWorker); final QueryDefinitionBuilder queryDefBuilder = makeQueryDefinitionBuilder(queryId, dataSourcePlan, nextShuffleSpec); final int firstStageNumber = Math.max(minStageNumber, queryDefBuilder.getNextStageNumber()); @@ -192,7 +195,8 @@ public QueryDefinition makeQueryDefinition( stageRowSignature = finalWindowStageRowSignature; nextShuffleSpec = finalWindowStageShuffleSpec; } else { - nextShuffleSpec = findShuffleSpecForNextWindow(operatorList.get(i + 1), maxWorkerCount); + nextShuffleSpec = + findShuffleSpecForNextWindow(operatorList.get(i + 1), maxWorkerCount * targetPartitionsPerWorker); if (nextShuffleSpec == null) { stageRowSignature = intermediateSignature; } else { @@ -285,7 +289,7 @@ private List> getOperatorListFromQuery(WindowOperatorQuery return operatorList; } - private ShuffleSpec findShuffleSpecForNextWindow(List operatorFactories, int maxWorkerCount) + private ShuffleSpec findShuffleSpecForNextWindow(List operatorFactories, int partitionCount) { NaivePartitioningOperatorFactory partition = null; NaiveSortOperatorFactory sort = null; @@ -325,7 +329,7 @@ private ShuffleSpec findShuffleSpecForNextWindow(List operatorF keyColsOfWindow.add(kc); } - return new HashShuffleSpec(new ClusterBy(keyColsOfWindow, 0), maxWorkerCount); + return new HashShuffleSpec(new ClusterBy(keyColsOfWindow, 0), partitionCount); } /** diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByQueryKit.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByQueryKit.java index 7e4ebf5e7fab..45a91a3d8870 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByQueryKit.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByQueryKit.java @@ -71,6 +71,7 @@ public QueryDefinition makeQueryDefinition( final QueryKit> queryKit, final ShuffleSpecFactory resultShuffleSpecFactory, final int maxWorkerCount, + final int targetPartitionsPerWorker, final int minStageNumber ) { @@ -86,6 +87,7 @@ public QueryDefinition 
makeQueryDefinition( originalQuery.getFilter(), null, maxWorkerCount, + targetPartitionsPerWorker, minStageNumber, false ); @@ -139,9 +141,10 @@ public QueryDefinition makeQueryDefinition( // __time in such queries is generated using either an aggregator (e.g. sum(metric) as __time) or using a // post-aggregator (e.g. TIMESTAMP '2000-01-01' as __time) // For example: INSERT INTO foo SELECT COUNT(*), TIMESTAMP '2000-01-01' AS __time FROM bar PARTITIONED BY DAY - shuffleSpecFactoryPreAggregation = intermediateClusterBy.isEmpty() - ? ShuffleSpecFactories.singlePartition() - : ShuffleSpecFactories.globalSortWithMaxPartitionCount(maxWorkerCount); + shuffleSpecFactoryPreAggregation = + intermediateClusterBy.isEmpty() + ? ShuffleSpecFactories.singlePartition() + : ShuffleSpecFactories.globalSortWithMaxPartitionCount(maxWorkerCount * targetPartitionsPerWorker); if (doLimitOrOffset) { shuffleSpecFactoryPostAggregation = ShuffleSpecFactories.singlePartitionWithLimit(postAggregationLimitHint); diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/scan/ScanQueryKit.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/scan/ScanQueryKit.java index f4f50106e813..051caeb0e718 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/scan/ScanQueryKit.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/scan/ScanQueryKit.java @@ -91,6 +91,7 @@ public QueryDefinition makeQueryDefinition( final QueryKit> queryKit, final ShuffleSpecFactory resultShuffleSpecFactory, final int maxWorkerCount, + final int targetPartitionsPerWorker, final int minStageNumber ) { @@ -104,6 +105,7 @@ public QueryDefinition makeQueryDefinition( originalQuery.getFilter(), null, maxWorkerCount, + targetPartitionsPerWorker, minStageNumber, false ); diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/util/MultiStageQueryContext.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/util/MultiStageQueryContext.java index ed6a7c0e7b9b..63601c907a24 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/util/MultiStageQueryContext.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/util/MultiStageQueryContext.java @@ -190,6 +190,12 @@ public class MultiStageQueryContext public static final String CTX_SKIP_TYPE_VERIFICATION = "skipTypeVerification"; + /** + * Number of partitions to target per worker when creating shuffle specs that involve specific numbers of + * partitions. This helps us utilize more parallelism when workers are multi-threaded. + */ + public static final String CTX_TARGET_PARTITIONS_PER_WORKER = "targetPartitionsPerWorker"; + private static final Pattern LOOKS_LIKE_JSON_ARRAY = Pattern.compile("^\\s*\\[.*", Pattern.DOTALL); public static String getMSQMode(final QueryContext queryContext) @@ -380,6 +386,14 @@ public static ArrayIngestMode getArrayIngestMode(final QueryContext queryContext return queryContext.getEnum(CTX_ARRAY_INGEST_MODE, ArrayIngestMode.class, DEFAULT_ARRAY_INGEST_MODE); } + public static int getTargetPartitionsPerWorkerWithDefault( + final QueryContext queryContext, + final int defaultValue + ) + { + return queryContext.getInt(CTX_TARGET_PARTITIONS_PER_WORKER, defaultValue); + } + /** * See {@link #CTX_INCLUDE_ALL_COUNTERS}. 
*/ diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestControllerContext.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestControllerContext.java index e65104302032..3034be399849 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestControllerContext.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestControllerContext.java @@ -342,4 +342,10 @@ public WorkerClient newWorkerClient() { return new MSQTestWorkerClient(inMemoryWorkers); } + + @Override + public int defaultTargetPartitionsPerWorker() + { + return 1; + } } From f1d0879898eeb3ba73a93dc2c9959649dbc6bd16 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Sat, 14 Sep 2024 15:20:48 -0700 Subject: [PATCH 15/47] Fix formatting of error message from validateNoIllegalRightyJoins. (#17061) The prior formatting was inconsistent in terms of punctuation and capitalization. --- .../org/apache/druid/sql/calcite/planner/QueryValidations.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/src/main/java/org/apache/druid/sql/calcite/planner/QueryValidations.java b/sql/src/main/java/org/apache/druid/sql/calcite/planner/QueryValidations.java index 1f38debfb9a8..c516496af31e 100644 --- a/sql/src/main/java/org/apache/druid/sql/calcite/planner/QueryValidations.java +++ b/sql/src/main/java/org/apache/druid/sql/calcite/planner/QueryValidations.java @@ -75,7 +75,7 @@ public RelNode visit(LogicalJoin join) if (shuttle.found != null) { throw new ValidationException( StringUtils.format( - "%s join is not supported by engine [%s] with %s: [%s]. Try %s: %s.", + "%s JOIN is not supported by engine[%s] with %s[%s]. Try %s[%s].", shuttle.found.getJoinType(), plannerContext.getEngine().name(), PlannerContext.CTX_SQL_JOIN_ALGORITHM, From ad62c1beca8731f3a9cf4d83857702a467c29270 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Sat, 14 Sep 2024 15:32:49 -0700 Subject: [PATCH 16/47] QueryResource: Don't close JSON content on error. (#17034) * QueryResource: Don't close JSON content on error. Following similar issues fixed in #11685 and #15880, this patch fixes a bug where QueryResource would write a closing array marker if it encountered an exception after starting to push results. This makes it difficult for callers to detect errors. The prior patches didn't catch this problem because QueryResource uses the ObjectMapper in a unique way, through writeValuesAsArray, which doesn't respect the global AUTO_CLOSE_JSON_CONTENT setting. * Fix usage of customized ObjectMappers. 
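As a minimal, standalone sketch of the Jackson behavior at issue (not part of this patch; the class name and setup are illustrative only), the difference between the two writing styles looks like this:

    import com.fasterxml.jackson.core.JsonGenerator;
    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.fasterxml.jackson.databind.SequenceWriter;

    import java.io.ByteArrayOutputStream;

    public class AutoCloseJsonContentSketch
    {
      public static void main(String[] args) throws Exception
      {
        final ObjectMapper mapper = new ObjectMapper();

        // Disable auto-closing of open JSON structures, as Druid does globally on its
        // response mappers (see #11685 and #15880).
        mapper.getFactory().disable(JsonGenerator.Feature.AUTO_CLOSE_JSON_CONTENT);

        // Style 1: writeValuesAsArray. SequenceWriter#close() writes the closing ']'
        // itself, ignoring AUTO_CLOSE_JSON_CONTENT, so the output looks like valid JSON
        // even when the writer is closed midstream due to an error.
        final ByteArrayOutputStream out1 = new ByteArrayOutputStream();
        final SequenceWriter seq = mapper.writer().writeValuesAsArray(out1);
        seq.write("row1");
        seq.close(); // simulates closing after a midstream failure
        System.out.println(out1); // prints ["row1"] -- looks complete

        // Style 2: raw JsonGenerator, as in NativeQueryWriter. The end-array marker is
        // only written by an explicit writeEndArray() on success, so a midstream failure
        // leaves detectably truncated JSON behind.
        final ByteArrayOutputStream out2 = new ByteArrayOutputStream();
        final JsonGenerator gen = mapper.createGenerator(out2);
        gen.writeStartArray();
        gen.writeString("row1");
        gen.close(); // no writeEndArray() -> truncated output
        System.out.println(out2); // prints ["row1" -- invalid JSON
      }
    }

Callers such as the Broker can then treat the truncated array as an error (surfaced as "expected close marker for Array") rather than silently accepting a partial result set.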
--- .../apache/druid/server/QueryLifecycle.java | 5 +- .../apache/druid/server/QueryResource.java | 85 +++++++++++-------- .../druid/server/QueryResourceTest.java | 50 ++++++++++- 3 files changed, 100 insertions(+), 40 deletions(-) diff --git a/server/src/main/java/org/apache/druid/server/QueryLifecycle.java b/server/src/main/java/org/apache/druid/server/QueryLifecycle.java index e0bb9875240d..a91959ca20bb 100644 --- a/server/src/main/java/org/apache/druid/server/QueryLifecycle.java +++ b/server/src/main/java/org/apache/druid/server/QueryLifecycle.java @@ -19,7 +19,7 @@ package org.apache.druid.server; -import com.fasterxml.jackson.databind.ObjectWriter; +import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Preconditions; import com.google.common.base.Strings; import com.google.common.collect.Iterables; @@ -62,7 +62,6 @@ import javax.annotation.Nullable; import javax.servlet.http.HttpServletRequest; - import java.util.HashSet; import java.util.LinkedHashMap; import java.util.Map; @@ -434,7 +433,7 @@ private boolean isSerializeDateTimeAsLong() || (!shouldFinalize && queryContext.isSerializeDateTimeAsLongInner(false)); } - public ObjectWriter newOutputWriter(ResourceIOReaderWriter ioReaderWriter) + public ObjectMapper newOutputWriter(ResourceIOReaderWriter ioReaderWriter) { return ioReaderWriter.getResponseWriter().newOutputWriter( getToolChest(), diff --git a/server/src/main/java/org/apache/druid/server/QueryResource.java b/server/src/main/java/org/apache/druid/server/QueryResource.java index 2db205ca0bed..61696dd5cec3 100644 --- a/server/src/main/java/org/apache/druid/server/QueryResource.java +++ b/server/src/main/java/org/apache/druid/server/QueryResource.java @@ -19,11 +19,12 @@ package org.apache.druid.server; +import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.core.JsonParseException; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.ObjectWriter; -import com.fasterxml.jackson.databind.SequenceWriter; +import com.fasterxml.jackson.databind.SerializationFeature; +import com.fasterxml.jackson.databind.SerializerProvider; import com.fasterxml.jackson.databind.module.SimpleModule; import com.fasterxml.jackson.datatype.joda.ser.DateTimeSerializer; import com.fasterxml.jackson.jaxrs.smile.SmileMediaTypes; @@ -37,6 +38,7 @@ import org.apache.druid.guice.annotations.Json; import org.apache.druid.guice.annotations.Self; import org.apache.druid.guice.annotations.Smile; +import org.apache.druid.java.util.common.jackson.JacksonUtils; import org.apache.druid.java.util.emitter.EmittingLogger; import org.apache.druid.query.BadJsonQueryException; import org.apache.druid.query.Query; @@ -374,7 +376,7 @@ String getResponseType() return responseType; } - ObjectWriter newOutputWriter( + ObjectMapper newOutputWriter( @Nullable QueryToolChest> toolChest, @Nullable Query query, boolean serializeDateTimeAsLong @@ -387,7 +389,7 @@ ObjectWriter newOutputWriter( } else { decoratedMapper = mapper; } - return isPretty ? decoratedMapper.writerWithDefaultPrettyPrinter() : decoratedMapper.writer(); + return isPretty ? 
decoratedMapper.copy().enable(SerializationFeature.INDENT_OUTPUT) : decoratedMapper; } Response ok(Object object) throws IOException @@ -531,35 +533,7 @@ public QueryResponse getQueryResponse() @Override public Writer makeWriter(OutputStream out) throws IOException { - final ObjectWriter objectWriter = queryLifecycle.newOutputWriter(io); - final SequenceWriter sequenceWriter = objectWriter.writeValuesAsArray(out); - return new Writer() - { - - @Override - public void writeResponseStart() - { - // Do nothing - } - - @Override - public void writeRow(Object obj) throws IOException - { - sequenceWriter.write(obj); - } - - @Override - public void writeResponseEnd() - { - // Do nothing - } - - @Override - public void close() throws IOException - { - sequenceWriter.close(); - } - }; + return new NativeQueryWriter(queryLifecycle.newOutputWriter(io), out); } @Override @@ -585,8 +559,49 @@ public void close() @Override public void writeException(Exception e, OutputStream out) throws IOException { - final ObjectWriter objectWriter = queryLifecycle.newOutputWriter(io); - out.write(objectWriter.writeValueAsBytes(e)); + final ObjectMapper objectMapper = queryLifecycle.newOutputWriter(io); + out.write(objectMapper.writeValueAsBytes(e)); + } + } + + static class NativeQueryWriter implements QueryResultPusher.Writer + { + private final SerializerProvider serializers; + private final JsonGenerator jsonGenerator; + + public NativeQueryWriter(final ObjectMapper responseMapper, final OutputStream out) throws IOException + { + // Don't use objectWriter.writeValuesAsArray(out), because that causes an end array ] to be written when the + // writer is closed, even if it's closed in case of an exception. This causes valid JSON to be emitted in case + // of an exception, which makes it difficult for callers to detect problems. Note: this means that if an error + // occurs on a Historical (or other data server) after it started to push results to the Broker, the Broker + // will experience that as "JsonEOFException: Unexpected end-of-input: expected close marker for Array". 
+ this.serializers = responseMapper.getSerializerProviderInstance(); + this.jsonGenerator = responseMapper.createGenerator(out); + } + + @Override + public void writeResponseStart() throws IOException + { + jsonGenerator.writeStartArray(); + } + + @Override + public void writeRow(Object obj) throws IOException + { + JacksonUtils.writeObjectUsingSerializerProvider(jsonGenerator, serializers, obj); + } + + @Override + public void writeResponseEnd() throws IOException + { + jsonGenerator.writeEndArray(); + } + + @Override + public void close() throws IOException + { + jsonGenerator.close(); } } } diff --git a/server/src/test/java/org/apache/druid/server/QueryResourceTest.java b/server/src/test/java/org/apache/druid/server/QueryResourceTest.java index f47460320479..32c26edffee1 100644 --- a/server/src/test/java/org/apache/druid/server/QueryResourceTest.java +++ b/server/src/test/java/org/apache/druid/server/QueryResourceTest.java @@ -19,6 +19,7 @@ package org.apache.druid.server; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.jaxrs.smile.SmileMediaTypes; @@ -80,12 +81,14 @@ import org.apache.druid.server.security.ForbiddenException; import org.apache.druid.server.security.Resource; import org.apache.http.HttpStatus; +import org.hamcrest.CoreMatchers; import org.hamcrest.MatcherAssert; import org.joda.time.Interval; import org.junit.Assert; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; +import org.junit.internal.matchers.ThrowableMessageMatcher; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -98,6 +101,7 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.List; @@ -424,7 +428,8 @@ public QueryLifecycle factorize() overrideConfig, new AuthConfig(), System.currentTimeMillis(), - System.nanoTime()) + System.nanoTime() + ) { @Override public void emitLogsAndMetrics(@Nullable Throwable e, @Nullable String remoteAddress, long bytesWritten) @@ -453,7 +458,8 @@ public void emitLogsAndMetrics(@Nullable Throwable e, @Nullable String remoteAdd entity.getUnderlyingException(), new DruidExceptionMatcher( DruidException.Persona.OPERATOR, - DruidException.Category.RUNTIME_FAILURE, "legacyQueryException") + DruidException.Category.RUNTIME_FAILURE, "legacyQueryException" + ) .expectMessageIs("something") ); } @@ -1250,6 +1256,46 @@ public void testTooManyQueryInLaneImplicitFromDurationThreshold() throws Interru } } + @Test + public void testNativeQueryWriter_goodResponse() throws IOException + { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final QueryResultPusher.Writer writer = new QueryResource.NativeQueryWriter(jsonMapper, baos); + writer.writeResponseStart(); + writer.writeRow(Arrays.asList("foo", "bar")); + writer.writeRow(Collections.singletonList("baz")); + writer.writeResponseEnd(); + writer.close(); + + Assert.assertEquals( + ImmutableList.of( + ImmutableList.of("foo", "bar"), + ImmutableList.of("baz") + ), + jsonMapper.readValue(baos.toByteArray(), Object.class) + ); + } + + @Test + public void testNativeQueryWriter_truncatedResponse() throws IOException + { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final QueryResultPusher.Writer writer = new QueryResource.NativeQueryWriter(jsonMapper, baos); + 
writer.writeResponseStart(); + writer.writeRow(Arrays.asList("foo", "bar")); + writer.close(); // Simulate an error that occurs midstream; close writer without calling writeResponseEnd. + + final JsonProcessingException e = Assert.assertThrows( + JsonProcessingException.class, + () -> jsonMapper.readValue(baos.toByteArray(), Object.class) + ); + + MatcherAssert.assertThat( + e, + ThrowableMessageMatcher.hasMessage(CoreMatchers.containsString("expected close marker for Array")) + ); + } + private void createScheduledQueryResource( QueryScheduler scheduler, Collection beforeScheduler, From 9d0d7c7fc749e053a6fb108a7b80ae9ee46e9b71 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Sat, 14 Sep 2024 15:35:21 -0700 Subject: [PATCH 17/47] MSQ: Rework memory management. (#17057) * MSQ: Rework memory management. This patch reworks memory management to better support multi-threaded workers running in shared JVMs. There are two main changes. First, processing buffers and threads are moved from a per-JVM model to a per-worker model. This enables queries to hold processing buffers without blocking other concurrently-running queries. Changes: - Introduce ProcessingBuffersSet and ProcessingBuffers to hold the per-worker and per-work-order processing buffers (respectively). On Peons, this is the JVM-wide processing pool. On Indexers, this is a per-worker pool of on-heap buffers. (This change fixes a bug on Indexers where excessive processing buffers could be used if MSQ tasks ran concurrently with realtime tasks.) - Add "bufferPool" argument to GroupingEngine#process so a per-worker pool can be passed in. - Add "druid.msq.task.memory.maxThreads" property, which controls the maximum number of processing threads to use per task. This allows usage of multiple processing buffers per task if admins desire. - IndexerWorkerContext acquires processingBuffers when creating the FrameContext for a work order, and releases them when closing the FrameContext. - Add "usesProcessingBuffers()" to FrameProcessorFactory so workers know how many sets of processing buffers are needed to run a given query. Second, adjustments to how WorkerMemoryParameters slices up bundles, to favor more memory for sorting and segment generation. Changes: - Instead of using same-sized bundles for processing and for sorting, workers now use minimally-sized processing bundles (just enough to read inputs plus a little overhead). The rest is devoted to broadcast data buffering, sorting, and segment-building. - Segment-building is now limited to 1 concurrent segment per work order. This allows each segment-building action to use more memory. Note that segment-building is internally multi-threaded to a degree. (Build and persist can run concurrently.) - Simplify frame size calculations by removing the distinction between "standard" and "large" frames. The new default frame size is the same as the old "standard" frames, 1 MB. The original goal of the large frames was to reduce the number of temporary files during sorting, but I think we can achieve the same thing by simply merging a larger number of standard frames at once. - Remove the small worker adjustment that was added in #14117 to account for an extra frame involved in writing to durable storage. Instead, account for the extra frame whenever we are actually using durable storage. - Cap super-sorter parallelism using the number of output partitions, rather than using a hard-coded cap at 4.
Note that in practice, so far, this cap has not been relevant for tasks because they have only been using a single processing thread anyway. * Remove unused import. * Fix errorprone annotation. * Fixes for javadocs and inspections. * Additional test coverage. * Fix test. --- .../GroupByTypeInterfaceBenchmark.java | 4 +- .../CachingClusteredClientBenchmark.java | 3 +- .../benchmark/query/GroupByBenchmark.java | 4 +- .../segment/MapVirtualColumnGroupByTest.java | 6 +- .../msq/exec/ControllerMemoryParameters.java | 34 +- .../org/apache/druid/msq/exec/Limits.java | 2 +- .../druid/msq/exec/MemoryIntrospector.java | 33 +- .../msq/exec/MemoryIntrospectorImpl.java | 94 ++- .../druid/msq/exec/ProcessingBuffers.java | 63 ++ .../msq/exec/ProcessingBuffersProvider.java | 58 ++ .../druid/msq/exec/ProcessingBuffersSet.java | 92 +++ .../apache/druid/msq/exec/RunWorkOrder.java | 34 +- .../apache/druid/msq/exec/WorkerContext.java | 6 +- .../org/apache/druid/msq/exec/WorkerImpl.java | 8 +- .../msq/exec/WorkerMemoryParameters.java | 732 ++++++++---------- .../guice/IndexerMemoryManagementModule.java | 46 +- .../msq/guice/PeonMemoryManagementModule.java | 64 +- .../msq/guice/TaskMemoryManagementConfig.java | 51 ++ .../msq/indexing/IndexerFrameContext.java | 25 +- .../IndexerProcessingBuffersProvider.java | 89 +++ .../msq/indexing/IndexerWorkerContext.java | 48 +- .../PeonProcessingBuffersProvider.java | 98 +++ .../indexing/error/NotEnoughMemoryFault.java | 43 +- .../error/TooManyRowsWithSameKeyFault.java | 5 +- ...SegmentGeneratorFrameProcessorFactory.java | 31 +- .../apache/druid/msq/input/InputSpecs.java | 3 + .../apache/druid/msq/kernel/FrameContext.java | 18 +- .../msq/kernel/FrameProcessorFactory.java | 5 + .../druid/msq/kernel/StageDefinition.java | 13 +- .../msq/kernel/worker/WorkerStagePhase.java | 4 + .../BaseLeafFrameProcessorFactory.java | 2 +- .../BroadcastJoinSegmentMapFnProcessor.java | 2 +- ...dowOperatorQueryFrameProcessorFactory.java | 5 + .../OffsetLimitFrameProcessorFactory.java | 6 + .../SortMergeJoinFrameProcessorFactory.java | 6 + ...oupByPostShuffleFrameProcessorFactory.java | 6 + .../GroupByPreShuffleFrameProcessor.java | 7 + ...roupByPreShuffleFrameProcessorFactory.java | 7 + .../ExportResultsFrameProcessorFactory.java | 6 + .../QueryResultFrameProcessorFactory.java | 6 + .../scan/ScanQueryFrameProcessorFactory.java | 6 + .../exec/ControllerMemoryParametersTest.java | 13 +- .../apache/druid/msq/exec/MSQInsertTest.java | 2 +- .../msq/exec/WorkerMemoryParametersTest.java | 443 ++++++++--- .../indexing/IndexerWorkerContextTest.java | 1 + .../msq/indexing/error/MSQFaultSerdeTest.java | 2 +- .../querykit/ChainedProcessorManagerTest.java | 4 +- .../apache/druid/msq/test/MSQTestBase.java | 25 +- .../druid/msq/test/MSQTestWorkerContext.java | 18 +- .../collections/QueueNonBlockingPool.java | 48 ++ .../ConcurrencyLimitedProcessorManager.java | 74 ++ .../groupby/GroupByQueryRunnerFactory.java | 31 +- .../druid/query/groupby/GroupingEngine.java | 9 +- .../collections/QueueNonBlockingPoolTest.java | 76 ++ ...oncurrencyLimitedProcessorManagerTest.java | 103 +++ .../processor/manager/NilFrameProcessor.java | 60 ++ .../manager/SequenceProcessorManagerTest.java | 38 +- ...ByLimitPushDownInsufficientBufferTest.java | 8 +- ...roupByLimitPushDownMultiNodeMergeTest.java | 12 +- .../groupby/GroupByMultiSegmentTest.java | 4 +- .../groupby/GroupByQueryMergeBufferTest.java | 3 +- .../GroupByQueryQueryToolChestTest.java | 7 - .../GroupByQueryRunnerFailureTest.java | 9 +- 
.../query/groupby/GroupByQueryRunnerTest.java | 3 +- .../groupby/NestedQueryPushDownTest.java | 8 +- .../groupby/UnnestGroupByQueryRunnerTest.java | 3 +- .../druid/segment/CursorHolderPreaggTest.java | 9 +- 67 files changed, 1957 insertions(+), 831 deletions(-) create mode 100644 extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ProcessingBuffers.java create mode 100644 extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ProcessingBuffersProvider.java create mode 100644 extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ProcessingBuffersSet.java create mode 100644 extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/TaskMemoryManagementConfig.java create mode 100644 extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerProcessingBuffersProvider.java create mode 100644 extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/PeonProcessingBuffersProvider.java create mode 100644 processing/src/main/java/org/apache/druid/collections/QueueNonBlockingPool.java create mode 100644 processing/src/main/java/org/apache/druid/frame/processor/manager/ConcurrencyLimitedProcessorManager.java create mode 100644 processing/src/test/java/org/apache/druid/collections/QueueNonBlockingPoolTest.java create mode 100644 processing/src/test/java/org/apache/druid/frame/processor/manager/ConcurrencyLimitedProcessorManagerTest.java create mode 100644 processing/src/test/java/org/apache/druid/frame/processor/manager/NilFrameProcessor.java diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/GroupByTypeInterfaceBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/GroupByTypeInterfaceBenchmark.java index 95d59856395f..bbff131e8671 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/GroupByTypeInterfaceBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/GroupByTypeInterfaceBenchmark.java @@ -378,7 +378,6 @@ public String getFormatString() final GroupingEngine groupingEngine = new GroupingEngine( druidProcessingConfig, configSupplier, - bufferPool, groupByResourcesReservationPool, TestHelper.makeJsonMapper(), new ObjectMapper(new SmileFactory()), @@ -387,7 +386,8 @@ public String getFormatString() factory = new GroupByQueryRunnerFactory( groupingEngine, - new GroupByQueryQueryToolChest(groupingEngine, groupByResourcesReservationPool) + new GroupByQueryQueryToolChest(groupingEngine, groupByResourcesReservationPool), + bufferPool ); } diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/query/CachingClusteredClientBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/query/CachingClusteredClientBenchmark.java index 24afa1e84772..8e0715e0fe5c 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/query/CachingClusteredClientBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/query/CachingClusteredClientBenchmark.java @@ -362,14 +362,13 @@ private static GroupByQueryRunnerFactory makeGroupByQueryRunnerFactory( final GroupingEngine groupingEngine = new GroupingEngine( processingConfig, configSupplier, - bufferPool, groupByResourcesReservationPool, mapper, mapper, QueryRunnerTestHelper.NOOP_QUERYWATCHER ); final GroupByQueryQueryToolChest toolChest = new GroupByQueryQueryToolChest(groupingEngine, groupByResourcesReservationPool); - return new GroupByQueryRunnerFactory(groupingEngine, toolChest); + return new GroupByQueryRunnerFactory(groupingEngine, toolChest, 
bufferPool); } @TearDown(Level.Trial) diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/query/GroupByBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/query/GroupByBenchmark.java index e7220cc286d9..5ab19b6235f7 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/query/GroupByBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/query/GroupByBenchmark.java @@ -495,7 +495,6 @@ public String getFormatString() final GroupingEngine groupingEngine = new GroupingEngine( druidProcessingConfig, configSupplier, - bufferPool, groupByResourcesReservationPool, TestHelper.makeJsonMapper(), new ObjectMapper(new SmileFactory()), @@ -504,7 +503,8 @@ public String getFormatString() factory = new GroupByQueryRunnerFactory( groupingEngine, - new GroupByQueryQueryToolChest(groupingEngine, groupByResourcesReservationPool) + new GroupByQueryQueryToolChest(groupingEngine, groupByResourcesReservationPool), + bufferPool ); } diff --git a/extensions-contrib/virtual-columns/src/test/java/org/apache/druid/segment/MapVirtualColumnGroupByTest.java b/extensions-contrib/virtual-columns/src/test/java/org/apache/druid/segment/MapVirtualColumnGroupByTest.java index 9271b9b3e988..c1fed4bc5034 100644 --- a/extensions-contrib/virtual-columns/src/test/java/org/apache/druid/segment/MapVirtualColumnGroupByTest.java +++ b/extensions-contrib/virtual-columns/src/test/java/org/apache/druid/segment/MapVirtualColumnGroupByTest.java @@ -99,8 +99,7 @@ public int getNumThreads() return 1; } }, - () -> config, - new StupidPool<>("map-virtual-column-groupby-test", () -> ByteBuffer.allocate(1024)), + GroupByQueryConfig::new, groupByResourcesReservationPool, TestHelper.makeJsonMapper(), new DefaultObjectMapper(), @@ -109,7 +108,8 @@ public int getNumThreads() final GroupByQueryRunnerFactory factory = new GroupByQueryRunnerFactory( groupingEngine, - new GroupByQueryQueryToolChest(groupingEngine, groupByResourcesReservationPool) + new GroupByQueryQueryToolChest(groupingEngine, groupByResourcesReservationPool), + new StupidPool<>("map-virtual-column-groupby-test", () -> ByteBuffer.allocate(1024)) ); runner = QueryRunnerTestHelper.makeQueryRunner( diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerMemoryParameters.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerMemoryParameters.java index 2ab016e10e48..c5131ddd84ec 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerMemoryParameters.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerMemoryParameters.java @@ -19,7 +19,6 @@ package org.apache.druid.msq.exec; -import com.google.common.base.Preconditions; import org.apache.druid.msq.indexing.error.MSQException; import org.apache.druid.msq.indexing.error.NotEnoughMemoryFault; import org.apache.druid.msq.kernel.controller.ControllerQueryKernel; @@ -29,10 +28,10 @@ * Class for determining how much JVM heap to allocate to various purposes for {@link Controller}. * * First, look at how much of total JVM heap that is dedicated for MSQ; see - * {@link MemoryIntrospector#usableMemoryInJvm()}. + * {@link MemoryIntrospector#memoryPerTask()}. * * Then, we split up that total amount of memory into equally-sized portions per {@link Controller}; see - * {@link MemoryIntrospector#numQueriesInJvm()}. The number of controllers is based entirely on server configuration, + * {@link MemoryIntrospector#numTasksInJvm()}. 
The number of controllers is based entirely on server configuration, * which makes the calculation robust to different queries running simultaneously in the same JVM. * * Then, we split that up into a chunk used for input channels, and a chunk used for partition statistics. @@ -70,29 +69,28 @@ public static ControllerMemoryParameters createProductionInstance( final int maxWorkerCount ) { - final long usableMemoryInJvm = memoryIntrospector.usableMemoryInJvm(); - final int numControllersInJvm = memoryIntrospector.numQueriesInJvm(); - Preconditions.checkArgument(usableMemoryInJvm > 0, "Usable memory[%s] must be > 0", usableMemoryInJvm); - Preconditions.checkArgument(numControllersInJvm > 0, "Number of controllers[%s] must be > 0", numControllersInJvm); - Preconditions.checkArgument(maxWorkerCount > 0, "Number of workers[%s] must be > 0", maxWorkerCount); - - final long memoryPerController = usableMemoryInJvm / numControllersInJvm; - final long memoryForInputChannels = WorkerMemoryParameters.memoryNeededForInputChannels(maxWorkerCount); + final long totalMemory = memoryIntrospector.memoryPerTask(); + final long memoryForInputChannels = + WorkerMemoryParameters.computeProcessorMemoryForInputChannels( + maxWorkerCount, + WorkerMemoryParameters.DEFAULT_FRAME_SIZE + ); final int partitionStatisticsMaxRetainedBytes = (int) Math.min( - memoryPerController - memoryForInputChannels, + totalMemory - memoryForInputChannels, PARTITION_STATS_MAX_MEMORY ); if (partitionStatisticsMaxRetainedBytes < PARTITION_STATS_MIN_MEMORY) { - final long requiredMemory = memoryForInputChannels + PARTITION_STATS_MIN_MEMORY; + final long requiredTaskMemory = memoryForInputChannels + PARTITION_STATS_MIN_MEMORY; throw new MSQException( new NotEnoughMemoryFault( - memoryIntrospector.computeJvmMemoryRequiredForUsableMemory(requiredMemory), + memoryIntrospector.computeJvmMemoryRequiredForTaskMemory(requiredTaskMemory), memoryIntrospector.totalMemoryInJvm(), - usableMemoryInJvm, - numControllersInJvm, - memoryIntrospector.numProcessorsInJvm(), - 0 + memoryIntrospector.memoryPerTask(), + memoryIntrospector.numTasksInJvm(), + memoryIntrospector.numProcessingThreads(), + maxWorkerCount, + 1 ) ); } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/Limits.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/Limits.java index bb782cb67d9a..fd2107762777 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/Limits.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/Limits.java @@ -24,7 +24,7 @@ public class Limits /** * Maximum number of columns that can appear in a frame signature. *

    - * Somewhat less than {@link WorkerMemoryParameters#STANDARD_FRAME_SIZE} divided by typical minimum column size: + * Somewhat less than {@link WorkerMemoryParameters#DEFAULT_FRAME_SIZE} divided by typical minimum column size: * {@link org.apache.druid.frame.allocation.AppendableMemory#DEFAULT_INITIAL_ALLOCATION_SIZE}. */ public static final int MAX_FRAME_COLUMNS = 2000; diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/MemoryIntrospector.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/MemoryIntrospector.java index 337e36d14efa..76fcb33005a0 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/MemoryIntrospector.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/MemoryIntrospector.java @@ -19,10 +19,8 @@ package org.apache.druid.msq.exec; -import org.apache.druid.msq.kernel.WorkOrder; - /** - * Introspector used to generate {@link ControllerMemoryParameters}. + * Introspector used to generate {@link WorkerMemoryParameters} and {@link ControllerMemoryParameters}. */ public interface MemoryIntrospector { @@ -32,34 +30,23 @@ public interface MemoryIntrospector long totalMemoryInJvm(); /** - * Amount of memory usable for the multi-stage query engine in the entire JVM. - * - * This may be an expensive operation. For example, the production implementation {@link MemoryIntrospectorImpl} - * estimates size of all lookups as part of computing this value. + * Amount of memory alloted to each {@link Worker} or {@link Controller}. */ - long usableMemoryInJvm(); + long memoryPerTask(); /** - * Amount of total JVM memory required for a particular amount of usable memory to be available. - * - * This may be an expensive operation. For example, the production implementation {@link MemoryIntrospectorImpl} - * estimates size of all lookups as part of computing this value. + * Computes the amount of total JVM memory that would be required for a particular memory allotment per task, i.e., + * a particular return value from {@link #memoryPerTask()}. */ - long computeJvmMemoryRequiredForUsableMemory(long usableMemory); + long computeJvmMemoryRequiredForTaskMemory(long memoryPerTask); /** - * Maximum number of queries that run simultaneously in this JVM. - * - * On workers, this is the maximum number of {@link Worker} that run simultaneously in this JVM. See - * {@link WorkerMemoryParameters} for how memory is divided among and within {@link WorkOrder} run by a worker. - * - * On controllers, this is the maximum number of {@link Controller} that run simultaneously. See - * {@link ControllerMemoryParameters} for how memory is used by controllers. + * Maximum number of tasks ({@link Worker} or {@link Controller}) that run simultaneously in this JVM. */ - int numQueriesInJvm(); + int numTasksInJvm(); /** - * Maximum number of processing threads that can be used at once in this JVM. + * Maximum number of processing threads that can be used at once by each {@link Worker} or {@link Controller}. 
*/ - int numProcessorsInJvm(); + int numProcessingThreads(); } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/MemoryIntrospectorImpl.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/MemoryIntrospectorImpl.java index f7cd501ed8fd..93d0b9de2713 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/MemoryIntrospectorImpl.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/MemoryIntrospectorImpl.java @@ -20,12 +20,14 @@ package org.apache.druid.msq.exec; import com.google.common.collect.ImmutableList; +import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.logger.Logger; import org.apache.druid.query.DruidProcessingConfig; import org.apache.druid.query.lookup.LookupExtractor; import org.apache.druid.query.lookup.LookupExtractorFactoryContainer; import org.apache.druid.query.lookup.LookupExtractorFactoryContainerProvider; +import javax.annotation.Nullable; import java.util.List; /** @@ -34,37 +36,47 @@ public class MemoryIntrospectorImpl implements MemoryIntrospector { private static final Logger log = new Logger(MemoryIntrospectorImpl.class); + private static final long LOOKUP_FOOTPRINT_INIT = Long.MIN_VALUE; - private final LookupExtractorFactoryContainerProvider lookupProvider; private final long totalMemoryInJvm; - private final int numQueriesInJvm; - private final int numProcessorsInJvm; private final double usableMemoryFraction; + private final int numTasksInJvm; + private final int numProcessingThreads; + + /** + * Lookup footprint per task, set the first time {@link #memoryPerTask()} is called. + */ + private volatile long lookupFootprint = LOOKUP_FOOTPRINT_INIT; + + @Nullable + private final LookupExtractorFactoryContainerProvider lookupProvider; /** * Create an introspector. * - * @param lookupProvider provider of lookups; we use this to subtract lookup size from total JVM memory when - * computing usable memory * @param totalMemoryInJvm maximum JVM heap memory * @param usableMemoryFraction fraction of JVM memory, after subtracting lookup overhead, that we consider usable - * for multi-stage queries - * @param numQueriesInJvm maximum number of {@link Controller} or {@link Worker} that may run concurrently - * @param numProcessorsInJvm size of processing thread pool, typically {@link DruidProcessingConfig#getNumThreads()} + * for {@link Controller} or {@link Worker} + * @param numTasksInJvm maximum number of {@link Controller} or {@link Worker} that may run concurrently + * @param numProcessingThreads size of processing thread pool, typically {@link DruidProcessingConfig#getNumThreads()} + * @param lookupProvider provider of lookups; we use this to subtract lookup size from total JVM memory when + * computing usable memory. Ignored if null. This is used once the first time + * {@link #memoryPerTask()} is called, then the footprint is cached. As such, it provides + * a point-in-time view only. 
*/ public MemoryIntrospectorImpl( - final LookupExtractorFactoryContainerProvider lookupProvider, final long totalMemoryInJvm, final double usableMemoryFraction, - final int numQueriesInJvm, - final int numProcessorsInJvm + final int numTasksInJvm, + final int numProcessingThreads, + @Nullable final LookupExtractorFactoryContainerProvider lookupProvider ) { - this.lookupProvider = lookupProvider; this.totalMemoryInJvm = totalMemoryInJvm; - this.numQueriesInJvm = numQueriesInJvm; - this.numProcessorsInJvm = numProcessorsInJvm; this.usableMemoryFraction = usableMemoryFraction; + this.numTasksInJvm = numTasksInJvm; + this.numProcessingThreads = numProcessingThreads; + this.lookupProvider = lookupProvider; } @Override @@ -74,33 +86,52 @@ public long totalMemoryInJvm() } @Override - public long usableMemoryInJvm() + public long memoryPerTask() { - final long totalMemory = totalMemoryInJvm(); - final long totalLookupFootprint = computeTotalLookupFootprint(true); return Math.max( 0, - (long) ((totalMemory - totalLookupFootprint) * usableMemoryFraction) + (long) ((totalMemoryInJvm - getTotalLookupFootprint()) * usableMemoryFraction) / numTasksInJvm ); } @Override - public long computeJvmMemoryRequiredForUsableMemory(long usableMemory) + public long computeJvmMemoryRequiredForTaskMemory(long memoryPerTask) { - final long totalLookupFootprint = computeTotalLookupFootprint(false); - return (long) Math.ceil(usableMemory / usableMemoryFraction + totalLookupFootprint); + if (memoryPerTask <= 0) { + throw new IAE("Invalid memoryPerTask[%d], expected a positive number", memoryPerTask); + } + + return (long) Math.ceil(memoryPerTask * numTasksInJvm / usableMemoryFraction) + getTotalLookupFootprint(); } @Override - public int numQueriesInJvm() + public int numTasksInJvm() { - return numQueriesInJvm; + return numTasksInJvm; } @Override - public int numProcessorsInJvm() + public int numProcessingThreads() { - return numProcessorsInJvm; + return numProcessingThreads; + } + + /** + * Get a possibly-cached value of {@link #computeTotalLookupFootprint()}. The underlying computation method is + * called just once, meaning this is not a good way to track the size of lookups over time. This is done to keep + * memory calculations as consistent as possible. + */ + private long getTotalLookupFootprint() + { + if (lookupFootprint == LOOKUP_FOOTPRINT_INIT) { + synchronized (this) { + if (lookupFootprint == LOOKUP_FOOTPRINT_INIT) { + lookupFootprint = computeTotalLookupFootprint(); + } + } + } + + return lookupFootprint; } /** @@ -108,11 +139,13 @@ public int numProcessorsInJvm() * * Correctness of this approach depends on lookups being loaded *before* calling this method. Luckily, this is the * typical mode of operation, since by default druid.lookup.enableLookupSyncOnStartup = true. 
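
The getTotalLookupFootprint method above is standard double-checked-locking memoization around a sentinel value: the expensive computation runs at most once per instance, and all later reads see the cached result. A stripped-down sketch of the same pattern (class and method names are hypothetical, and the returned value is a stand-in):

    // Minimal sketch of the lazily-cached-footprint pattern used above. The
    // sentinel marks "not yet computed"; the volatile field plus the second
    // check inside the lock ensure the computation happens only once.
    public class CachedFootprint
    {
      private static final long NOT_COMPUTED = Long.MIN_VALUE;
      private volatile long footprint = NOT_COMPUTED;

      public long get()
      {
        if (footprint == NOT_COMPUTED) {
          synchronized (this) {
            if (footprint == NOT_COMPUTED) {
              footprint = computeFootprint();
            }
          }
        }
        return footprint;
      }

      private long computeFootprint()
      {
        return 42_000_000L; // stand-in for the expensive lookup-size estimation
      }
    }
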
- * - * @param logFootprint whether footprint should be logged */ - private long computeTotalLookupFootprint(final boolean logFootprint) + private long computeTotalLookupFootprint() { + if (lookupProvider == null) { + return 0; + } + final List lookupNames = ImmutableList.copyOf(lookupProvider.getAllLookupNames()); long lookupFootprint = 0; @@ -131,10 +164,7 @@ private long computeTotalLookupFootprint(final boolean logFootprint) } } - if (logFootprint) { - log.info("Lookup footprint: lookup count[%d], total bytes[%,d].", lookupNames.size(), lookupFootprint); - } - + log.info("Lookup footprint: lookup count[%d], total bytes[%,d].", lookupNames.size(), lookupFootprint); return lookupFootprint; } } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ProcessingBuffers.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ProcessingBuffers.java new file mode 100644 index 000000000000..b12f23be8519 --- /dev/null +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ProcessingBuffers.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.msq.exec; + +import org.apache.druid.collections.NonBlockingPool; +import org.apache.druid.collections.QueueNonBlockingPool; +import org.apache.druid.frame.processor.Bouncer; +import org.apache.druid.msq.kernel.FrameContext; + +import java.nio.ByteBuffer; +import java.util.Collection; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.BlockingQueue; + +/** + * Holds a processing buffer pool, and a {@link Bouncer} used to limit concurrent access to the buffer pool. + * Thread-safe. Used by {@link RunWorkOrder} by way of {@link FrameContext#processingBuffers()}. 
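
For orientation, a hypothetical caller of the class defined just below might look like this. It assumes the generic signatures that this rendering of the diff elides (Collection<ByteBuffer> for fromCollection, NonBlockingPool<ByteBuffer> for the pool); the buffer count and sizes are invented:

    import java.nio.ByteBuffer;
    import java.util.Arrays;
    import java.util.List;

    // Hypothetical usage sketch for the ProcessingBuffers class introduced below.
    public class ProcessingBuffersUsageSketch
    {
      public static void main(String[] args)
      {
        // Two 1 MB processing buffers (sizes invented for illustration).
        final List<ByteBuffer> buffers = Arrays.asList(
            ByteBuffer.allocate(1_000_000),
            ByteBuffer.allocate(1_000_000)
        );

        // fromCollection wraps the buffers in a queue-backed pool and sizes the
        // Bouncer to match, so at most buffers.size() processors hold one at a time.
        final ProcessingBuffers processingBuffers = ProcessingBuffers.fromCollection(buffers);
        System.out.println(processingBuffers.getBouncer().getMaxCount()); // 2
      }
    }
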
+ */ +public class ProcessingBuffers +{ + private final NonBlockingPool bufferPool; + private final Bouncer bouncer; + + public ProcessingBuffers(final NonBlockingPool bufferPool, final Bouncer bouncer) + { + this.bufferPool = bufferPool; + this.bouncer = bouncer; + } + + public static ProcessingBuffers fromCollection(final Collection bufferPool) + { + final BlockingQueue queue = new ArrayBlockingQueue<>(bufferPool.size()); + queue.addAll(bufferPool); + return new ProcessingBuffers(new QueueNonBlockingPool<>(queue), new Bouncer(queue.size())); + } + + public NonBlockingPool getBufferPool() + { + return bufferPool; + } + + public Bouncer getBouncer() + { + return bouncer; + } +} diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ProcessingBuffersProvider.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ProcessingBuffersProvider.java new file mode 100644 index 000000000000..fb77d1c30783 --- /dev/null +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ProcessingBuffersProvider.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.msq.exec; + +import org.apache.druid.collections.ResourceHolder; +import org.apache.druid.msq.kernel.FrameProcessorFactory; +import org.apache.druid.msq.kernel.QueryDefinition; + +/** + * Provides processing buffers for {@link org.apache.druid.msq.kernel.WorkOrder}. Thread-safe, shared by all + * {@link Worker} in a particular JVM. + */ +public interface ProcessingBuffersProvider +{ + /** + * Acquire buffers for a {@link Worker}. + */ + ResourceHolder acquire(int poolSize); + + /** + * Acquire buffers for a {@link Worker}, using a pool size equal to the minimum of + * {@link WorkerContext#maxConcurrentStages()} and the number of stages in the query where + * {@link FrameProcessorFactory#usesProcessingBuffers()}. (These are both caps on the number of concurrent + * stages that will need processing buffers at once.) 
+ */ + default ResourceHolder acquire( + final QueryDefinition queryDef, + final int maxConcurrentStages + ) + { + final int poolSize = Math.min( + maxConcurrentStages, + (int) queryDef.getStageDefinitions() + .stream() + .filter(stageDef -> stageDef.getProcessorFactory().usesProcessingBuffers()) + .count() + ); + + return acquire(poolSize); + } +} diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ProcessingBuffersSet.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ProcessingBuffersSet.java new file mode 100644 index 000000000000..7f81a9c4a9c1 --- /dev/null +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ProcessingBuffersSet.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.msq.exec; + +import org.apache.druid.collections.ResourceHolder; +import org.apache.druid.error.DruidException; +import org.apache.druid.msq.kernel.StageDefinition; + +import javax.annotation.Nullable; +import java.nio.ByteBuffer; +import java.util.Collection; +import java.util.Collections; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.BlockingQueue; +import java.util.stream.Collectors; + +/** + * Holds a set of {@link ProcessingBuffers} for a {@link Worker}. Acquired from {@link ProcessingBuffersProvider}. + */ +public class ProcessingBuffersSet +{ + public static final ProcessingBuffersSet EMPTY = new ProcessingBuffersSet(Collections.emptyList()); + + private final BlockingQueue pool; + + public ProcessingBuffersSet(Collection buffers) + { + this.pool = new ArrayBlockingQueue<>(buffers.isEmpty() ? 1 : buffers.size()); + this.pool.addAll(buffers); + } + + /** + * Equivalent to calling {@link ProcessingBuffers#fromCollection} on each collection in the overall collection, + * then creating an instance. + */ + public static > ProcessingBuffersSet fromCollection(final Collection processingBuffers) + { + return new ProcessingBuffersSet( + processingBuffers.stream() + .map(ProcessingBuffers::fromCollection) + .collect(Collectors.toList()) + ); + } + + @Nullable + public ResourceHolder acquireForStage(final StageDefinition stageDef) + { + if (!stageDef.getProcessorFactory().usesProcessingBuffers()) { + return null; + } + + final ProcessingBuffers buffers = pool.poll(); + + if (buffers == null) { + // Never happens, because the pool acquired from ProcessingBuffersProvider must be big enough for all + // concurrent processing buffer needs. (In other words: if this does happen, it's a bug.) 
+ throw DruidException.defensive("Processing buffers not available"); + } + + return new ResourceHolder() + { + @Override + public ProcessingBuffers get() + { + return buffers; + } + + @Override + public void close() + { + pool.add(buffers); + } + }; + } +} diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/RunWorkOrder.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/RunWorkOrder.java index a4d6a2180bde..4d028147af02 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/RunWorkOrder.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/RunWorkOrder.java @@ -242,7 +242,7 @@ private void makeInputSliceReader() workOrder.getQueryDefinition(), InputSlices.allReadablePartitions(workOrder.getInputs()), inputChannelFactory, - () -> ArenaMemoryAllocator.createOnHeap(frameContext.memoryParameters().getStandardFrameSize()), + () -> ArenaMemoryAllocator.createOnHeap(frameContext.memoryParameters().getFrameSize()), exec, cancellationId, counterTracker, @@ -270,18 +270,8 @@ private void makeWorkOutputChannelFactory() final OutputChannelFactory baseOutputChannelFactory; if (workOrder.getStageDefinition().doesShuffle()) { - // Writing to a consumer in the same JVM (which will be set up later on in this method). Use the large frame - // size if we're writing to a SuperSorter, since we'll generate fewer temp files if we use larger frames. - // Otherwise, use the standard frame size. - final int frameSize; - - if (workOrder.getStageDefinition().getShuffleSpec().kind().isSort()) { - frameSize = frameContext.memoryParameters().getLargeFrameSize(); - } else { - frameSize = frameContext.memoryParameters().getStandardFrameSize(); - } - - baseOutputChannelFactory = new BlockingQueueOutputChannelFactory(frameSize); + // Writing to a consumer in the same JVM (which will be set up later on in this method). + baseOutputChannelFactory = new BlockingQueueOutputChannelFactory(frameContext.memoryParameters().getFrameSize()); } else { // Writing stage output. baseOutputChannelFactory = makeStageOutputChannelFactory(); @@ -353,7 +343,7 @@ private workResultFuture = exec.runAllFully( counterTracker.trackCpu(processorManager, CpuCounters.LABEL_MAIN), maxOutstandingProcessors, - frameContext.processorBouncer(), + processorFactory.usesProcessingBuffers() ? frameContext.processingBuffers().getBouncer() : Bouncer.unlimited(), cancellationId ); @@ -394,13 +384,13 @@ private void makeAndRunShuffleProcessors() if (shuffleSpec.partitionCount() == 1) { // Single partition; no need to write temporary files. hashOutputChannelFactory = - new BlockingQueueOutputChannelFactory(frameContext.memoryParameters().getStandardFrameSize()); + new BlockingQueueOutputChannelFactory(frameContext.memoryParameters().getFrameSize()); } else { // Multi-partition; write temporary files and then sort each one file-by-file. 
hashOutputChannelFactory = new FileOutputChannelFactory( frameContext.tempDir("hash-parts"), - frameContext.memoryParameters().getStandardFrameSize(), + frameContext.memoryParameters().getFrameSize(), null ); } @@ -490,7 +480,7 @@ private void writeDurableStorageSuccessFile() final DurableStorageOutputChannelFactory durableStorageOutputChannelFactory = makeDurableStorageOutputChannelFactory( frameContext.tempDir("durable"), - frameContext.memoryParameters().getStandardFrameSize(), + frameContext.memoryParameters().getFrameSize(), workOrder.getOutputChannelMode() == OutputChannelMode.DURABLE_STORAGE_QUERY_RESULTS ); @@ -510,7 +500,7 @@ private OutputChannelFactory makeStageOutputChannelFactory() { // Use the standard frame size, since we assume this size when computing how much is needed to merge output // files from different workers. - final int frameSize = frameContext.memoryParameters().getStandardFrameSize(); + final int frameSize = frameContext.memoryParameters().getFrameSize(); final OutputChannelMode outputChannelMode = workOrder.getOutputChannelMode(); switch (outputChannelMode) { @@ -542,7 +532,7 @@ private OutputChannelFactory makeStageOutputChannelFactory() private OutputChannelFactory makeSuperSorterIntermediateOutputChannelFactory(final File tmpDir) { - final int frameSize = frameContext.memoryParameters().getLargeFrameSize(); + final int frameSize = frameContext.memoryParameters().getFrameSize(); final File fileChannelDirectory = new File(tmpDir, StringUtils.format("intermediate_output_stage_%06d", workOrder.getStageNumber())); final FileOutputChannelFactory fileOutputChannelFactory = @@ -736,8 +726,8 @@ public FrameProcessor decorate(FrameProcessor processor) }, outputChannelFactory, makeSuperSorterIntermediateOutputChannelFactory(sorterTmpDir), - memoryParameters.getSuperSorterMaxActiveProcessors(), - memoryParameters.getSuperSorterMaxChannelsPerProcessor(), + memoryParameters.getSuperSorterConcurrentProcessors(), + memoryParameters.getSuperSorterMaxChannelsPerMerger(), stageDefinition.getShuffleSpec().limitHint(), cancellationId, counterTracker.sortProgress(), @@ -774,7 +764,7 @@ public void hashPartition(final OutputChannelFactory outputChannelFactory) workOrder.getStageDefinition().getFrameReader(), workOrder.getStageDefinition().getClusterBy().getColumns().size(), FrameWriters.makeRowBasedFrameWriterFactory( - new ArenaMemoryAllocatorFactory(frameContext.memoryParameters().getStandardFrameSize()), + new ArenaMemoryAllocatorFactory(frameContext.memoryParameters().getFrameSize()), workOrder.getStageDefinition().getSignature(), workOrder.getStageDefinition().getSortKey(), removeNullBytes diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerContext.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerContext.java index 95a4ce7c7ba5..90082fcf0dd0 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerContext.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerContext.java @@ -25,7 +25,6 @@ import org.apache.druid.msq.indexing.MSQWorkerTask; import org.apache.druid.msq.kernel.FrameContext; import org.apache.druid.msq.kernel.FrameProcessorFactory; -import org.apache.druid.msq.kernel.QueryDefinition; import org.apache.druid.msq.kernel.WorkOrder; import org.apache.druid.msq.util.MultiStageQueryContext; import org.apache.druid.server.DruidNode; @@ -79,14 +78,15 @@ public interface WorkerContext WorkerClient makeWorkerClient(); /** 
- * Directory for temporary outputs. + * Directory for temporary outputs, used as a base for {@link FrameContext#tempDir()}. This directory is not + * necessarily fully owned by the worker. */ File tempDir(); /** * Create a context with useful objects required by {@link FrameProcessorFactory#makeProcessors}. */ - FrameContext frameContext(QueryDefinition queryDef, int stageNumber, OutputChannelMode outputChannelMode); + FrameContext frameContext(WorkOrder workOrder); /** * Number of available processing threads. diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerImpl.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerImpl.java index f28d1be5e614..5d9f9b9db541 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerImpl.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerImpl.java @@ -376,13 +376,7 @@ private void handleNewWorkOrder( ? StringUtils.format(", payload[%s]", context.jsonMapper().writeValueAsString(workOrder)) : "") ); - final FrameContext frameContext = kernelHolder.processorCloser.register( - context.frameContext( - workOrder.getQueryDefinition(), - stageDefinition.getStageNumber(), - workOrder.getOutputChannelMode() - ) - ); + final FrameContext frameContext = kernelHolder.processorCloser.register(context.frameContext(workOrder)); kernelHolder.processorCloser.register(() -> { try { workerExec.cancel(cancellationId); diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerMemoryParameters.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerMemoryParameters.java index aeaae030e613..2884efe1f0b0 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerMemoryParameters.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerMemoryParameters.java @@ -19,92 +19,66 @@ package org.apache.druid.msq.exec; -import com.google.common.base.Preconditions; +import com.google.common.collect.Iterables; import com.google.common.primitives.Ints; -import com.google.inject.Injector; +import it.unimi.dsi.fastutil.ints.IntOpenHashSet; import it.unimi.dsi.fastutil.ints.IntSet; -import org.apache.druid.frame.processor.Bouncer; -import org.apache.druid.indexing.worker.config.WorkerConfig; -import org.apache.druid.java.util.common.logger.Logger; +import org.apache.druid.frame.processor.FrameProcessor; +import org.apache.druid.frame.processor.SuperSorter; import org.apache.druid.msq.indexing.error.MSQException; import org.apache.druid.msq.indexing.error.NotEnoughMemoryFault; import org.apache.druid.msq.indexing.error.TooManyWorkersFault; -import org.apache.druid.msq.input.InputSpecs; -import org.apache.druid.msq.kernel.QueryDefinition; +import org.apache.druid.msq.indexing.processor.KeyStatisticsCollectionProcessor; +import org.apache.druid.msq.indexing.processor.SegmentGeneratorFrameProcessorFactory; +import org.apache.druid.msq.input.InputSlice; +import org.apache.druid.msq.input.InputSlices; +import org.apache.druid.msq.input.stage.ReadablePartition; +import org.apache.druid.msq.input.stage.StageInputSlice; +import org.apache.druid.msq.kernel.GlobalSortMaxCountShuffleSpec; +import org.apache.druid.msq.kernel.ShuffleSpec; import org.apache.druid.msq.kernel.StageDefinition; +import org.apache.druid.msq.kernel.WorkOrder; import org.apache.druid.msq.querykit.BroadcastJoinSegmentMapFnProcessor; import 
org.apache.druid.msq.statistics.ClusterByStatisticsCollectorImpl; -import org.apache.druid.query.lookup.LookupExtractor; -import org.apache.druid.query.lookup.LookupExtractorFactoryContainer; -import org.apache.druid.query.lookup.LookupExtractorFactoryContainerProvider; -import org.apache.druid.segment.realtime.appenderator.AppenderatorsManager; -import org.apache.druid.segment.realtime.appenderator.UnifiedIndexerAppenderatorsManager; +import org.apache.druid.segment.incremental.IncrementalIndex; +import javax.annotation.Nullable; +import java.util.List; import java.util.Objects; /** - * Class for determining how much JVM heap to allocate to various purposes. + * Class for determining how much JVM heap to allocate to various purposes for executing a {@link WorkOrder}. * - * First, we take a chunk out of the total JVM heap that is dedicated for MSQ; see {@link #computeUsableMemoryInJvm}. + * First, we split each worker's memory allotment, given by {@link MemoryIntrospector#memoryPerTask()}, into + * equally-sized "bundles" for each {@link WorkOrder} that may be running simultaneously within the {@link Worker} + * for that {@link WorkOrder}. * - * Then, we carve out some space for each worker that may be running in our JVM; see {@link #memoryPerWorker}. + * Within each bundle, we carve out memory required for buffering broadcast data + * (see {@link #computeBroadcastBufferMemory}) and for concurrently-running processors + * (see {@link #computeProcessorMemory}). * - * Then, we split the rest into "bundles" of equal size; see {@link #memoryPerBundle}. The number of bundles is based - * entirely on server configuration; this makes the calculation robust to different queries running simultaneously in - * the same JVM. - * - * Within each bundle, we split up memory in two different ways: one assuming it'll be used for a - * {@link org.apache.druid.frame.processor.SuperSorter}, and one assuming it'll be used for a regular - * processor. Callers can then use whichever set of allocations makes sense. (We assume no single bundle - * will be used for both purposes.) + * The remainder is called "bundle free memory", a pool of memory that can be used for {@link SuperSorter} or + * {@link SegmentGeneratorFrameProcessorFactory}. The amounts overlap, because the same {@link WorkOrder} never + * does both. */ public class WorkerMemoryParameters { - private static final Logger log = new Logger(WorkerMemoryParameters.class); - /** - * Percent of memory that we allocate to bundles. It is less than 100% because we need to leave some space - * left over for miscellaneous other stuff, and to ensure that GC pressure does not get too high. + * Default size for frames. */ - static final double USABLE_MEMORY_FRACTION = 0.75; + public static final int DEFAULT_FRAME_SIZE = 1_000_000; /** - * Percent of each bundle's memory that we allocate to appenderators. It is less than 100% because appenderators - * unfortunately have a variety of unaccounted-for memory usage. - */ - static final double APPENDERATOR_MEMORY_FRACTION = 0.67; - - /** - * Size for "standard frames", which are used for most purposes, except inputs to super-sorters. - * - * In particular, frames that travel between workers are always the minimum size. This is helpful because it makes - * it easier to compute the amount of memory needed to merge input streams. + * Amount of extra memory available for each processing thread, beyond what is needed for input and output + * channels. 
This memory is used for miscellaneous purposes within the various {@link FrameProcessor}. */ - private static final int STANDARD_FRAME_SIZE = 1_000_000; + private static final long EXTRA_MEMORY_PER_PROCESSOR = 25_000_000; /** - * Size for "large frames", which are used for inputs and inner channels in to super-sorters. - * - * This is helpful because it minimizes the number of temporary files needed during super-sorting. - */ - private static final int LARGE_FRAME_SIZE = 8_000_000; - - /** - * Minimum amount of bundle memory available for processing (i.e., total bundle size minus the amount - * needed for input channels). This memory is guaranteed to be available for things like segment generation - * and broadcast data. - */ - public static final long PROCESSING_MINIMUM_BYTES = 25_000_000; - - /** - * Maximum amount of parallelism for the super-sorter. Higher amounts of concurrency tend to be wasteful. - */ - private static final int MAX_SUPER_SORTER_PROCESSORS = 4; - - /** - * Each super-sorter must have at least 1 processor with 2 input frames and 1 output frame. That's 3 total. + * Percent of each bundle's free memory that we allocate to appenderators. It is less than 100% because appenderators + * unfortunately have a variety of unaccounted-for memory usage. */ - private static final int MIN_SUPER_SORTER_FRAMES = 3; + private static final double APPENDERATOR_BUNDLE_FREE_MEMORY_FRACTION = 0.67; /** * (Very) rough estimate of the on-heap overhead of reading a column. @@ -112,256 +86,214 @@ public class WorkerMemoryParameters private static final int APPENDERATOR_MERGE_ROUGH_MEMORY_PER_COLUMN = 3_000; /** - * Maximum percent of *total* available memory (not each bundle), i.e. {@link #USABLE_MEMORY_FRACTION}, that we'll - * ever use for maxRetainedBytes of {@link ClusterByStatisticsCollectorImpl} across all workers. + * Maximum percent of each bundle's free memory that will be used for maxRetainedBytes of + * {@link ClusterByStatisticsCollectorImpl}. */ - private static final double PARTITION_STATS_MEMORY_MAX_FRACTION = 0.1; + private static final double PARTITION_STATS_MAX_BUNDLE_FREE_MEMORY_FRACTION = 0.1; /** - * Maximum number of bytes we'll ever use for maxRetainedBytes of {@link ClusterByStatisticsCollectorImpl} for - * a single worker. Acts as a limit on the value computed based on {@link #PARTITION_STATS_MEMORY_MAX_FRACTION}. + * Maximum number of bytes from each bundle's free memory that we'll ever use for maxRetainedBytes of + * {@link ClusterByStatisticsCollectorImpl}. Limits the value computed based on + * {@link #PARTITION_STATS_MAX_BUNDLE_FREE_MEMORY_FRACTION}. */ - private static final long PARTITION_STATS_MEMORY_MAX_BYTES = 300_000_000; + private static final long PARTITION_STATS_MAX_MEMORY_PER_BUNDLE = 300_000_000; /** - * Threshold in bytes below which we assume that the worker is "small". While calculating the memory requirements for - * a small worker, we try to be as conservatives with the estimates and the extra temporary space required by the - * frames, since that can add up quickly and cause OOM. + * Minimum number of bytes from each bundle's free memory that we'll use for maxRetainedBytes of + * {@link ClusterByStatisticsCollectorImpl}. */ - private static final long SMALL_WORKER_CAPACITY_THRESHOLD_BYTES = 256_000_000; + private static final long PARTITION_STATS_MIN_MEMORY_PER_BUNDLE = 10_000_000; /** - * Fraction of free memory per bundle that can be used by {@link BroadcastJoinSegmentMapFnProcessor} to store broadcast - * data on-heap. 
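
Taken together, the three partition-stats constants above clamp the statistics budget to 10% of a bundle's free memory, floored at 10 MB and capped at 300 MB. A worked sketch with an invented bundle size:

    // Illustrates the clamp defined by the constants above; the 1 GB figure is
    // hypothetical, the fraction, floor, and cap come from this patch.
    public class PartitionStatsMemorySketch
    {
      public static void main(String[] args)
      {
        final long bundleFreeMemory = 1_000_000_000L; // hypothetical free memory per bundle
        final long statsMemory = Math.max(
            (long) Math.min(bundleFreeMemory * 0.1, 300_000_000),
            10_000_000
        );
        System.out.println(statsMemory); // 100000000: the 10% fraction wins here
      }
    }
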
This is used to limit the total size of input frames, which we expect to expand on-heap. Expansion - * can potentially be somewhat over 2x: for example, strings are UTF-8 in frames, but are UTF-16 on-heap, which is - * a 2x expansion, and object and index overhead must be considered on top of that. So we use a value somewhat - * lower than 0.5. + * Fraction of each bundle's total memory that can be used to buffer broadcast inputs. This is used by + * {@link BroadcastJoinSegmentMapFnProcessor} to limit how much joinable data is stored on-heap. This is carved + * directly out of the total bundle memory, which makes its size more predictable and stable: it only depends on + * the total JVM memory, the number of tasks per JVM, and the value of maxConcurrentStages for the query. This + * stability is important, because if the broadcast buffer fills up, the query fails. So any time its size changes, + * we risk queries failing that would formerly have succeeded. */ - static final double BROADCAST_JOIN_MEMORY_FRACTION = 0.3; + private static final double BROADCAST_BUFFER_TOTAL_MEMORY_FRACTION = 0.2; /** - * Fraction of free memory per bundle that can be used by - * {@link org.apache.druid.msq.querykit.common.SortMergeJoinFrameProcessor} to buffer frames in its trackers. + * Multiplier to apply to {@link #BROADCAST_BUFFER_TOTAL_MEMORY_FRACTION} when determining how much free bundle + * memory is left over. This fudge factor exists because {@link BroadcastJoinSegmentMapFnProcessor} applies data + * size limits based on frame size, which we expect to expand somewhat in memory due to indexing structures in + * {@link org.apache.druid.segment.join.table.FrameBasedIndexedTable}. */ - static final double SORT_MERGE_JOIN_MEMORY_FRACTION = 0.9; + private static final double BROADCAST_BUFFER_OVERHEAD_RATIO = 1.5; /** - * In case {@link NotEnoughMemoryFault} is thrown, a fixed estimation overhead is added when estimating total memory required for the process. + * Amount of memory that can be used by + * {@link org.apache.druid.msq.querykit.common.SortMergeJoinFrameProcessor} to buffer frames in its trackers. 
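
For scale, the replacement constant that follows works out to 22.5 MB per processor. Spelled out (a sketch; 25,000,000 bytes is EXTRA_MEMORY_PER_PROCESSOR from this patch):

    // 90% of the 25 MB per-processor extra memory may buffer sort-merge join frames.
    public class SortMergeJoinMemorySketch
    {
      public static void main(String[] args)
      {
        final long extraMemoryPerProcessor = 25_000_000L;
        final long sortMergeJoinMemory = (long) (extraMemoryPerProcessor * 0.9);
        System.out.println(sortMergeJoinMemory); // 22500000 bytes per processor
      }
    }
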
*/ - private static final long BUFFER_BYTES_FOR_ESTIMATION = 1000; + private static final long SORT_MERGE_JOIN_MEMORY_PER_PROCESSOR = (long) (EXTRA_MEMORY_PER_PROCESSOR * 0.9); - private final long processorBundleMemory; - private final int superSorterMaxActiveProcessors; - private final int superSorterMaxChannelsPerProcessor; + private final long bundleFreeMemory; + private final int frameSize; + private final int superSorterConcurrentProcessors; + private final int superSorterMaxChannelsPerMerger; private final int partitionStatisticsMaxRetainedBytes; - - WorkerMemoryParameters( - final long processorBundleMemory, - final int superSorterMaxActiveProcessors, - final int superSorterMaxChannelsPerProcessor, - final int partitionStatisticsMaxRetainedBytes + private final long broadcastBufferMemory; + + public WorkerMemoryParameters( + final long bundleFreeMemory, + final int frameSize, + final int superSorterConcurrentProcessors, + final int superSorterMaxChannelsPerMerger, + final int partitionStatisticsMaxRetainedBytes, + final long broadcastBufferMemory ) { - this.processorBundleMemory = processorBundleMemory; - this.superSorterMaxActiveProcessors = superSorterMaxActiveProcessors; - this.superSorterMaxChannelsPerProcessor = superSorterMaxChannelsPerProcessor; + this.bundleFreeMemory = bundleFreeMemory; + this.frameSize = frameSize; + this.superSorterConcurrentProcessors = superSorterConcurrentProcessors; + this.superSorterMaxChannelsPerMerger = superSorterMaxChannelsPerMerger; this.partitionStatisticsMaxRetainedBytes = partitionStatisticsMaxRetainedBytes; + this.broadcastBufferMemory = broadcastBufferMemory; } /** - * Create a production instance for {@link org.apache.druid.msq.indexing.MSQWorkerTask}. + * Create a production instance for a given {@link WorkOrder}. */ - public static WorkerMemoryParameters createProductionInstanceForWorker( - final Injector injector, - final QueryDefinition queryDef, - final int stageNumber, + public static WorkerMemoryParameters createProductionInstance( + final WorkOrder workOrder, + final MemoryIntrospector memoryIntrospector, final int maxConcurrentStages ) { - final StageDefinition stageDef = queryDef.getStageDefinition(stageNumber); - final IntSet inputStageNumbers = InputSpecs.getStageNumbers(stageDef.getInputSpecs()); - final int numInputWorkers = - inputStageNumbers.intStream() - .map(inputStageNumber -> queryDef.getStageDefinition(inputStageNumber).getMaxWorkerCount()) - .sum(); - long totalLookupFootprint = computeTotalLookupFootprint(injector); - - final int numHashOutputPartitions; - if (stageDef.doesShuffle() && stageDef.getShuffleSpec().kind().isHash()) { - numHashOutputPartitions = stageDef.getShuffleSpec().partitionCount(); - } else { - numHashOutputPartitions = 0; - } - + final StageDefinition stageDef = workOrder.getStageDefinition(); return createInstance( - Runtime.getRuntime().maxMemory(), - computeNumWorkersInJvm(injector), - computeNumProcessorsInJvm(injector), + memoryIntrospector, + DEFAULT_FRAME_SIZE, + workOrder.getInputs(), + stageDef.getBroadcastInputNumbers(), + stageDef.doesShuffle() ? stageDef.getShuffleSpec() : null, maxConcurrentStages, - numInputWorkers, - numHashOutputPartitions, - totalLookupFootprint + computeFramesPerOutputChannel(workOrder.getOutputChannelMode()) ); } /** - * Returns an object specifying memory-usage parameters. + * Returns an object specifying memory-usage parameters for a {@link WorkOrder} running inside a {@link Worker}. 
* * Throws a {@link MSQException} with an appropriate fault if the provided combination of parameters cannot * yield a workable memory situation. * - * @param maxMemoryInJvm memory available in the entire JVM. This will be divided amongst processors. - * @param numWorkersInJvm number of workers that can run concurrently in this JVM. Generally equal to - * the task capacity. - * @param numProcessingThreadsInJvm size of the processing thread pool in the JVM. - * @param maxConcurrentStages maximum number of concurrent stages per worker. - * @param numInputWorkers total number of workers across all input stages. - * @param numHashOutputPartitions total number of output partitions, if using hash partitioning; zero if not using - * hash partitioning. - * @param totalLookupFootprint estimated size of the lookups loaded by the process. + * @param memoryIntrospector memory introspector + * @param frameSize frame size + * @param inputSlices from {@link WorkOrder#getInputs()} + * @param broadcastInputNumbers from {@link StageDefinition#getBroadcastInputNumbers()} + * @param shuffleSpec from {@link StageDefinition#getShuffleSpec()} + * @param maxConcurrentStages figure from {@link WorkerContext#maxConcurrentStages()} + * @param numFramesPerOutputChannel figure from {@link #computeFramesPerOutputChannel(OutputChannelMode)} + * + * @throws MSQException with {@link TooManyWorkersFault} or {@link NotEnoughMemoryFault} if not enough memory + * is available to generate a usable instance */ public static WorkerMemoryParameters createInstance( - final long maxMemoryInJvm, - final int numWorkersInJvm, - final int numProcessingThreadsInJvm, + final MemoryIntrospector memoryIntrospector, + final int frameSize, + final List inputSlices, + final IntSet broadcastInputNumbers, + @Nullable final ShuffleSpec shuffleSpec, final int maxConcurrentStages, - final int numInputWorkers, - final int numHashOutputPartitions, - final long totalLookupFootprint + final int numFramesPerOutputChannel ) { - Preconditions.checkArgument(maxMemoryInJvm > 0, "Max memory passed: [%s] should be > 0", maxMemoryInJvm); - Preconditions.checkArgument(numWorkersInJvm > 0, "Number of workers: [%s] in jvm should be > 0", numWorkersInJvm); - Preconditions.checkArgument( - numProcessingThreadsInJvm > 0, - "Number of processing threads [%s] should be > 0", - numProcessingThreadsInJvm + final long bundleMemory = computeBundleMemory(memoryIntrospector.memoryPerTask(), maxConcurrentStages); + final long processorMemory = computeProcessorMemory( + computeMaxSimultaneousInputChannelsPerProcessor(inputSlices, broadcastInputNumbers), + frameSize ); - Preconditions.checkArgument(numInputWorkers >= 0, "Number of input workers: [%s] should be >=0", numInputWorkers); - Preconditions.checkArgument( - totalLookupFootprint >= 0, - "Lookup memory footprint: [%s] should be >= 0", - totalLookupFootprint - ); - final long usableMemoryInJvm = computeUsableMemoryInJvm(maxMemoryInJvm, totalLookupFootprint); - final long workerMemory = memoryPerWorker(usableMemoryInJvm, numWorkersInJvm); - final long bundleMemory = - memoryPerBundle(usableMemoryInJvm, numWorkersInJvm, numProcessingThreadsInJvm) / maxConcurrentStages; - final long bundleMemoryForInputChannels = memoryNeededForInputChannels(numInputWorkers); - final long bundleMemoryForHashPartitioning = memoryNeededForHashPartitioning(numHashOutputPartitions); - final long bundleMemoryForProcessing = - bundleMemory - bundleMemoryForInputChannels - bundleMemoryForHashPartitioning; - - if (bundleMemoryForProcessing < 
PROCESSING_MINIMUM_BYTES) { - final int maxWorkers = computeMaxWorkers( - usableMemoryInJvm, - numWorkersInJvm, - numProcessingThreadsInJvm, - maxConcurrentStages, - numHashOutputPartitions - ); - - if (maxWorkers > 0) { - throw new MSQException(new TooManyWorkersFault(numInputWorkers, Math.min(Limits.MAX_WORKERS, maxWorkers))); - } else { - // Not enough memory for even one worker. More of a NotEnoughMemory situation than a TooManyWorkers situation. - throw new MSQException( - new NotEnoughMemoryFault( - calculateSuggestedMinMemoryFromUsableMemory( - estimateUsableMemory( - numWorkersInJvm, - numProcessingThreadsInJvm, - PROCESSING_MINIMUM_BYTES + BUFFER_BYTES_FOR_ESTIMATION + bundleMemoryForInputChannels, - maxConcurrentStages - ), totalLookupFootprint), - maxMemoryInJvm, - usableMemoryInJvm, - numWorkersInJvm, - numProcessingThreadsInJvm, - maxConcurrentStages - ) - ); - } - } - - // Compute memory breakdown for super-sorting bundles. - final int maxNumFramesForSuperSorter = Ints.checkedCast(bundleMemory / WorkerMemoryParameters.LARGE_FRAME_SIZE); - - if (maxNumFramesForSuperSorter < MIN_SUPER_SORTER_FRAMES) { + final boolean hasBroadcastInputs = !broadcastInputNumbers.isEmpty(); + final long broadcastBufferMemory = + hasBroadcastInputs ? computeBroadcastBufferMemoryIncludingOverhead(bundleMemory) : 0; + final int numProcessingThreads = memoryIntrospector.numProcessingThreads(); + final int maxSimultaneousWorkProcessors = Math.min(numProcessingThreads, computeNumInputPartitions(inputSlices)); + final long bundleFreeMemory = + bundleMemory - maxSimultaneousWorkProcessors * processorMemory - broadcastBufferMemory; + + final long minimumBundleFreeMemory = computeMinimumBundleFreeMemory(frameSize, numFramesPerOutputChannel); + if (bundleFreeMemory < minimumBundleFreeMemory) { + final long requiredTaskMemory = bundleMemory - bundleFreeMemory + minimumBundleFreeMemory; throw new MSQException( new NotEnoughMemoryFault( - calculateSuggestedMinMemoryFromUsableMemory( - estimateUsableMemory( - numWorkersInJvm, - (MIN_SUPER_SORTER_FRAMES + BUFFER_BYTES_FOR_ESTIMATION) * LARGE_FRAME_SIZE, - maxConcurrentStages - ), - totalLookupFootprint - ), - maxMemoryInJvm, - usableMemoryInJvm, - numWorkersInJvm, - numProcessingThreadsInJvm, + memoryIntrospector.computeJvmMemoryRequiredForTaskMemory(requiredTaskMemory), + memoryIntrospector.totalMemoryInJvm(), + memoryIntrospector.memoryPerTask(), + memoryIntrospector.numTasksInJvm(), + memoryIntrospector.numProcessingThreads(), + computeNumInputWorkers(inputSlices), maxConcurrentStages ) ); } - final int superSorterMaxActiveProcessors = Math.min( - numProcessingThreadsInJvm, - Math.min( - maxNumFramesForSuperSorter / MIN_SUPER_SORTER_FRAMES, - MAX_SUPER_SORTER_PROCESSORS - ) - ); + // Compute memory breakdown for super-sorting bundles. + final int partitionStatsMemory = + StageDefinition.mustGatherResultKeyStatistics(shuffleSpec) ? computePartitionStatsMemory(bundleFreeMemory) : 0; + final long superSorterMemory = bundleFreeMemory - partitionStatsMemory; + final int maxOutputPartitions = computeMaxOutputPartitions(shuffleSpec); - final int isSmallWorker = usableMemoryInJvm < SMALL_WORKER_CAPACITY_THRESHOLD_BYTES ? 1 : 0; - // Apportion max frames to all processors equally, then subtract one to account for an output frame and one to account - // for the durable storage's output frame in the supersorter. The extra frame is required in case of durable storage - // since composing output channel factories keep a frame open while writing to them. 
- // We only account for this extra frame in the workers where the heap size is relatively small to be more - // conservative with the memory estimations. In workers with heap size larger than the frame size, we can get away - // without accounting for this extra frame, and instead better parallelize the supersorter's operations. - final int superSorterMaxChannelsPerProcessor = maxNumFramesForSuperSorter / superSorterMaxActiveProcessors - - 1 - - isSmallWorker; - if (superSorterMaxActiveProcessors <= 0) { + int superSorterConcurrentProcessors; + int superSorterMaxChannelsPerMerger = -1; + + if (maxOutputPartitions == 0) { + superSorterConcurrentProcessors = numProcessingThreads; + } else { + superSorterConcurrentProcessors = Math.min(maxOutputPartitions, numProcessingThreads); + } + + for (; superSorterConcurrentProcessors > 0; superSorterConcurrentProcessors--) { + final long memoryPerProcessor = superSorterMemory / superSorterConcurrentProcessors; + + // Each processor has at least 2 frames for inputs, plus numFramesPerOutputChannel for outputs. + // Compute whether we can support this level of parallelism, given these constraints. + final int minMemoryForInputsPerProcessor = 2 * frameSize; + final int memoryForOutputsPerProcessor = numFramesPerOutputChannel * frameSize; + + if (memoryPerProcessor >= minMemoryForInputsPerProcessor + memoryForOutputsPerProcessor) { + final long memoryForInputsPerProcessor = memoryPerProcessor - memoryForOutputsPerProcessor; + superSorterMaxChannelsPerMerger = Ints.checkedCast(memoryForInputsPerProcessor / frameSize); + break; + } + } + + if (superSorterConcurrentProcessors == 0) { + // Couldn't support any level of concurrency. Not expected, since we should have accounted for at least a + // minimally-sized SuperSorter by way of the calculation in "computeMinimumBundleFreeMemory". Return a + // NotEnoughMemoryFault with no suggestedServerMemory, since at this point, we aren't sure what will work. throw new MSQException( new NotEnoughMemoryFault( - calculateSuggestedMinMemoryFromUsableMemory( - estimateUsableMemory( - numWorkersInJvm, - numProcessingThreadsInJvm, - PROCESSING_MINIMUM_BYTES + BUFFER_BYTES_FOR_ESTIMATION + bundleMemoryForInputChannels, - maxConcurrentStages - ), totalLookupFootprint), - maxMemoryInJvm, - usableMemoryInJvm, - numWorkersInJvm, - numProcessingThreadsInJvm, + 0, + memoryIntrospector.totalMemoryInJvm(), + memoryIntrospector.memoryPerTask(), + memoryIntrospector.numTasksInJvm(), + memoryIntrospector.numProcessingThreads(), + computeNumInputWorkers(inputSlices), maxConcurrentStages ) ); } return new WorkerMemoryParameters( - bundleMemoryForProcessing, - superSorterMaxActiveProcessors, - superSorterMaxChannelsPerProcessor, - - // 100% of worker memory is devoted to partition statistics - Ints.checkedCast(workerMemory / maxConcurrentStages) + bundleFreeMemory, + frameSize, + superSorterConcurrentProcessors, + superSorterMaxChannelsPerMerger, + Math.min(Integer.MAX_VALUE, partitionStatsMemory / numProcessingThreads), + hasBroadcastInputs ? 
computeBroadcastBufferMemory(bundleMemory) : 0 ); } - public int getSuperSorterMaxActiveProcessors() + public int getSuperSorterConcurrentProcessors() { - return superSorterMaxActiveProcessors; + return superSorterConcurrentProcessors; } - public int getSuperSorterMaxChannelsPerProcessor() + public int getSuperSorterMaxChannelsPerMerger() { - return superSorterMaxChannelsPerProcessor; + return superSorterMaxChannelsPerMerger; } public long getAppenderatorMaxBytesInMemory() @@ -376,24 +308,27 @@ public int getAppenderatorMaxColumnsToMerge() return Ints.checkedCast(Math.max(2, getAppenderatorMemory() / 2 / APPENDERATOR_MERGE_ROUGH_MEMORY_PER_COLUMN)); } - public int getStandardFrameSize() + public int getFrameSize() { - return STANDARD_FRAME_SIZE; + return frameSize; } - public int getLargeFrameSize() - { - return LARGE_FRAME_SIZE; - } - - public long getBroadcastJoinMemory() + /** + * Memory available for buffering broadcast data. Used to restrict the amount of memory used by + * {@link BroadcastJoinSegmentMapFnProcessor}. + */ + public long getBroadcastBufferMemory() { - return (long) (processorBundleMemory * BROADCAST_JOIN_MEMORY_FRACTION); + return broadcastBufferMemory; } + /** + * Fraction of each processor's memory that can be used by + * {@link org.apache.druid.msq.querykit.common.SortMergeJoinFrameProcessor} to buffer frames in its trackers. + */ public long getSortMergeJoinMemory() { - return (long) (processorBundleMemory * SORT_MERGE_JOIN_MEMORY_FRACTION); + return SORT_MERGE_JOIN_MEMORY_PER_PROCESSOR; } public int getPartitionStatisticsMaxRetainedBytes() @@ -406,7 +341,7 @@ public int getPartitionStatisticsMaxRetainedBytes() */ private long getAppenderatorMemory() { - return (long) (processorBundleMemory * APPENDERATOR_MEMORY_FRACTION); + return (long) (bundleFreeMemory * APPENDERATOR_BUNDLE_FREE_MEMORY_FRACTION); } @Override @@ -419,20 +354,24 @@ public boolean equals(Object o) return false; } WorkerMemoryParameters that = (WorkerMemoryParameters) o; - return processorBundleMemory == that.processorBundleMemory - && superSorterMaxActiveProcessors == that.superSorterMaxActiveProcessors - && superSorterMaxChannelsPerProcessor == that.superSorterMaxChannelsPerProcessor - && partitionStatisticsMaxRetainedBytes == that.partitionStatisticsMaxRetainedBytes; + return bundleFreeMemory == that.bundleFreeMemory + && frameSize == that.frameSize + && superSorterConcurrentProcessors == that.superSorterConcurrentProcessors + && superSorterMaxChannelsPerMerger == that.superSorterMaxChannelsPerMerger + && partitionStatisticsMaxRetainedBytes == that.partitionStatisticsMaxRetainedBytes + && broadcastBufferMemory == that.broadcastBufferMemory; } @Override public int hashCode() { return Objects.hash( - processorBundleMemory, - superSorterMaxActiveProcessors, - superSorterMaxChannelsPerProcessor, - partitionStatisticsMaxRetainedBytes + bundleFreeMemory, + frameSize, + superSorterConcurrentProcessors, + superSorterMaxChannelsPerMerger, + partitionStatisticsMaxRetainedBytes, + broadcastBufferMemory ); } @@ -440,206 +379,205 @@ public int hashCode() public String toString() { return "WorkerMemoryParameters{" + - "processorBundleMemory=" + processorBundleMemory + - ", superSorterMaxActiveProcessors=" + superSorterMaxActiveProcessors + - ", superSorterMaxChannelsPerProcessor=" + superSorterMaxChannelsPerProcessor + + "bundleFreeMemory=" + bundleFreeMemory + + ", frameSize=" + frameSize + + ", superSorterConcurrentProcessors=" + superSorterConcurrentProcessors + + ", superSorterMaxChannelsPerMerger=" + 
superSorterMaxChannelsPerMerger + ", partitionStatisticsMaxRetainedBytes=" + partitionStatisticsMaxRetainedBytes + + ", broadcastBufferMemory=" + broadcastBufferMemory + '}'; } /** - * Computes the highest value of numInputWorkers, for the given parameters, that can be passed to - * {@link #createInstance} without resulting in a {@link TooManyWorkersFault}. - * - * Returns 0 if no number of workers would be OK. + * Compute the memory allocated to each {@link WorkOrder} within a {@link Worker}. */ - static int computeMaxWorkers( - final long usableMemoryInJvm, - final int numWorkersInJvm, - final int numProcessingThreadsInJvm, - final int maxConcurrentStages, - final int numHashOutputPartitions - ) + static long computeBundleMemory(final long memoryPerWorker, final int maxConcurrentStages) { - final long bundleMemory = memoryPerBundle(usableMemoryInJvm, numWorkersInJvm, numProcessingThreadsInJvm); - - // Compute number of workers that gives us PROCESSING_MINIMUM_BYTES of memory per bundle per concurrent stage, while - // accounting for memoryNeededForInputChannels + memoryNeededForHashPartitioning. - final int isHashing = numHashOutputPartitions > 0 ? 1 : 0; - final long bundleMemoryPerStage = bundleMemory / maxConcurrentStages; - final long maxWorkers = - (bundleMemoryPerStage - PROCESSING_MINIMUM_BYTES) / ((long) STANDARD_FRAME_SIZE * (1 + isHashing)) - 1; - return Math.max(0, Ints.checkedCast(maxWorkers)); + return memoryPerWorker / maxConcurrentStages; } /** - * Computes the amount of memory needed to read a single partition from a given number of workers. + * Compute the memory allocated to {@link KeyStatisticsCollectionProcessor} within each bundle. */ - static long memoryNeededForInputChannels(final int numInputWorkers) + static int computePartitionStatsMemory(final long bundleFreeMemory) { - // Workers that read sorted inputs must open all channels at once to do an N-way merge. Calculate memory needs. - // Requirement: one input frame per worker, one buffered output frame. - return (long) STANDARD_FRAME_SIZE * (numInputWorkers + 1); + return Ints.checkedCast( + Math.max( + (long) Math.min( + bundleFreeMemory * PARTITION_STATS_MAX_BUNDLE_FREE_MEMORY_FRACTION, + PARTITION_STATS_MAX_MEMORY_PER_BUNDLE + ), + PARTITION_STATS_MIN_MEMORY_PER_BUNDLE + ) + ); } /** - * Maximum number of workers that may exist in the current JVM. + * Compute the memory limit passed to {@link BroadcastJoinSegmentMapFnProcessor} within each worker bundle. This + * is somewhat lower than {@link #computeBroadcastBufferMemoryIncludingOverhead}, because we expect some overhead on + * top of this limit due to indexing structures. This overhead isn't accounted for by the processor + * {@link BroadcastJoinSegmentMapFnProcessor} itself. */ - private static int computeNumWorkersInJvm(final Injector injector) + static long computeBroadcastBufferMemory(final long bundleMemory) { - final AppenderatorsManager appenderatorsManager = injector.getInstance(AppenderatorsManager.class); + return (long) (bundleMemory * BROADCAST_BUFFER_TOTAL_MEMORY_FRACTION); + } - if (appenderatorsManager instanceof UnifiedIndexerAppenderatorsManager) { - // CliIndexer - return injector.getInstance(WorkerConfig.class).getCapacity(); - } else { - // CliPeon - return 1; - } + /** + * Memory allocated to {@link BroadcastJoinSegmentMapFnProcessor} within each worker bundle, including + * expected overhead. 
+ */ + static long computeBroadcastBufferMemoryIncludingOverhead(final long bundleMemory) + { + return (long) (computeBroadcastBufferMemory(bundleMemory) * BROADCAST_BUFFER_OVERHEAD_RATIO); } /** - * Maximum number of concurrent processors that exist in the current JVM. + * Memory allocated to each processor within a bundle, including fixed overheads and buffered input and output frames. + * + * @param maxSimultaneousInputChannelsPerProcessor figure from {@link #computeMaxSimultaneousInputChannelsPerProcessor} + * @param frameSize frame size */ - private static int computeNumProcessorsInJvm(final Injector injector) + static long computeProcessorMemory(final int maxSimultaneousInputChannelsPerProcessor, final int frameSize) { - return injector.getInstance(Bouncer.class).getMaxCount(); + return EXTRA_MEMORY_PER_PROCESSOR + + computeProcessorMemoryForInputChannels(maxSimultaneousInputChannelsPerProcessor, frameSize) + + frameSize /* output frame */; } /** - * Compute the memory allocated to each worker. Includes anything that exists outside of processing bundles. + * Memory allocated to each processor for reading its inputs. * - * Today, we only look at one thing: the amount of memory taken up by - * {@link org.apache.druid.msq.statistics.ClusterByStatisticsCollector}. This is the single largest source of memory - * usage outside processing bundles. + * @param maxSimultaneousInputChannelsPerProcessor figure from {@link #computeMaxSimultaneousInputChannelsPerProcessor} + * @param frameSize frame size */ - private static long memoryPerWorker( - final long usableMemoryInJvm, - final int numWorkersInJvm + static long computeProcessorMemoryForInputChannels( + final int maxSimultaneousInputChannelsPerProcessor, + final int frameSize ) { - final long memoryForWorkers = (long) Math.min( - usableMemoryInJvm * PARTITION_STATS_MEMORY_MAX_FRACTION, - numWorkersInJvm * PARTITION_STATS_MEMORY_MAX_BYTES - ); - - return memoryForWorkers / numWorkersInJvm; + return (long) maxSimultaneousInputChannelsPerProcessor * frameSize; } /** - * Compute the memory allocated to each processing bundle. Any computation changes done to this method should also be - * done in its corresponding method {@link WorkerMemoryParameters#estimateUsableMemory} + * Number of input partitions across all {@link StageInputSlice}. */ - private static long memoryPerBundle( - final long usableMemoryInJvm, - final int numWorkersInJvm, - final int numProcessingThreadsInJvm - ) + static int computeNumInputPartitions(final List inputSlices) { - // One bundle per worker + one per processor. The worker bundles are used for sorting (SuperSorter) and the - // processing bundles are used for reading input and doing per-partition processing. - final int bundleCount = numWorkersInJvm + numProcessingThreadsInJvm; + int retVal = 0; - // Need to subtract memoryForWorkers off the top of usableMemoryInJvm, since this is reserved for - // statistics collection. - final long memoryForWorkers = numWorkersInJvm * memoryPerWorker(usableMemoryInJvm, numWorkersInJvm); - final long memoryForBundles = usableMemoryInJvm - memoryForWorkers; + for (final StageInputSlice slice : InputSlices.allStageSlices(inputSlices)) { + retVal += Iterables.size(slice.getPartitions()); + } - // Divide up the usable memory per bundle. - return memoryForBundles / bundleCount; + return retVal; } /** - * Used for estimating the usable memory for better exception messages when {@link NotEnoughMemoryFault} is thrown. 
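
A hedged walk-through of the carve-up methods above, using invented sizes (a 1,400 MB per-task allotment with maxConcurrentStages = 1; only the 0.2 fraction and 1.5 overhead ratio come from this patch):

    // Illustrative bundle carve-up for a stage with broadcast inputs.
    public class BundleCarveUpSketch
    {
      public static void main(String[] args)
      {
        final long memoryPerTask = 1_400_000_000L; // hypothetical per-task allotment
        final int maxConcurrentStages = 1;

        // computeBundleMemory: one bundle per concurrently-running stage.
        final long bundleMemory = memoryPerTask / maxConcurrentStages;

        // computeBroadcastBufferMemory: 20% of the bundle is the limit handed to
        // the broadcast join processor...
        final long broadcastLimit = (long) (bundleMemory * 0.2); // 280,000,000

        // ...while computeBroadcastBufferMemoryIncludingOverhead reserves 1.5x that,
        // covering indexing structures built on top of the buffered frames.
        final long broadcastReserved = (long) (broadcastLimit * 1.5); // 420,000,000

        System.out.printf("bundle=%,d limit=%,d reserved=%,d%n",
            bundleMemory, broadcastLimit, broadcastReserved);
      }
    }
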
+ * Maximum number of input channels that a processor may have open at once, given the provided worker assignment. + * + * To compute this, we take the maximum number of workers associated with some partition for each slice. Then we sum + * those maxes up for all broadcast slices, and for all non-broadcast slices, and take the max between those two. + * The idea is that processors first read broadcast data, then read non-broadcast data, and during both phases + * they should have at most one partition open from each slice at once. + * + * @param inputSlices object from {@link WorkOrder#getInputs()} + * @param broadcastInputNumbers object from {@link StageDefinition#getBroadcastInputNumbers()} */ - private static long estimateUsableMemory( - final int numWorkersInJvm, - final int numProcessingThreadsInJvm, - final long estimatedEachBundleMemory, - final int maxConcurrentStages + static int computeMaxSimultaneousInputChannelsPerProcessor( + final List inputSlices, + final IntSet broadcastInputNumbers ) { - final int bundleCount = numWorkersInJvm + numProcessingThreadsInJvm; - return estimateUsableMemory(numWorkersInJvm, estimatedEachBundleMemory * bundleCount, maxConcurrentStages); + long totalNonBroadcastInputChannels = 0; + long totalBroadcastInputChannels = 0; + + final List allStageSlices = InputSlices.allStageSlices(inputSlices); + + for (int inputNumber = 0; inputNumber < allStageSlices.size(); inputNumber++) { + final StageInputSlice slice = allStageSlices.get(inputNumber); + + int maxWorkers = 0; + for (final ReadablePartition partition : slice.getPartitions()) { + maxWorkers = Math.max(maxWorkers, partition.getWorkerNumbers().size()); + } + + if (broadcastInputNumbers.contains(inputNumber)) { + totalBroadcastInputChannels += maxWorkers; + } else { + totalNonBroadcastInputChannels += maxWorkers; + } + } + + return Ints.checkedCast(Math.max(totalBroadcastInputChannels, totalNonBroadcastInputChannels)); } + /** - * Add overheads to the estimated bundle memoery for all the workers. Checkout {@link WorkerMemoryParameters#memoryPerWorker(long, int)} - * for the overhead calculation outside the processing bundles. + * Distinct number of input workers. */ - private static long estimateUsableMemory( - final int numWorkersInJvm, - final long estimatedTotalBundleMemory, - final int maxConcurrentStages - ) + static int computeNumInputWorkers(final List inputSlices) { - // Currently, we only add the partition stats overhead since it will be the single largest overhead per worker. - final long estimateStatOverHeadPerWorker = PARTITION_STATS_MEMORY_MAX_BYTES; - final long requiredUsableMemory = estimatedTotalBundleMemory + (estimateStatOverHeadPerWorker * numWorkersInJvm); - return requiredUsableMemory * maxConcurrentStages; - } + final IntSet workerNumbers = new IntOpenHashSet(); - private static long memoryNeededForHashPartitioning(final int numOutputPartitions) - { - // One standard frame for each processor output. - // May be zero, since numOutputPartitions is zero if not using hash partitioning. - return (long) STANDARD_FRAME_SIZE * numOutputPartitions; + for (final StageInputSlice slice : InputSlices.allStageSlices(inputSlices)) { + for (final ReadablePartition partition : slice.getPartitions()) { + workerNumbers.addAll(partition.getWorkerNumbers()); + } + } + + return workerNumbers.size(); } /** - * Amount of heap memory available for our usage. 
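
A worked example of the rule above, with an invented worker assignment: two non-broadcast slices whose widest partitions span 3 and 2 workers, plus one broadcast slice spanning 4 workers. Broadcast and non-broadcast phases are sized separately, then the larger wins:

    // Illustrates computeMaxSimultaneousInputChannelsPerProcessor with made-up counts.
    public class MaxInputChannelsSketch
    {
      public static void main(String[] args)
      {
        final long nonBroadcastChannels = 3 + 2; // sum of per-slice partition maxima
        final long broadcastChannels = 4;        // sum over broadcast slices
        final long maxSimultaneous = Math.max(broadcastChannels, nonBroadcastChannels);
        System.out.println(maxSimultaneous); // 5
      }
    }
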
Any computation changes done to this method should also be done in
-   * its corresponding method {@link WorkerMemoryParameters#calculateSuggestedMinMemoryFromUsableMemory}
+   * Maximum number of output channels for a shuffle spec, or 0 if not knowable in advance.
    */
-  private static long computeUsableMemoryInJvm(final long maxMemory, final long totalLookupFootprint)
+  static int computeMaxOutputPartitions(@Nullable final ShuffleSpec shuffleSpec)
   {
-    // Always report at least one byte, to simplify the math in createInstance.
-    return Math.max(
-        1,
-        (long) ((maxMemory - totalLookupFootprint) * USABLE_MEMORY_FRACTION)
-    );
+    if (shuffleSpec == null) {
+      return 0;
+    } else {
+      switch (shuffleSpec.kind()) {
+        case HASH:
+        case HASH_LOCAL_SORT:
+        case MIX:
+          return shuffleSpec.partitionCount();
+
+        case GLOBAL_SORT:
+          if (shuffleSpec instanceof GlobalSortMaxCountShuffleSpec) {
+            return ((GlobalSortMaxCountShuffleSpec) shuffleSpec).getMaxPartitions();
+          }
+          // Fall through
+
+        default:
+          return 0;
+      }
+    }
   }
 
   /**
-   * Estimate amount of heap memory for the given workload to use in case usable memory is provided. This method is used
-   * for better exception messages when {@link NotEnoughMemoryFault} is thrown.
+   * Number of frames to buffer per output channel: one, plus an extra frame when the output channel mode is durable.
    */
-  private static long calculateSuggestedMinMemoryFromUsableMemory(long usuableMemeory, final long totalLookupFootprint)
+  static int computeFramesPerOutputChannel(final OutputChannelMode outputChannelMode)
   {
-    return (long) ((usuableMemeory / USABLE_MEMORY_FRACTION) + totalLookupFootprint);
+    // If durable storage is enabled, we need one extra frame per output channel.
+    return outputChannelMode.isDurable() ? 2 : 1;
   }
 
   /**
-   * Total estimated lookup footprint. Obtained by calling {@link LookupExtractor#estimateHeapFootprint()} on
-   * all available lookups.
+   * Minimum number of bytes for a bundle's free memory allotment. This must be enough to reasonably produce and
+   * persist an {@link IncrementalIndex}, or to run a {@link SuperSorter} with 1 thread and 2 frames.
    */
-  private static long computeTotalLookupFootprint(final Injector injector)
+  static long computeMinimumBundleFreeMemory(final int frameSize, final int numFramesPerOutputChannel)
   {
-    // Subtract memory taken up by lookups. Correctness of this operation depends on lookups being loaded *before*
-    // we create this instance. Luckily, this is the typical mode of operation, since by default
-    // druid.lookup.enableLookupSyncOnStartup = true.
-    final LookupExtractorFactoryContainerProvider lookupManager =
-        injector.getInstance(LookupExtractorFactoryContainerProvider.class);
-
-    int lookupCount = 0;
-    long lookupFootprint = 0;
-
-    for (final String lookupName : lookupManager.getAllLookupNames()) {
-      final LookupExtractorFactoryContainer container = lookupManager.get(lookupName).orElse(null);
-
-      if (container != null) {
-        try {
-          final LookupExtractor extractor = container.getLookupExtractorFactory().get();
-          lookupFootprint += extractor.estimateHeapFootprint();
-          lookupCount++;
-        }
-        catch (Exception e) {
-          log.noStackTrace().warn(e, "Failed to load lookup [%s] for size estimation. Skipping.", lookupName);
-        }
-      }
-    }
+    // Some for partition statistics.
+    long minMemory = PARTITION_STATS_MIN_MEMORY_PER_BUNDLE;
 
-    log.debug("Lookup footprint: %d lookups with %,d total bytes.", lookupCount, lookupFootprint);
+    // Some for a minimally-sized super-sorter.
+ minMemory += (long) (2 + numFramesPerOutputChannel) * frameSize; - return lookupFootprint; + // That's enough. Don't consider the possibility that the bundle may be used for producing IncrementalIndex, + // because PARTITION_STATS_MIN_MEMORY_PER_BUNDLE more or less covers that. + return minMemory; } } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/IndexerMemoryManagementModule.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/IndexerMemoryManagementModule.java index 92f16a631d9f..61f03e40ab6f 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/IndexerMemoryManagementModule.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/IndexerMemoryManagementModule.java @@ -22,13 +22,15 @@ import com.google.inject.Binder; import com.google.inject.Provides; import org.apache.druid.discovery.NodeRole; -import org.apache.druid.frame.processor.Bouncer; import org.apache.druid.guice.LazySingleton; +import org.apache.druid.guice.ManageLifecycle; import org.apache.druid.guice.annotations.LoadScope; import org.apache.druid.indexing.worker.config.WorkerConfig; import org.apache.druid.initialization.DruidModule; import org.apache.druid.msq.exec.MemoryIntrospector; import org.apache.druid.msq.exec.MemoryIntrospectorImpl; +import org.apache.druid.msq.exec.ProcessingBuffersProvider; +import org.apache.druid.msq.indexing.IndexerProcessingBuffersProvider; import org.apache.druid.query.DruidProcessingConfig; import org.apache.druid.query.lookup.LookupExtractorFactoryContainerProvider; import org.apache.druid.utils.JvmUtils; @@ -42,37 +44,51 @@ public class IndexerMemoryManagementModule implements DruidModule { /** - * Allocate up to 75% of memory for MSQ-related stuff (if all running tasks are MSQ tasks). + * Allocate up to 60% of memory for the MSQ framework (if all running tasks are MSQ tasks). This does not include the + * memory allocated to {@link #PROCESSING_MEMORY_FRACTION}. */ - private static final double USABLE_MEMORY_FRACTION = 0.75; + private static final double MSQ_MEMORY_FRACTION = 0.60; + + /** + * Allocate up to 15% of memory for processing buffers for MSQ tasks. + */ + private static final double PROCESSING_MEMORY_FRACTION = 0.15; @Override public void configure(Binder binder) { - // Nothing to do. 
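To put the two fractions in perspective: on an Indexer with an 8 GB heap, roughly 4.8 GB (60%) would be available to the MSQ framework and roughly 1.2 GB (15%) to processing buffers, leaving about a quarter of the heap unreserved. These figures are illustrative arithmetic on the constants above, not values taken from this patch.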
+ TaskMemoryManagementConfig.bind(binder); } @Provides - @LazySingleton - public Bouncer makeProcessorBouncer(final DruidProcessingConfig processingConfig) - { - return new Bouncer(processingConfig.getNumThreads()); - } - - @Provides - @LazySingleton + @ManageLifecycle public MemoryIntrospector createMemoryIntrospector( final LookupExtractorFactoryContainerProvider lookupProvider, + final TaskMemoryManagementConfig taskMemoryManagementConfig, final DruidProcessingConfig processingConfig, final WorkerConfig workerConfig ) { return new MemoryIntrospectorImpl( - lookupProvider, JvmUtils.getRuntimeInfo().getMaxHeapSizeBytes(), - USABLE_MEMORY_FRACTION, + MSQ_MEMORY_FRACTION, + workerConfig.getCapacity(), + PeonMemoryManagementModule.getNumThreads(taskMemoryManagementConfig, processingConfig), + lookupProvider + ); + } + + @Provides + @LazySingleton + public ProcessingBuffersProvider createProcessingBuffersProvider( + final MemoryIntrospector memoryIntrospector, + final WorkerConfig workerConfig + ) + { + return new IndexerProcessingBuffersProvider( + (long) (JvmUtils.getRuntimeInfo().getMaxHeapSizeBytes() * PROCESSING_MEMORY_FRACTION), workerConfig.getCapacity(), - processingConfig.getNumThreads() + memoryIntrospector.numProcessingThreads() ); } } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/PeonMemoryManagementModule.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/PeonMemoryManagementModule.java index 9e814c082781..39265434584c 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/PeonMemoryManagementModule.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/PeonMemoryManagementModule.java @@ -21,22 +21,30 @@ import com.google.inject.Binder; import com.google.inject.Provides; +import org.apache.druid.collections.NonBlockingPool; import org.apache.druid.discovery.NodeRole; -import org.apache.druid.frame.processor.Bouncer; import org.apache.druid.guice.LazySingleton; +import org.apache.druid.guice.annotations.Global; import org.apache.druid.guice.annotations.LoadScope; import org.apache.druid.initialization.DruidModule; +import org.apache.druid.java.util.common.IAE; import org.apache.druid.msq.exec.MemoryIntrospector; import org.apache.druid.msq.exec.MemoryIntrospectorImpl; +import org.apache.druid.msq.exec.ProcessingBuffersProvider; +import org.apache.druid.msq.indexing.PeonProcessingBuffersProvider; +import org.apache.druid.query.DruidProcessingConfig; import org.apache.druid.query.lookup.LookupExtractorFactoryContainerProvider; import org.apache.druid.utils.JvmUtils; +import java.nio.ByteBuffer; + /** * Provides {@link MemoryIntrospector} for single-task-per-JVM model. * * @see IndexerMemoryManagementModule for multi-task-per-JVM model used on {@link org.apache.druid.cli.CliIndexer} */ @LoadScope(roles = NodeRole.PEON_JSON_NAME) + public class PeonMemoryManagementModule implements DruidModule { /** @@ -45,41 +53,61 @@ public class PeonMemoryManagementModule implements DruidModule private static final int NUM_WORKERS_IN_JVM = 1; /** - * Peons may have more than one processing thread, but we currently only use one of them. - */ - private static final int NUM_PROCESSING_THREADS = 1; - - /** - * Allocate 75% of memory for MSQ-related stuff. + * Allocate 75% of memory for the MSQ framework. */ private static final double USABLE_MEMORY_FRACTION = 0.75; @Override public void configure(Binder binder) { - // Nothing to do. 
- } - - @Provides - @LazySingleton - public Bouncer makeProcessorBouncer() - { - return new Bouncer(NUM_PROCESSING_THREADS); + TaskMemoryManagementConfig.bind(binder); } @Provides @LazySingleton public MemoryIntrospector createMemoryIntrospector( final LookupExtractorFactoryContainerProvider lookupProvider, - final Bouncer bouncer + final DruidProcessingConfig processingConfig, + final TaskMemoryManagementConfig taskMemoryManagementConfig ) { return new MemoryIntrospectorImpl( - lookupProvider, JvmUtils.getRuntimeInfo().getMaxHeapSizeBytes(), USABLE_MEMORY_FRACTION, NUM_WORKERS_IN_JVM, - bouncer.getMaxCount() + getNumThreads(taskMemoryManagementConfig, processingConfig), + lookupProvider ); } + + @Provides + @LazySingleton + public ProcessingBuffersProvider createProcessingBuffersProvider( + @Global final NonBlockingPool processingPool, + final MemoryIntrospector memoryIntrospector + ) + { + return new PeonProcessingBuffersProvider( + processingPool, + memoryIntrospector.numProcessingThreads() + ); + } + + public static int getNumThreads( + final TaskMemoryManagementConfig taskMemoryManagementConfig, + final DruidProcessingConfig processingConfig + ) + { + if (taskMemoryManagementConfig.getMaxThreads() == TaskMemoryManagementConfig.UNLIMITED) { + return processingConfig.getNumThreads(); + } else if (taskMemoryManagementConfig.getMaxThreads() > 0) { + return Math.min(taskMemoryManagementConfig.getMaxThreads(), processingConfig.getNumThreads()); + } else { + throw new IAE( + "Invalid value of %s.maxThreads[%d]", + TaskMemoryManagementConfig.BASE_PROPERTY, + taskMemoryManagementConfig.getMaxThreads() + ); + } + } } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/TaskMemoryManagementConfig.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/TaskMemoryManagementConfig.java new file mode 100644 index 000000000000..d8dc278aa167 --- /dev/null +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/TaskMemoryManagementConfig.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.msq.guice; + +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.inject.Binder; +import org.apache.druid.guice.JsonConfigProvider; +import org.apache.druid.java.util.common.StringUtils; + +/** + * Server configuration for {@link PeonMemoryManagementModule} and {@link IndexerMemoryManagementModule}. 
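A configuration example for the clamping in getNumThreads() above, assuming MSQIndexingModule.BASE_MSQ_KEY resolves to druid.msq so the prefix bound below is druid.msq.task.memory: with druid.processing.numThreads=8, setting maxThreads=2 yields min(2, 8) = 2 threads per task; maxThreads=-1 (UNLIMITED) yields all 8; and the default of 1 in the new config class keeps peon behavior close to the single-thread model this patch removes.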
+ */ +public class TaskMemoryManagementConfig +{ + public static final String BASE_PROPERTY = StringUtils.format("%s.task.memory", MSQIndexingModule.BASE_MSQ_KEY); + public static final int UNLIMITED = -1; + + @JsonProperty("maxThreads") + private int maxThreads = 1; + + public static void bind(final Binder binder) + { + JsonConfigProvider.bind( + binder, + BASE_PROPERTY, + TaskMemoryManagementConfig.class + ); + } + + public int getMaxThreads() + { + return maxThreads; + } +} diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerFrameContext.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerFrameContext.java index fb6e4a0079f1..e8f3739facb4 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerFrameContext.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerFrameContext.java @@ -20,9 +20,11 @@ package org.apache.druid.msq.indexing; import com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.druid.frame.processor.Bouncer; +import org.apache.druid.collections.ResourceHolder; +import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.msq.exec.DataServerQueryHandlerFactory; +import org.apache.druid.msq.exec.ProcessingBuffers; import org.apache.druid.msq.exec.WorkerMemoryParameters; import org.apache.druid.msq.exec.WorkerStorageParameters; import org.apache.druid.msq.kernel.FrameContext; @@ -35,6 +37,7 @@ import org.apache.druid.segment.incremental.RowIngestionMeters; import org.apache.druid.segment.loading.DataSegmentPusher; +import javax.annotation.Nullable; import java.io.File; public class IndexerFrameContext implements FrameContext @@ -43,6 +46,8 @@ public class IndexerFrameContext implements FrameContext private final IndexerWorkerContext context; private final IndexIO indexIO; private final DataSegmentProvider dataSegmentProvider; + @Nullable + private final ResourceHolder processingBuffers; private final WorkerMemoryParameters memoryParameters; private final WorkerStorageParameters storageParameters; private final DataServerQueryHandlerFactory dataServerQueryHandlerFactory; @@ -52,6 +57,7 @@ public IndexerFrameContext( IndexerWorkerContext context, IndexIO indexIO, DataSegmentProvider dataSegmentProvider, + @Nullable ResourceHolder processingBuffers, DataServerQueryHandlerFactory dataServerQueryHandlerFactory, WorkerMemoryParameters memoryParameters, WorkerStorageParameters storageParameters @@ -61,6 +67,7 @@ public IndexerFrameContext( this.context = context; this.indexIO = indexIO; this.dataSegmentProvider = dataSegmentProvider; + this.processingBuffers = processingBuffers; this.memoryParameters = memoryParameters; this.storageParameters = storageParameters; this.dataServerQueryHandlerFactory = dataServerQueryHandlerFactory; @@ -135,15 +142,19 @@ public IndexMergerV9 indexMerger() } @Override - public WorkerMemoryParameters memoryParameters() + public ProcessingBuffers processingBuffers() { - return memoryParameters; + if (processingBuffers != null) { + return processingBuffers.get(); + } else { + throw new ISE("No processing buffers"); + } } @Override - public Bouncer processorBouncer() + public WorkerMemoryParameters memoryParameters() { - return context.injector().getInstance(Bouncer.class); + return memoryParameters; } @Override @@ -155,6 +166,8 @@ public WorkerStorageParameters storageParameters() @Override public void close() { - 
// Nothing to close. + if (processingBuffers != null) { + processingBuffers.close(); + } } } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerProcessingBuffersProvider.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerProcessingBuffersProvider.java new file mode 100644 index 000000000000..dcf499c3f2f9 --- /dev/null +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerProcessingBuffersProvider.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.msq.indexing; + +import org.apache.druid.cli.CliIndexer; +import org.apache.druid.collections.ReferenceCountingResourceHolder; +import org.apache.druid.collections.ResourceHolder; +import org.apache.druid.java.util.common.ISE; +import org.apache.druid.msq.exec.ProcessingBuffersProvider; +import org.apache.druid.msq.exec.ProcessingBuffersSet; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; + +/** + * Production implementation of {@link ProcessingBuffersProvider} for tasks in {@link CliIndexer}. + */ +public class IndexerProcessingBuffersProvider implements ProcessingBuffersProvider +{ + private static final int MIN_BUFFER_SIZE = 1_000_000; + + private final long heapMemoryToUse; + private final int taskCapacity; + private final int numThreads; + + public IndexerProcessingBuffersProvider(final long heapMemoryToUse, final int taskCapacity, final int numThreads) + { + this.heapMemoryToUse = heapMemoryToUse; + this.taskCapacity = taskCapacity; + this.numThreads = numThreads; + } + + @Override + public ResourceHolder acquire(int poolSize) + { + if (poolSize == 0) { + return new ReferenceCountingResourceHolder<>(ProcessingBuffersSet.EMPTY, () -> {}); + } + + final long heapMemoryPerWorker = heapMemoryToUse / taskCapacity; + final int numThreadsPerWorker = (int) Math.min( + numThreads, + heapMemoryPerWorker / MIN_BUFFER_SIZE + ); + + if (numThreadsPerWorker < 1) { + // Should not happen unless the CliIndexer has an unreasonable configuration. + // CliIndexer typically has well in excess of 1 MB (min buffer size) of heap per task. + throw new ISE("Cannot acquire buffers, available heap memory is not enough for task capacity[%d]", taskCapacity); + } + + // bufferPools has one list per "poolSize"; each of those lists has "bufferCount" buffers of size "sliceSize". 
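The sizing logic above divides the processing-buffer budget evenly among task slots, then among threads. A runnable sketch with hypothetical inputs (illustrative only, not part of the patch):

class IndexerBufferSizingExample
{
  public static void main(String[] args)
  {
    final long heapMemoryToUse = 1_200_000_000; // hypothetical: 15% of an 8 GB heap
    final int taskCapacity = 4;
    final int numThreads = 8;
    final int minBufferSize = 1_000_000;        // mirrors MIN_BUFFER_SIZE above

    final long heapMemoryPerWorker = heapMemoryToUse / taskCapacity;                   // 300,000,000
    final int numThreadsPerWorker =
        (int) Math.min(numThreads, heapMemoryPerWorker / minBufferSize);               // min(8, 300) = 8
    final int sliceSize =
        (int) Math.min(Integer.MAX_VALUE, heapMemoryPerWorker / numThreadsPerWorker);  // 37,500,000

    // Each of the "poolSize" pools would get 8 on-heap buffers of 37.5 MB each.
    System.out.println(numThreadsPerWorker + " buffers of " + sliceSize + " bytes per worker");
  }
}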
+    final List<List<ByteBuffer>> bufferPools = new ArrayList<>(poolSize);
+    final int sliceSize = (int) Math.min(Integer.MAX_VALUE, heapMemoryPerWorker / numThreadsPerWorker);
+
+    for (int i = 0; i < poolSize; i++) {
+      final List<ByteBuffer> bufferPool = new ArrayList<>(numThreadsPerWorker);
+      bufferPools.add(bufferPool);
+
+      for (int j = 0; j < numThreadsPerWorker; j++) {
+        bufferPool.add(ByteBuffer.allocate(sliceSize));
+      }
+    }
+
+    // bufferPools is built, return it as a ProcessingBuffersSet.
+    return new ReferenceCountingResourceHolder<>(
+        ProcessingBuffersSet.fromCollection(bufferPools),
+        () -> {} // Garbage collection will reclaim the buffers, since they are on-heap
+    );
+  }
+}
diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerWorkerContext.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerWorkerContext.java
index c36b8e291db9..2a7d91c40af2 100644
--- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerWorkerContext.java
+++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerWorkerContext.java
@@ -24,6 +24,7 @@
 import com.google.errorprone.annotations.concurrent.GuardedBy;
 import com.google.inject.Injector;
 import com.google.inject.Key;
+import org.apache.druid.collections.ResourceHolder;
 import org.apache.druid.guice.annotations.EscalatedGlobal;
 import org.apache.druid.guice.annotations.Smile;
 import org.apache.druid.indexing.common.SegmentCacheManagerFactory;
@@ -34,7 +35,8 @@
 import org.apache.druid.msq.exec.ControllerClient;
 import org.apache.druid.msq.exec.DataServerQueryHandlerFactory;
 import org.apache.druid.msq.exec.MemoryIntrospector;
-import org.apache.druid.msq.exec.OutputChannelMode;
+import org.apache.druid.msq.exec.ProcessingBuffersProvider;
+import org.apache.druid.msq.exec.ProcessingBuffersSet;
 import org.apache.druid.msq.exec.TaskDataSegmentProvider;
 import org.apache.druid.msq.exec.Worker;
 import org.apache.druid.msq.exec.WorkerClient;
@@ -45,7 +47,7 @@
 import org.apache.druid.msq.indexing.client.IndexerWorkerClient;
 import org.apache.druid.msq.indexing.client.WorkerChatHandler;
 import org.apache.druid.msq.kernel.FrameContext;
-import org.apache.druid.msq.kernel.QueryDefinition;
+import org.apache.druid.msq.kernel.WorkOrder;
 import org.apache.druid.msq.util.MultiStageQueryContext;
 import org.apache.druid.query.QueryContext;
 import org.apache.druid.query.QueryToolChestWarehouse;
@@ -79,12 +81,16 @@ public class IndexerWorkerContext implements WorkerContext
   private final DataServerQueryHandlerFactory dataServerQueryHandlerFactory;
   private final ServiceClientFactory clientFactory;
   private final MemoryIntrospector memoryIntrospector;
+  private final ProcessingBuffersProvider processingBuffersProvider;
   private final int maxConcurrentStages;
   private final boolean includeAllCounters;
 
   @GuardedBy("this")
   private ServiceLocator controllerLocator;
 
+  // Written under synchronized(this) using double-checked locking.
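A note on the field declared just below: its volatile modifier is what makes the double-checked locking in frameContext(), later in this hunk, safe under the Java memory model. The unsynchronized first null check can then only observe null or a fully published holder; without volatile, a reader could see a partially published reference.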
+ private volatile ResourceHolder processingBuffersSet; + public IndexerWorkerContext( final MSQWorkerTask task, final TaskToolbox toolbox, @@ -94,6 +100,7 @@ public IndexerWorkerContext( final TaskDataSegmentProvider dataSegmentProvider, final ServiceClientFactory clientFactory, final MemoryIntrospector memoryIntrospector, + final ProcessingBuffersProvider processingBuffersProvider, final DataServerQueryHandlerFactory dataServerQueryHandlerFactory ) { @@ -105,6 +112,7 @@ public IndexerWorkerContext( this.dataSegmentProvider = dataSegmentProvider; this.clientFactory = clientFactory; this.memoryIntrospector = memoryIntrospector; + this.processingBuffersProvider = processingBuffersProvider; this.dataServerQueryHandlerFactory = dataServerQueryHandlerFactory; final QueryContext queryContext = QueryContext.of(task.getContext()); @@ -127,6 +135,7 @@ public static IndexerWorkerContext createProductionInstance( final MemoryIntrospector memoryIntrospector = injector.getInstance(MemoryIntrospector.class); final OverlordClient overlordClient = injector.getInstance(OverlordClient.class).withRetryPolicy(StandardRetryPolicy.unlimited()); + final ProcessingBuffersProvider processingBuffersProvider = injector.getInstance(ProcessingBuffersProvider.class); final ObjectMapper smileMapper = injector.getInstance(Key.get(ObjectMapper.class, Smile.class)); final QueryToolChestWarehouse warehouse = injector.getInstance(QueryToolChestWarehouse.class); @@ -139,6 +148,7 @@ public static IndexerWorkerContext createProductionInstance( new TaskDataSegmentProvider(toolbox.getCoordinatorClient(), segmentCacheManager, indexIO), serviceClientFactory, memoryIntrospector, + processingBuffersProvider, new DataServerQueryHandlerFactory( toolbox.getCoordinatorClient(), serviceClientFactory, @@ -191,6 +201,14 @@ public void registerWorker(Worker worker, Closer closer) } } }); + closer.register(() -> { + synchronized (this) { + if (processingBuffersSet != null) { + processingBuffersSet.close(); + processingBuffersSet = null; + } + } + }); // Register the periodic controller checker final ExecutorService periodicControllerCheckerExec = Execs.singleThreaded("controller-status-checker-%s"); @@ -281,23 +299,39 @@ public WorkerClient makeWorkerClient() } @Override - public FrameContext frameContext(QueryDefinition queryDef, int stageNumber, OutputChannelMode outputChannelMode) + public FrameContext frameContext(WorkOrder workOrder) { + if (processingBuffersSet == null) { + synchronized (this) { + if (processingBuffersSet == null) { + processingBuffersSet = processingBuffersProvider.acquire( + workOrder.getQueryDefinition(), + maxConcurrentStages() + ); + } + } + } + + final WorkerMemoryParameters memoryParameters = + WorkerMemoryParameters.createProductionInstance(workOrder, memoryIntrospector, maxConcurrentStages); + log.info("Memory parameters for stage[%s]: %s", workOrder.getStageDefinition().getId(), memoryParameters); + return new IndexerFrameContext( - queryDef.getStageDefinition(stageNumber).getId(), + workOrder.getStageDefinition().getId(), this, indexIO, dataSegmentProvider, + processingBuffersSet.get().acquireForStage(workOrder.getStageDefinition()), dataServerQueryHandlerFactory, - WorkerMemoryParameters.createProductionInstanceForWorker(injector, queryDef, stageNumber, maxConcurrentStages), - WorkerStorageParameters.createProductionInstance(injector, outputChannelMode) + memoryParameters, + WorkerStorageParameters.createProductionInstance(injector, workOrder.getOutputChannelMode()) ); } @Override public int threadCount() { 
- return memoryIntrospector.numProcessorsInJvm(); + return memoryIntrospector.numProcessingThreads(); } @Override diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/PeonProcessingBuffersProvider.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/PeonProcessingBuffersProvider.java new file mode 100644 index 000000000000..264c7af112fc --- /dev/null +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/PeonProcessingBuffersProvider.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.msq.indexing; + +import org.apache.druid.cli.CliPeon; +import org.apache.druid.collections.NonBlockingPool; +import org.apache.druid.collections.ReferenceCountingResourceHolder; +import org.apache.druid.collections.ResourceHolder; +import org.apache.druid.error.DruidException; +import org.apache.druid.java.util.common.io.Closer; +import org.apache.druid.msq.exec.ProcessingBuffersProvider; +import org.apache.druid.msq.exec.ProcessingBuffersSet; +import org.apache.druid.utils.CloseableUtils; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.atomic.AtomicBoolean; + +/** + * Production implementation of {@link ProcessingBuffersProvider} for tasks in {@link CliPeon}. + */ +public class PeonProcessingBuffersProvider implements ProcessingBuffersProvider +{ + private final AtomicBoolean acquired = new AtomicBoolean(false); + private final NonBlockingPool bufferPool; + private final int bufferCount; + + public PeonProcessingBuffersProvider( + final NonBlockingPool bufferPool, + final int bufferCount + ) + { + this.bufferPool = bufferPool; + this.bufferCount = bufferCount; + } + + @Override + public ResourceHolder acquire(int poolSize) + { + if (poolSize == 0) { + return new ReferenceCountingResourceHolder<>(ProcessingBuffersSet.EMPTY, () -> {}); + } + + if (!acquired.compareAndSet(false, true)) { + // We expect a single task in the JVM for CliPeon. + throw DruidException.defensive("Expected a single call to acquire() for[%s]", getClass().getName()); + } + + final Closer closer = Closer.create(); + + try { + // bufferPools has one list per "poolSize"; each of those lists has "bufferCount" buffers. + // Build these by acquiring "bufferCount" processing buffers and slicing each one up into "poolSize" slices. 
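The slicing strategy described in the comment above relies on ByteBuffer's duplicate/position/limit/slice mechanics. A standalone, illustrative sketch of the same technique (not taken from the patch):

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

class SliceExample
{
  // Split one large buffer into "poolSize" equal, independent slices, as the
  // loop below does with each processing buffer taken from the pool.
  static List<ByteBuffer> slice(final ByteBuffer original, final int poolSize)
  {
    final ByteBuffer buffer = original.duplicate(); // independent position/limit
    final int sliceSize = buffer.capacity() / poolSize;
    final List<ByteBuffer> slices = new ArrayList<>(poolSize);

    for (int j = 0; j < poolSize; j++) {
      buffer.position(sliceSize * j).limit(sliceSize * (j + 1));
      slices.add(buffer.slice()); // shares the backing memory; no copy is made
    }

    return slices;
  }

  public static void main(String[] args)
  {
    System.out.println(slice(ByteBuffer.allocate(12), 3)); // three 4-byte slices
  }
}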
+ final List> bufferPools = new ArrayList<>(); + for (int i = 0; i < poolSize; i++) { + bufferPools.add(new ArrayList<>(bufferCount)); + } + + for (int i = 0; i < bufferCount; i++) { + final ResourceHolder bufferHolder = closer.register(bufferPool.take()); + final ByteBuffer buffer = bufferHolder.get().duplicate(); + final int sliceSize = buffer.capacity() / poolSize; + + for (int j = 0; j < poolSize; j++) { + buffer.position(sliceSize * j).limit(sliceSize * (j + 1)); + bufferPools.get(j).add(buffer.slice()); + } + } + + // bufferPools is built, return it as a ProcessingBuffersSet. + return new ReferenceCountingResourceHolder<>( + ProcessingBuffersSet.fromCollection(bufferPools), + closer + ); + } + catch (Throwable e) { + throw CloseableUtils.closeAndWrapInCatch(e, closer); + } + } +} diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/NotEnoughMemoryFault.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/NotEnoughMemoryFault.java index 6f4b36da1eec..d4360a09d1a8 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/NotEnoughMemoryFault.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/NotEnoughMemoryFault.java @@ -23,6 +23,7 @@ import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonTypeName; +import org.apache.druid.java.util.common.StringUtils; import java.util.Objects; @@ -36,6 +37,7 @@ public class NotEnoughMemoryFault extends BaseMSQFault private final long usableMemory; private final int serverWorkers; private final int serverThreads; + private final int inputWorkers; private final int maxConcurrentStages; @JsonCreator @@ -45,22 +47,33 @@ public NotEnoughMemoryFault( @JsonProperty("usableMemory") final long usableMemory, @JsonProperty("serverWorkers") final int serverWorkers, @JsonProperty("serverThreads") final int serverThreads, + @JsonProperty("inputWorkers") final int inputWorkers, @JsonProperty("maxConcurrentStages") final int maxConcurrentStages ) { super( CODE, - "Not enough memory. Required at least %,d bytes. (total = %,d bytes; usable = %,d bytes; " - + "worker capacity = %,d; processing threads = %,d; concurrent stages = %,d). " + "Not enough memory. " + + (suggestedServerMemory > 0 + ? StringUtils.format("Minimum bytes[%,d] is needed for the current configuration. ", suggestedServerMemory) + : "") + + "(total bytes[%,d]; " + + "usable bytes[%,d]; " + + "input workers[%,d]; " + + "concurrent stages[%,d]; " + + "server worker capacity[%,d]; " + + "server processing threads[%,d]). " + "Increase JVM memory with the -Xmx option" + + (inputWorkers > 1 ? ", or reduce maxNumTasks for this query" : "") + + (maxConcurrentStages > 1 ? ", or reduce maxConcurrentStages for this query" : "") + (serverWorkers > 1 ? ", or reduce worker capacity on this server" : "") - + (maxConcurrentStages > 1 ? ", or reduce maxConcurrentStages for this query" : ""), - suggestedServerMemory, + + (serverThreads > 1 ? 
", or reduce processing threads on this server" : ""), serverMemory, usableMemory, + inputWorkers, + maxConcurrentStages, serverWorkers, - serverThreads, - maxConcurrentStages + serverThreads ); this.suggestedServerMemory = suggestedServerMemory; @@ -68,10 +81,12 @@ public NotEnoughMemoryFault( this.usableMemory = usableMemory; this.serverWorkers = serverWorkers; this.serverThreads = serverThreads; + this.inputWorkers = inputWorkers; this.maxConcurrentStages = maxConcurrentStages; } @JsonProperty + @JsonInclude(JsonInclude.Include.NON_DEFAULT) public long getSuggestedServerMemory() { return suggestedServerMemory; @@ -101,6 +116,13 @@ public int getServerThreads() return serverThreads; } + @JsonProperty + @JsonInclude(JsonInclude.Include.NON_DEFAULT) + public int getInputWorkers() + { + return inputWorkers; + } + @JsonProperty @JsonInclude(JsonInclude.Include.NON_DEFAULT) public int getMaxConcurrentStages() @@ -126,6 +148,7 @@ public boolean equals(Object o) && usableMemory == that.usableMemory && serverWorkers == that.serverWorkers && serverThreads == that.serverThreads + && inputWorkers == that.inputWorkers && maxConcurrentStages == that.maxConcurrentStages; } @@ -139,6 +162,7 @@ public int hashCode() usableMemory, serverWorkers, serverThreads, + inputWorkers, maxConcurrentStages ); } @@ -148,10 +172,11 @@ public String toString() { return "NotEnoughMemoryFault{" + "suggestedServerMemory=" + suggestedServerMemory + - " bytes, serverMemory=" + serverMemory + - " bytes, usableMemory=" + usableMemory + - " bytes, serverWorkers=" + serverWorkers + + ", serverMemory=" + serverMemory + + ", usableMemory=" + usableMemory + + ", serverWorkers=" + serverWorkers + ", serverThreads=" + serverThreads + + ", inputWorkers=" + inputWorkers + ", maxConcurrentStages=" + maxConcurrentStages + '}'; } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/TooManyRowsWithSameKeyFault.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/TooManyRowsWithSameKeyFault.java index 60d355579b6b..be284ae502d8 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/TooManyRowsWithSameKeyFault.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/TooManyRowsWithSameKeyFault.java @@ -44,9 +44,8 @@ public TooManyRowsWithSameKeyFault( { super( CODE, - "Too many rows with the same key[%s] during sort-merge join (bytes buffered[%,d], limit[%,d]). " - + "Try increasing heap memory available to workers, " - + "or adjusting your query to process fewer rows with this key.", + "Too many rows with the same key[%s] on both sides of sort-merge join (bytes buffered[%,d], limit[%,d]). 
" + + "Try adjusting your query such that there are fewer rows with this key on at least one side of the join.", key, numBytes, maxBytes diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/processor/SegmentGeneratorFrameProcessorFactory.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/processor/SegmentGeneratorFrameProcessorFactory.java index 16f9deff63d0..1796df89bf71 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/processor/SegmentGeneratorFrameProcessorFactory.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/processor/SegmentGeneratorFrameProcessorFactory.java @@ -28,6 +28,7 @@ import com.google.common.collect.Iterables; import org.apache.druid.frame.processor.OutputChannelFactory; import org.apache.druid.frame.processor.OutputChannels; +import org.apache.druid.frame.processor.manager.ConcurrencyLimitedProcessorManager; import org.apache.druid.frame.processor.manager.ProcessorManagers; import org.apache.druid.indexer.partitions.DynamicPartitionsSpec; import org.apache.druid.indexer.partitions.PartitionsSpec; @@ -210,21 +211,29 @@ public Pair apply(ReadableInput readableInput) ); return new ProcessorsAndChannels<>( - ProcessorManagers.of(workers) - .withAccumulation( - new HashSet<>(), - (acc, segment) -> { - if (segment != null) { - acc.add(segment); - } - - return acc; - } - ), + // Run at most one segmentGenerator per work order, since segment generation memory is carved out + // per-worker, not per-processor. See WorkerMemoryParameters for how the memory limits are calculated. + new ConcurrencyLimitedProcessorManager<>(ProcessorManagers.of(workers), 1) + .withAccumulation( + new HashSet<>(), + (acc, segment) -> { + if (segment != null) { + acc.add(segment); + } + + return acc; + } + ), OutputChannels.none() ); } + @Override + public boolean usesProcessingBuffers() + { + return false; + } + @Override public TypeReference> getResultTypeReference() { diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/input/InputSpecs.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/input/InputSpecs.java index 78241257710b..250f320118a8 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/input/InputSpecs.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/input/InputSpecs.java @@ -35,6 +35,9 @@ private InputSpecs() // No instantiation. } + /** + * Returns the set of input stages, from {@link StageInputSpec}, for a given list of {@link InputSpec}. 
+ */ public static IntSet getStageNumbers(final List specs) { final IntSet retVal = new IntRBTreeSet(); diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/FrameContext.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/FrameContext.java index da962a9d3931..1b80f72f86f5 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/FrameContext.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/FrameContext.java @@ -20,9 +20,9 @@ package org.apache.druid.msq.kernel; import com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.druid.frame.processor.Bouncer; import org.apache.druid.msq.exec.DataServerQueryHandlerFactory; -import org.apache.druid.msq.exec.OutputChannelMode; +import org.apache.druid.msq.exec.ProcessingBuffers; +import org.apache.druid.msq.exec.WorkerImpl; import org.apache.druid.msq.exec.WorkerMemoryParameters; import org.apache.druid.msq.exec.WorkerStorageParameters; import org.apache.druid.msq.querykit.DataSegmentProvider; @@ -40,7 +40,7 @@ * Provides services and objects for the functioning of the frame processors. Scoped to a specific stage of a * specific query, i.e., one {@link WorkOrder}. * - * Generated by {@link org.apache.druid.msq.exec.WorkerContext#frameContext(QueryDefinition, int, OutputChannelMode)}. + * Generated by {@link org.apache.druid.msq.exec.WorkerContext#frameContext(WorkOrder)}. */ public interface FrameContext extends Closeable { @@ -54,6 +54,9 @@ public interface FrameContext extends Closeable DataServerQueryHandlerFactory dataServerQueryHandlerFactory(); + /** + * Temporary directory, fully owned by this particular stage. + */ File tempDir(); ObjectMapper jsonMapper(); @@ -66,7 +69,7 @@ public interface FrameContext extends Closeable IndexMergerV9 indexMerger(); - Bouncer processorBouncer(); + ProcessingBuffers processingBuffers(); WorkerMemoryParameters memoryParameters(); @@ -76,4 +79,11 @@ default File tempDir(String name) { return new File(tempDir(), name); } + + /** + * Releases resources used in processing. This is called when processing has completed, but before results are + * cleaned up. Specifically, it is called by {@link WorkerImpl.KernelHolder#processorCloser}. + */ + @Override + void close(); } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/FrameProcessorFactory.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/FrameProcessorFactory.java index fbf02d46e346..1bdba5ee22e0 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/FrameProcessorFactory.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/FrameProcessorFactory.java @@ -78,6 +78,11 @@ ProcessorsAndChannels makeProcessors( boolean removeNullBytes ) throws IOException; + /** + * Whether processors from this factory use {@link org.apache.druid.msq.exec.ProcessingBuffers}. 
+ */ + boolean usesProcessingBuffers(); + @Nullable TypeReference getResultTypeReference(); diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/StageDefinition.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/StageDefinition.java index 19a7978abba8..cd2bb6a81f44 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/StageDefinition.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/StageDefinition.java @@ -146,6 +146,13 @@ public class StageDefinition } } + public static boolean mustGatherResultKeyStatistics(@Nullable final ShuffleSpec shuffleSpec) + { + return shuffleSpec != null + && shuffleSpec.kind() == ShuffleKind.GLOBAL_SORT + && ((GlobalSortShuffleSpec) shuffleSpec).mustGatherResultKeyStatistics(); + } + public static StageDefinitionBuilder builder(final int stageNumber) { return new StageDefinitionBuilder(stageNumber); @@ -302,14 +309,10 @@ public int getStageNumber() * For eg: we know there's exactly one partition in query shapes like `select with limit`. *

    * In such cases, we return a false. - * - * @return */ public boolean mustGatherResultKeyStatistics() { - return shuffleSpec != null - && shuffleSpec.kind() == ShuffleKind.GLOBAL_SORT - && ((GlobalSortShuffleSpec) shuffleSpec).mustGatherResultKeyStatistics(); + return mustGatherResultKeyStatistics(shuffleSpec); } public Either generatePartitionBoundariesForShuffle( diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/worker/WorkerStagePhase.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/worker/WorkerStagePhase.java index 4e59e7d17a89..10543beeb069 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/worker/WorkerStagePhase.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/kernel/worker/WorkerStagePhase.java @@ -19,6 +19,8 @@ package org.apache.druid.msq.kernel.worker; +import org.apache.druid.msq.exec.ProcessingBuffers; + /** * Phases that a stage can be in, as far as the worker is concerned. * @@ -99,6 +101,8 @@ public boolean isTerminal() /** * Whether this phase indicates a stage is running and consuming its full complement of resources. * + * Importantly, stages that are not running are not holding {@link ProcessingBuffers}. + * * There are still some resources that can be consumed by stages that are not running. For example, in the * {@link #FINISHED} state, stages can still have data on disk that has not been cleaned-up yet, some pointers * to that data that still reside in memory, and some counters in memory available for collection by the controller. diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/BaseLeafFrameProcessorFactory.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/BaseLeafFrameProcessorFactory.java index 4cf233876338..013b6d4c93c0 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/BaseLeafFrameProcessorFactory.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/BaseLeafFrameProcessorFactory.java @@ -352,7 +352,7 @@ private FrameProcessor> makeSegment return BroadcastJoinSegmentMapFnProcessor.create( query, broadcastInputs, - frameContext.memoryParameters().getBroadcastJoinMemory() + frameContext.memoryParameters().getBroadcastBufferMemory() ); } } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/BroadcastJoinSegmentMapFnProcessor.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/BroadcastJoinSegmentMapFnProcessor.java index ab160f7319da..cbb79c45702b 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/BroadcastJoinSegmentMapFnProcessor.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/BroadcastJoinSegmentMapFnProcessor.java @@ -83,7 +83,7 @@ public class BroadcastJoinSegmentMapFnProcessor implements FrameProcessor query, diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/WindowOperatorQueryFrameProcessorFactory.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/WindowOperatorQueryFrameProcessorFactory.java index 9852f4f40988..6ad7742672f9 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/WindowOperatorQueryFrameProcessorFactory.java +++ 
b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/WindowOperatorQueryFrameProcessorFactory.java @@ -174,6 +174,11 @@ public ProcessorsAndChannels makeProcessors( ); } + @Override + public boolean usesProcessingBuffers() + { + return false; + } @Override public boolean equals(Object o) diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/common/OffsetLimitFrameProcessorFactory.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/common/OffsetLimitFrameProcessorFactory.java index d04a75011fa7..a0332edb9027 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/common/OffsetLimitFrameProcessorFactory.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/common/OffsetLimitFrameProcessorFactory.java @@ -140,6 +140,12 @@ public ProcessorsAndChannels makeProcessors( ); } + @Override + public boolean usesProcessingBuffers() + { + return false; + } + @Override public boolean equals(Object o) { diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/common/SortMergeJoinFrameProcessorFactory.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/common/SortMergeJoinFrameProcessorFactory.java index 9eb95a468fd7..55391b138619 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/common/SortMergeJoinFrameProcessorFactory.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/common/SortMergeJoinFrameProcessorFactory.java @@ -197,6 +197,12 @@ public ProcessorsAndChannels makeProcessors( ); } + @Override + public boolean usesProcessingBuffers() + { + return false; + } + /** * Extracts key columns from a {@link JoinConditionAnalysis}. The returned list has two elements: 0 is the * left-hand side, 1 is the right-hand side. Each sub-list has one element for each equi-condition. 
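Between these factory changes, a note on how the new flag is consumed: a factory that returns true is handed processing buffers through its FrameContext, as GroupByPreShuffleFrameProcessorFactory does later in this patch via frameContext.processingBuffers().getBufferPool(). An illustrative fragment (the surrounding factory is hypothetical):

  @Override
  public boolean usesProcessingBuffers()
  {
    // Tells the worker to reserve a ProcessingBuffers slot before running this stage;
    // the processors can then call frameContext.processingBuffers().getBufferPool().
    return true;
  }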
diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByPostShuffleFrameProcessorFactory.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByPostShuffleFrameProcessorFactory.java index 851fc21c52e8..ab683c8329ab 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByPostShuffleFrameProcessorFactory.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByPostShuffleFrameProcessorFactory.java @@ -129,4 +129,10 @@ public ProcessorsAndChannels makeProcessors( OutputChannels.wrapReadOnly(ImmutableList.copyOf(outputChannels.values())) ); } + + @Override + public boolean usesProcessingBuffers() + { + return false; + } } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByPreShuffleFrameProcessor.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByPreShuffleFrameProcessor.java index 470b87d9416d..05c6f36c35fb 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByPreShuffleFrameProcessor.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByPreShuffleFrameProcessor.java @@ -20,6 +20,7 @@ package org.apache.druid.msq.querykit.groupby; import com.google.common.collect.Iterables; +import org.apache.druid.collections.NonBlockingPool; import org.apache.druid.collections.ResourceHolder; import org.apache.druid.frame.Frame; import org.apache.druid.frame.channel.FrameWithPartition; @@ -60,6 +61,7 @@ import org.apache.druid.timeline.SegmentId; import java.io.IOException; +import java.nio.ByteBuffer; import java.util.List; import java.util.function.Function; @@ -72,6 +74,7 @@ public class GroupByPreShuffleFrameProcessor extends BaseLeafFrameProcessor private static final Logger log = new Logger(GroupByPreShuffleFrameProcessor.class); private final GroupByQuery query; private final GroupingEngine groupingEngine; + private final NonBlockingPool bufferPool; private final ColumnSelectorFactory frameWriterColumnSelectorFactory; private final Closer closer = Closer.create(); @@ -84,6 +87,7 @@ public class GroupByPreShuffleFrameProcessor extends BaseLeafFrameProcessor public GroupByPreShuffleFrameProcessor( final GroupByQuery query, final GroupingEngine groupingEngine, + final NonBlockingPool bufferPool, final ReadableInput baseInput, final Function segmentMapFn, final ResourceHolder outputChannelHolder, @@ -98,6 +102,7 @@ public GroupByPreShuffleFrameProcessor( ); this.query = query; this.groupingEngine = groupingEngine; + this.bufferPool = bufferPool; this.frameWriterColumnSelectorFactory = RowBasedGrouperHelper.createResultRowBasedColumnSelectorFactory( query, () -> resultYielder.get(), @@ -155,6 +160,7 @@ protected ReturnOrAwait runWithSegment(final SegmentWithDescriptor segment query.withQuerySegmentSpec(new SpecificSegmentSpec(segment.getDescriptor())), mappedSegment.asCursorFactory(), mappedSegment.as(TimeBoundaryInspector.class), + bufferPool, null ); @@ -189,6 +195,7 @@ protected ReturnOrAwait runWithInputChannel( query.withQuerySegmentSpec(new MultipleIntervalSegmentSpec(Intervals.ONLY_ETERNITY)), mappedSegment.asCursorFactory(), mappedSegment.as(TimeBoundaryInspector.class), + bufferPool, null ); diff --git 
a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByPreShuffleFrameProcessorFactory.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByPreShuffleFrameProcessorFactory.java index 71f1e531b40e..5ae163c1fd0d 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByPreShuffleFrameProcessorFactory.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByPreShuffleFrameProcessorFactory.java @@ -65,10 +65,17 @@ protected FrameProcessor makeProcessor( return new GroupByPreShuffleFrameProcessor( query, frameContext.groupingEngine(), + frameContext.processingBuffers().getBufferPool(), baseInput, segmentMapFn, outputChannelHolder, frameWriterFactoryHolder ); } + + @Override + public boolean usesProcessingBuffers() + { + return true; + } } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/results/ExportResultsFrameProcessorFactory.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/results/ExportResultsFrameProcessorFactory.java index 930ba8861555..fe2598a95141 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/results/ExportResultsFrameProcessorFactory.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/results/ExportResultsFrameProcessorFactory.java @@ -117,6 +117,12 @@ public ResultsContext getResultsContext() return resultsContext; } + @Override + public boolean usesProcessingBuffers() + { + return false; + } + @Override public ProcessorsAndChannels makeProcessors( StageDefinition stageDefinition, diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/results/QueryResultFrameProcessorFactory.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/results/QueryResultFrameProcessorFactory.java index 17fc6c94817f..b9befa8374d4 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/results/QueryResultFrameProcessorFactory.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/results/QueryResultFrameProcessorFactory.java @@ -115,4 +115,10 @@ public ProcessorsAndChannels makeProcessors( OutputChannels.wrapReadOnly(ImmutableList.copyOf(outputChannels.values())) ); } + + @Override + public boolean usesProcessingBuffers() + { + return false; + } } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/scan/ScanQueryFrameProcessorFactory.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/scan/ScanQueryFrameProcessorFactory.java index e3d3619dd955..97ade19f5bcd 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/scan/ScanQueryFrameProcessorFactory.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/querykit/scan/ScanQueryFrameProcessorFactory.java @@ -85,4 +85,10 @@ protected FrameProcessor makeProcessor( frameWriterFactoryHolder ); } + + @Override + public boolean usesProcessingBuffers() + { + return false; + } } diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/ControllerMemoryParametersTest.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/ControllerMemoryParametersTest.java index 9d27dcca666b..d6ae0d7e190a 100644 --- 
a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/ControllerMemoryParametersTest.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/ControllerMemoryParametersTest.java @@ -39,7 +39,7 @@ public void test_oneQueryInJvm() 1 ); - Assert.assertEquals(100_400_000, memoryParameters.getPartitionStatisticsMaxRetainedBytes()); + Assert.assertEquals(101_400_000, memoryParameters.getPartitionStatisticsMaxRetainedBytes()); } @Test @@ -50,7 +50,7 @@ public void test_oneQueryInJvm_oneHundredWorkers() 100 ); - Assert.assertEquals(103_800_000, memoryParameters.getPartitionStatisticsMaxRetainedBytes()); + Assert.assertEquals(104_800_000, memoryParameters.getPartitionStatisticsMaxRetainedBytes()); } @Test @@ -61,7 +61,7 @@ public void test_twoQueriesInJvm() 1 ); - Assert.assertEquals(49_200_000, memoryParameters.getPartitionStatisticsMaxRetainedBytes()); + Assert.assertEquals(50_200_000, memoryParameters.getPartitionStatisticsMaxRetainedBytes()); } @Test @@ -91,7 +91,6 @@ public void test_notEnoughMemory() Assert.assertEquals(1, fault.getServerWorkers()); Assert.assertEquals(NUM_PROCESSORS_IN_JVM, fault.getServerThreads()); Assert.assertEquals(24_000_000, fault.getUsableMemory()); - Assert.assertEquals(33_750_000, fault.getSuggestedServerMemory()); } @Test @@ -102,7 +101,7 @@ public void test_minimalMemory() 1 ); - Assert.assertEquals(25_000_000, memoryParameters.getPartitionStatisticsMaxRetainedBytes()); + Assert.assertEquals(26_000_000, memoryParameters.getPartitionStatisticsMaxRetainedBytes()); } private MemoryIntrospector makeMemoryIntrospector( @@ -111,11 +110,11 @@ private MemoryIntrospector makeMemoryIntrospector( ) { return new MemoryIntrospectorImpl( - new TestLookupProvider(ImmutableMap.of()), totalMemoryInJvm, USABLE_MEMORY_FRACTION, numQueriesInJvm, - NUM_PROCESSORS_IN_JVM + NUM_PROCESSORS_IN_JVM, + new TestLookupProvider(ImmutableMap.of()) ); } } diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQInsertTest.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQInsertTest.java index 4d61555c80f2..af51c5dd4a7e 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQInsertTest.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQInsertTest.java @@ -1461,7 +1461,7 @@ public void testInsertWithTooLargeRowShouldThrowException(String contextName, Ma final File toRead = getResourceAsTemporaryFile("/wikipedia-sampled.json"); final String toReadFileNameAsJson = queryFramework().queryJsonMapper().writeValueAsString(toRead.getAbsolutePath()); - Mockito.doReturn(500).when(workerMemoryParameters).getStandardFrameSize(); + Mockito.doReturn(500).when(workerMemoryParameters).getFrameSize(); testIngestQuery().setSql(" insert into foo1 SELECT\n" + " floor(TIME_PARSE(\"timestamp\") to day) AS __time,\n" diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/WorkerMemoryParametersTest.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/WorkerMemoryParametersTest.java index 1ead2a181fd9..990610af99e8 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/WorkerMemoryParametersTest.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/WorkerMemoryParametersTest.java @@ -19,185 +19,387 @@ package org.apache.druid.msq.exec; +import com.google.common.collect.ImmutableList; +import 
it.unimi.dsi.fastutil.ints.IntSet; +import it.unimi.dsi.fastutil.ints.IntSets; import nl.jqno.equalsverifier.EqualsVerifier; +import org.apache.druid.frame.key.ClusterBy; +import org.apache.druid.frame.key.KeyColumn; +import org.apache.druid.frame.key.KeyOrder; import org.apache.druid.msq.indexing.error.MSQException; -import org.apache.druid.msq.indexing.error.MSQFault; import org.apache.druid.msq.indexing.error.NotEnoughMemoryFault; -import org.apache.druid.msq.indexing.error.TooManyWorkersFault; +import org.apache.druid.msq.input.InputSlice; +import org.apache.druid.msq.input.stage.ReadablePartitions; +import org.apache.druid.msq.input.stage.StageInputSlice; +import org.apache.druid.msq.kernel.GlobalSortTargetSizeShuffleSpec; +import org.apache.druid.msq.kernel.ShuffleSpec; import org.junit.Assert; import org.junit.Test; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + public class WorkerMemoryParametersTest { @Test - public void test_oneWorkerInJvm_alone() + public void test_1WorkerInJvm_alone_1Thread() { - Assert.assertEquals(params(335_500_000, 1, 41, 75_000_000), create(1_000_000_000, 1, 1, 1, 1, 0, 0)); - Assert.assertEquals(params(223_000_000, 2, 13, 75_000_000), create(1_000_000_000, 1, 2, 1, 1, 0, 0)); - Assert.assertEquals(params(133_000_000, 4, 3, 75_000_000), create(1_000_000_000, 1, 4, 1, 1, 0, 0)); - Assert.assertEquals(params(73_000_000, 3, 2, 75_000_000), create(1_000_000_000, 1, 8, 1, 1, 0, 0)); - Assert.assertEquals(params(49_923_076, 2, 2, 75_000_000), create(1_000_000_000, 1, 12, 1, 1, 0, 0)); + final int numThreads = 1; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; - final MSQException e = Assert.assertThrows( - MSQException.class, - () -> create(1_000_000_000, 1, 32, 1, 1, 0, 0) + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(1_250_000_000, 1, numThreads); + final List slices = makeInputSlices(ReadablePartitions.striped(0, 1, numThreads)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); + + Assert.assertEquals( + new WorkerMemoryParameters(973_000_000, frameSize, 1, 874, 97_300_000, 0), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 1, 1) ); - Assert.assertEquals(new NotEnoughMemoryFault(1_588_044_000, 1_000_000_000, 750_000_000, 1, 32, 1), e.getFault()); + } - final MSQFault fault = Assert.assertThrows(MSQException.class, () -> create(1_000_000_000, 2, 32, 1, 1, 0, 0)) - .getFault(); + @Test + public void test_1WorkerInJvm_alone_withBroadcast_1Thread() + { + final int numThreads = 1; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; + + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(1_250_000_000, 1, numThreads); + final List slices = makeInputSlices( + ReadablePartitions.striped(0, 1, numThreads), + ReadablePartitions.striped(0, 1, 1) + ); + final IntSet broadcastInputs = IntSets.singleton(1); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); - Assert.assertEquals(new NotEnoughMemoryFault(2024045333, 1_000_000_000, 750_000_000, 2, 32, 1), fault); + Assert.assertEquals( + new WorkerMemoryParameters(673_000_000, frameSize, 1, 604, 67_300_000, 200_000_000), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 1, 1) + ); } @Test - public void test_oneWorkerInJvm_alone_twoConcurrentStages() + public void test_1WorkerInJvm_alone_4Threads() { - 
Assert.assertEquals(params(166_750_000, 1, 20, 37_500_000), create(1_000_000_000, 1, 1, 2, 1, 0, 0)); - Assert.assertEquals(params(110_500_000, 2, 6, 37_500_000), create(1_000_000_000, 1, 2, 2, 1, 0, 0)); - Assert.assertEquals(params(65_500_000, 2, 3, 37_500_000), create(1_000_000_000, 1, 4, 2, 1, 0, 0)); - Assert.assertEquals(params(35_500_000, 1, 3, 37_500_000), create(1_000_000_000, 1, 8, 2, 1, 0, 0)); + final int numThreads = 4; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; - final MSQException e = Assert.assertThrows( - MSQException.class, - () -> create(1_000_000_000, 1, 12, 2, 1, 0, 0) + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(1_250_000_000, 1, numThreads); + final List slices = makeInputSlices(ReadablePartitions.striped(0, 1, numThreads)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); + + Assert.assertEquals( + new WorkerMemoryParameters(892_000_000, frameSize, 4, 199, 22_300_000, 0), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 1, 1) ); + } - Assert.assertEquals(new NotEnoughMemoryFault(1_736_034_666, 1_000_000_000, 750_000_000, 1, 12, 2), e.getFault()); + @Test + public void test_1WorkerInJvm_alone_withBroadcast_4Threads() + { + final int numThreads = 4; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; - final MSQFault fault = Assert.assertThrows(MSQException.class, () -> create(1_000_000_000, 2, 32, 2, 1, 0, 0)) - .getFault(); + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(1_250_000_000, 1, numThreads); + final List slices = makeInputSlices( + ReadablePartitions.striped(0, 1, numThreads), + ReadablePartitions.striped(0, 1, 1) + ); + final IntSet broadcastInputs = IntSets.singleton(1); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); - Assert.assertEquals(new NotEnoughMemoryFault(4_048_090_666L, 1_000_000_000, 750_000_000, 2, 32, 2), fault); + Assert.assertEquals( + new WorkerMemoryParameters(592_000_000, frameSize, 4, 132, 14_800_000, 200_000_000), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 1, 1) + ); } @Test - public void test_oneWorkerInJvm_twoHundredWorkersInCluster() + public void test_1WorkerInJvm_alone_noStats_4Threads() { - Assert.assertEquals(params(474_000_000, 1, 83, 150_000_000), create(2_000_000_000, 1, 1, 1, 200, 0, 0)); - Assert.assertEquals(params(249_000_000, 2, 27, 150_000_000), create(2_000_000_000, 1, 2, 1, 200, 0, 0)); + final int numThreads = 4; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; - final MSQException e = Assert.assertThrows( - MSQException.class, - () -> create(1_000_000_000, 1, 4, 1, 200, 0, 0) - ); + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(1_250_000_000, 1, 4); + final List slices = makeInputSlices(ReadablePartitions.striped(0, 1, numThreads)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = null; - Assert.assertEquals(new TooManyWorkersFault(200, 109), e.getFault()); + Assert.assertEquals( + new WorkerMemoryParameters(892_000_000, frameSize, 4, 222, 0, 0), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 1, 1) + ); } @Test - public void test_fourWorkersInJvm_twoHundredWorkersInCluster() + public void test_1WorkerInJvm_alone_2ConcurrentStages_4Threads() { - 
Assert.assertEquals(params(1_014_000_000, 1, 150, 168_750_000), create(9_000_000_000L, 4, 1, 1, 200, 0, 0)); - Assert.assertEquals(params(811_500_000, 2, 62, 168_750_000), create(9_000_000_000L, 4, 2, 1, 200, 0, 0)); - Assert.assertEquals(params(558_375_000, 4, 22, 168_750_000), create(9_000_000_000L, 4, 4, 1, 200, 0, 0)); - Assert.assertEquals(params(305_250_000, 4, 14, 168_750_000), create(9_000_000_000L, 4, 8, 1, 200, 0, 0)); - Assert.assertEquals(params(102_750_000, 4, 8, 168_750_000), create(9_000_000_000L, 4, 16, 1, 200, 0, 0)); + final int numThreads = 4; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; - final MSQException e = Assert.assertThrows( - MSQException.class, - () -> create(8_000_000_000L, 4, 32, 1, 200, 0, 0) + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(1_250_000_000, 1, numThreads); + final List slices = makeInputSlices(ReadablePartitions.striped(0, 1, numThreads)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); + + Assert.assertEquals( + new WorkerMemoryParameters(392_000_000, frameSize, 4, 87, 9_800_000, 0), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 2, 1) ); + } - Assert.assertEquals(new TooManyWorkersFault(200, 124), e.getFault()); + @Test + public void test_1WorkerInJvm_alone_2ConcurrentStages_4Threads_highHeap() + { + final int numThreads = 4; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; - // Make sure 124 actually works, and 125 doesn't. (Verify the error message above.) - Assert.assertEquals(params(25_000_000, 4, 3, 150_000_000), create(8_000_000_000L, 4, 32, 1, 124, 0, 0)); + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(6_250_000_000L, 1, numThreads); + final List slices = makeInputSlices(ReadablePartitions.striped(0, 1, numThreads)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); - final MSQException e2 = Assert.assertThrows( - MSQException.class, - () -> create(8_000_000_000L, 4, 32, 1, 125, 0, 0) + Assert.assertEquals( + new WorkerMemoryParameters(2_392_000_000L, frameSize, 4, 537, 59_800_000, 0), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 2, 1) ); + } + + @Test + public void test_1WorkerInJvm_alone_32Threads() + { + final int numThreads = 32; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; - Assert.assertEquals(new TooManyWorkersFault(125, 124), e2.getFault()); + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(1_250_000_000, 1, numThreads); + final List slices = makeInputSlices(ReadablePartitions.striped(0, 1, numThreads)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); + + Assert.assertEquals( + new WorkerMemoryParameters(136_000_000, frameSize, 32, 2, 425_000, 0), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 1, 1) + ); } @Test - public void test_fourWorkersInJvm_twoHundredWorkersInCluster_twoConcurrentStages() + public void test_1WorkerInJvm_alone_33Threads() { - Assert.assertEquals(params(406_500_000, 1, 74, 84_375_000), create(9_000_000_000L, 4, 1, 2, 200, 0, 0)); - Assert.assertEquals(params(305_250_000, 2, 30, 84_375_000), create(9_000_000_000L, 4, 2, 2, 200, 0, 0)); - Assert.assertEquals(params(178_687_500, 4, 10, 84_375_000), 
create(9_000_000_000L, 4, 4, 2, 200, 0, 0)); - Assert.assertEquals(params(52_125_000, 4, 6, 84_375_000), create(9_000_000_000L, 4, 8, 2, 200, 0, 0)); + final int numThreads = 33; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; - final MSQException e = Assert.assertThrows( - MSQException.class, - () -> create(8_000_000_000L, 4, 16, 2, 200, 0, 0) + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(1_250_000_000, 1, numThreads); + final List slices = makeInputSlices(ReadablePartitions.striped(0, 1, numThreads)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); + + Assert.assertEquals( + new WorkerMemoryParameters(109_000_000, frameSize, 32, 2, 330_303, 0), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 1, 1) ); + } - Assert.assertEquals(new TooManyWorkersFault(200, 109), e.getFault()); + @Test + public void test_1WorkerInJvm_alone_40Threads() + { + final int numThreads = 40; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; - // Make sure 109 actually works, and 110 doesn't. (Verify the error message above.) - Assert.assertEquals(params(25_000_000, 4, 3, 75_000_000), create(8_000_000_000L, 4, 16, 2, 109, 0, 0)); + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(1_250_000_000, 1, numThreads); + final List slices = makeInputSlices(ReadablePartitions.striped(0, 1, numThreads)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); - final MSQException e2 = Assert.assertThrows( + final MSQException e = Assert.assertThrows( MSQException.class, - () -> create(8_000_000_000L, 4, 16, 2, 110, 0, 0) + () -> WorkerMemoryParameters.createInstance( + memoryIntrospector, + frameSize, + slices, + broadcastInputs, + shuffleSpec, + 1, + 1 + ) ); - Assert.assertEquals(new TooManyWorkersFault(110, 109), e2.getFault()); + Assert.assertEquals( + new NotEnoughMemoryFault(1_366_250_000, 1_250_000_000, 1_000_000_000, 1, 40, 1, 1), + e.getFault() + ); } @Test - public void test_oneWorkerInJvm_smallWorkerCapacity() + public void test_1WorkerInJvm_alone_40Threads_slightlyLessMemoryThanError() { - // Supersorter max channels per processer are one less than they are usually to account for extra frames that are required while creating composing output channels - Assert.assertEquals(params(41_200_000, 1, 3, 9_600_000), create(128_000_000, 1, 1, 1, 1, 0, 0)); - Assert.assertEquals(params(26_800_000, 1, 1, 9_600_000), create(128_000_000, 1, 2, 1, 1, 0, 0)); + // Test with one byte less than the amount of memory recommended in the error message + // for test_1WorkerInJvm_alone_40Threads. 
+ final int numThreads = 40; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; + + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(1_366_250_000 - 1, 1, numThreads); + final List slices = makeInputSlices(ReadablePartitions.striped(0, 1, numThreads)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); final MSQException e = Assert.assertThrows( MSQException.class, - () -> create(1_000_000_000, 1, 32, 1, 1, 0, 0) + () -> WorkerMemoryParameters.createInstance( + memoryIntrospector, + frameSize, + slices, + broadcastInputs, + shuffleSpec, + 1, + 1 + ) ); - Assert.assertEquals(new NotEnoughMemoryFault(1_588_044_000, 1_000_000_000, 750_000_000, 1, 32, 1), e.getFault()); - final MSQException e2 = Assert.assertThrows( - MSQException.class, - () -> create(128_000_000, 1, 4, 1, 1, 0, 0) + Assert.assertEquals( + new NotEnoughMemoryFault(1_366_250_000, 1_366_249_999, 1_092_999_999, 1, 40, 1, 1), + e.getFault() ); - Assert.assertEquals(new NotEnoughMemoryFault(580_006_666, 12_8000_000, 96_000_000, 1, 4, 1), e2.getFault()); + } + + @Test + public void test_1WorkerInJvm_alone_40Threads_memoryFromError() + { + // Test with the amount of memory recommended in the error message for test_1WorkerInJvm_alone_40Threads. + final int numThreads = 40; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; + + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(1_366_250_000, 1, numThreads); + final List slices = makeInputSlices(ReadablePartitions.striped(0, 1, numThreads)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); + + Assert.assertEquals( + new WorkerMemoryParameters(13_000_000, frameSize, 1, 2, 250_000, 0), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 1, 1) + ); + } + + @Test + public void test_1WorkerInJvm_200WorkersInCluster_4Threads() + { + final int numThreads = 4; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; - final MSQFault fault = Assert.assertThrows(MSQException.class, () -> create(1_000_000_000, 2, 32, 1, 1, 0, 0)) - .getFault(); + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(2_500_000_000L, 1, numThreads); + final List slices = makeInputSlices(ReadablePartitions.striped(0, 200, numThreads)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); - Assert.assertEquals(new NotEnoughMemoryFault(2024045333, 1_000_000_000, 750_000_000, 2, 32, 1), fault); + Assert.assertEquals( + new WorkerMemoryParameters(1_096_000_000, frameSize, 4, 245, 27_400_000, 0), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 1, 1) + ); } @Test - public void test_fourWorkersInJvm_twoHundredWorkersInCluster_hashPartitions() + public void test_1WorkerInJvm_200WorkersInCluster_4Threads_2OutputPartitions() { - Assert.assertEquals(params(814_000_000, 1, 150, 168_750_000), create(9_000_000_000L, 4, 1, 1, 200, 200, 0)); - Assert.assertEquals(params(611_500_000, 2, 62, 168_750_000), create(9_000_000_000L, 4, 2, 1, 200, 200, 0)); - Assert.assertEquals(params(358_375_000, 4, 22, 168_750_000), create(9_000_000_000L, 4, 4, 1, 200, 200, 0)); - Assert.assertEquals(params(105_250_000, 4, 14, 168_750_000), create(9_000_000_000L, 4, 8, 1, 200, 200, 0)); + final int numThreads = 4; + final int frameSize = 
WorkerMemoryParameters.DEFAULT_FRAME_SIZE; - final MSQException e = Assert.assertThrows( - MSQException.class, - () -> create(9_000_000_000L, 4, 16, 1, 200, 200, 0) + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(2_500_000_000L, 1, numThreads); + final List slices = makeInputSlices(ReadablePartitions.striped(0, 200, 2)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); + + Assert.assertEquals( + new WorkerMemoryParameters(1_548_000_000, frameSize, 4, 347, 38_700_000, 0), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 1, 1) ); + } - Assert.assertEquals(new TooManyWorkersFault(200, 138), e.getFault()); + @Test + public void test_1WorkerInJvm_200WorkersInCluster_2ConcurrentStages_4Threads() + { + final int numThreads = 4; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; - // Make sure 138 actually works, and 139 doesn't. (Verify the error message above.) - Assert.assertEquals(params(26_750_000, 4, 8, 168_750_000), create(9_000_000_000L, 4, 16, 1, 138, 138, 0)); + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(2_500_000_000L, 1, numThreads); + final List slices = makeInputSlices(ReadablePartitions.striped(0, 200, numThreads)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); - final MSQException e2 = Assert.assertThrows( - MSQException.class, - () -> create(9_000_000_000L, 4, 16, 1, 139, 139, 0) + Assert.assertEquals( + new WorkerMemoryParameters(96_000_000, frameSize, 4, 20, 2_500_000, 0), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 2, 1) ); + } - Assert.assertEquals(new TooManyWorkersFault(139, 138), e2.getFault()); + @Test + public void test_12WorkersInJvm_200WorkersInCluster_64Threads_4OutputPartitions() + { + final int numThreads = 64; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; + + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(40_000_000_000L, 12, numThreads); + final List slices = makeInputSlices(ReadablePartitions.striped(0, 200, 4)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); + + Assert.assertEquals( + new WorkerMemoryParameters(1_762_666_666, frameSize, 64, 23, 2_754_166, 0), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 1, 1) + ); } @Test - public void test_oneWorkerInJvm_oneByteUsableMemory() + public void test_12WorkersInJvm_200WorkersInCluster_2ConcurrentStages_64Threads_4OutputPartitions() { - final MSQException e = Assert.assertThrows( - MSQException.class, - () -> WorkerMemoryParameters.createInstance(1, 1, 1, 1, 32, 1, 1) + final int numThreads = 64; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; + + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(40_000_000_000L, 12, numThreads); + final List slices = makeInputSlices(ReadablePartitions.striped(0, 200, 4)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); + + Assert.assertEquals( + new WorkerMemoryParameters(429_333_333, frameSize, 64, 5, 670_833, 0), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 2, 1) ); + } - Assert.assertEquals(new 
NotEnoughMemoryFault(554669334, 1, 1, 1, 1, 1), e.getFault()); + @Test + public void test_1WorkerInJvm_MaxWorkersInCluster_2ConcurrentStages_2Threads() + { + final int numWorkers = Limits.MAX_WORKERS; + final int numThreads = 2; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; + + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(6_250_000_000L, 1, numThreads); + final List slices = makeInputSlices(ReadablePartitions.striped(0, numWorkers, numThreads)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); + + Assert.assertEquals( + new WorkerMemoryParameters(448_000_000, frameSize, 2, 200, 22_400_000, 0), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 2, 1) + ); + } + + @Test + public void test_1WorkerInJvm_MaxWorkersInCluster_1Thread() + { + final int numWorkers = Limits.MAX_WORKERS; + final int numThreads = 1; + final int frameSize = WorkerMemoryParameters.DEFAULT_FRAME_SIZE; + + final MemoryIntrospectorImpl memoryIntrospector = createMemoryIntrospector(2_500_000_000L, 1, numThreads); + final List slices = makeInputSlices(ReadablePartitions.striped(0, numWorkers, numThreads)); + final IntSet broadcastInputs = IntSets.emptySet(); + final ShuffleSpec shuffleSpec = makeSortShuffleSpec(); + + Assert.assertEquals( + new WorkerMemoryParameters(974_000_000, frameSize, 1, 875, 97_400_000, 0), + WorkerMemoryParameters.createInstance(memoryIntrospector, frameSize, slices, broadcastInputs, shuffleSpec, 1, 1) + ); } @Test @@ -206,39 +408,28 @@ public void testEquals() EqualsVerifier.forClass(WorkerMemoryParameters.class).usingGetClass().verify(); } - private static WorkerMemoryParameters params( - final long processorBundleMemory, - final int superSorterMaxActiveProcessors, - final int superSorterMaxChannelsPerProcessor, - final int partitionStatisticsMaxRetainedBytes + private static MemoryIntrospectorImpl createMemoryIntrospector( + final long totalMemory, + final int numTasksInJvm, + final int numProcessingThreads ) { - return new WorkerMemoryParameters( - processorBundleMemory, - superSorterMaxActiveProcessors, - superSorterMaxChannelsPerProcessor, - partitionStatisticsMaxRetainedBytes - ); + return new MemoryIntrospectorImpl(totalMemory, 0.8, numTasksInJvm, numProcessingThreads, null); } - private static WorkerMemoryParameters create( - final long maxMemoryInJvm, - final int numWorkersInJvm, - final int numProcessingThreadsInJvm, - final int maxConcurrentStages, - final int numInputWorkers, - final int numHashOutputPartitions, - final int totalLookUpFootprint - ) + private static List makeInputSlices(final ReadablePartitions... 
partitionss) + { + return Arrays.stream(partitionss) + .map(partitions -> new StageInputSlice(0, partitions, OutputChannelMode.LOCAL_STORAGE)) + .collect(Collectors.toList()); + } + + private static ShuffleSpec makeSortShuffleSpec() { - return WorkerMemoryParameters.createInstance( - maxMemoryInJvm, - numWorkersInJvm, - numProcessingThreadsInJvm, - maxConcurrentStages, - numInputWorkers, - numHashOutputPartitions, - totalLookUpFootprint + return new GlobalSortTargetSizeShuffleSpec( + new ClusterBy(ImmutableList.of(new KeyColumn("foo", KeyOrder.ASCENDING)), 0), + 1_000_000, + false ); } } diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/IndexerWorkerContextTest.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/IndexerWorkerContextTest.java index dfb88d17b216..8de80cf109f8 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/IndexerWorkerContextTest.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/IndexerWorkerContextTest.java @@ -59,6 +59,7 @@ public void setup() null, null, null, + null, null ); } diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/error/MSQFaultSerdeTest.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/error/MSQFaultSerdeTest.java index 0c16ae08af5d..55c6c48c1afe 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/error/MSQFaultSerdeTest.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/error/MSQFaultSerdeTest.java @@ -74,7 +74,7 @@ public void testFaultSerde() throws IOException )); assertFaultSerde(new InvalidNullByteFault("the source", 1, "the column", "the value", 2)); assertFaultSerde(new InvalidFieldFault("the source", "the column", 1, "the error", "the log msg")); - assertFaultSerde(new NotEnoughMemoryFault(1000, 1000, 900, 1, 2, 2)); + assertFaultSerde(new NotEnoughMemoryFault(1234, 1000, 1000, 900, 1, 2, 2)); assertFaultSerde(QueryNotSupportedFault.INSTANCE); assertFaultSerde(new QueryRuntimeFault("new error", "base error")); assertFaultSerde(new QueryRuntimeFault("new error", null)); diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/querykit/ChainedProcessorManagerTest.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/querykit/ChainedProcessorManagerTest.java index a7df04963c3b..4fadb7700f7e 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/querykit/ChainedProcessorManagerTest.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/querykit/ChainedProcessorManagerTest.java @@ -32,9 +32,9 @@ import org.apache.druid.frame.processor.Bouncer; import org.apache.druid.frame.processor.FrameProcessorExecutorTest; import org.apache.druid.frame.processor.FrameProcessors; +import org.apache.druid.frame.processor.manager.NilFrameProcessor; import org.apache.druid.frame.processor.manager.ProcessorManager; import org.apache.druid.frame.processor.manager.ProcessorManagers; -import org.apache.druid.frame.processor.manager.SequenceProcessorManagerTest; import org.apache.druid.frame.processor.test.SimpleReturningFrameProcessor; import org.apache.druid.frame.processor.test.SingleChannelFrameProcessor; import org.apache.druid.frame.processor.test.SingleRowWritingFrameProcessor; @@ -184,7 +184,7 @@ public void test_failing_processor_manager() ProcessorManagers.of( 
ImmutableList.of( new SimpleReturningFrameProcessor<>(ImmutableList.of(4L, 5L, 6L)), - new SequenceProcessorManagerTest.NilFrameProcessor<>() + new NilFrameProcessor<>() ) ), (values) -> createNextProcessors( diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestBase.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestBase.java index c17238d4b683..0bba94f05f9c 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestBase.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestBase.java @@ -429,6 +429,18 @@ public void setUp2() throws Exception binder -> { DruidProcessingConfig druidProcessingConfig = new DruidProcessingConfig() { + @Override + public int getNumThreads() + { + return 1; + } + + @Override + public int intermediateComputeSizeBytes() + { + return 10_000_000; + } + @Override public String getFormatString() { @@ -751,14 +763,13 @@ public static ObjectMapper setupObjectMapper(Injector injector) public static WorkerMemoryParameters makeTestWorkerMemoryParameters() { - return WorkerMemoryParameters.createInstance( - WorkerMemoryParameters.PROCESSING_MINIMUM_BYTES * 50, - 2, - 10, - 1, - 2, + return new WorkerMemoryParameters( + 100_000_000, + WorkerMemoryParameters.DEFAULT_FRAME_SIZE, 1, - 0 + 50, + 10_000_000, + 10_000_000 ); } diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestWorkerContext.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestWorkerContext.java index 1b92f468fced..0902e978641b 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestWorkerContext.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestWorkerContext.java @@ -21,20 +21,21 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.inject.Injector; +import org.apache.druid.collections.StupidPool; import org.apache.druid.frame.processor.Bouncer; import org.apache.druid.java.util.common.FileUtils; import org.apache.druid.java.util.common.io.Closer; import org.apache.druid.msq.exec.Controller; import org.apache.druid.msq.exec.ControllerClient; import org.apache.druid.msq.exec.DataServerQueryHandlerFactory; -import org.apache.druid.msq.exec.OutputChannelMode; +import org.apache.druid.msq.exec.ProcessingBuffers; import org.apache.druid.msq.exec.Worker; import org.apache.druid.msq.exec.WorkerClient; import org.apache.druid.msq.exec.WorkerContext; import org.apache.druid.msq.exec.WorkerMemoryParameters; import org.apache.druid.msq.exec.WorkerStorageParameters; import org.apache.druid.msq.kernel.FrameContext; -import org.apache.druid.msq.kernel.QueryDefinition; +import org.apache.druid.msq.kernel.WorkOrder; import org.apache.druid.msq.querykit.DataSegmentProvider; import org.apache.druid.query.groupby.GroupingEngine; import org.apache.druid.segment.IndexIO; @@ -48,6 +49,7 @@ import org.apache.druid.server.DruidNode; import java.io.File; +import java.nio.ByteBuffer; import java.util.Map; public class MSQTestWorkerContext implements WorkerContext @@ -58,7 +60,6 @@ public class MSQTestWorkerContext implements WorkerContext private final Injector injector; private final Map inMemoryWorkers; private final File file = FileUtils.createTempDir(); - private final Bouncer bouncer = new Bouncer(1); private final WorkerMemoryParameters workerMemoryParameters; private final WorkerStorageParameters 
workerStorageParameters;

@@ -130,9 +131,9 @@ public File tempDir()
   }
 
   @Override
-  public FrameContext frameContext(QueryDefinition queryDef, int stageNumber, OutputChannelMode outputChannelMode)
+  public FrameContext frameContext(WorkOrder workOrder)
   {
-    return new FrameContextImpl(new File(tempDir(), queryDef.getStageDefinition(stageNumber).getId().toString()));
+    return new FrameContextImpl(new File(tempDir(), workOrder.getStageDefinition().getId().toString()));
   }
 
   @Override
@@ -246,9 +247,12 @@ public IndexMergerV9 indexMerger()
   }
 
   @Override
-  public Bouncer processorBouncer()
+  public ProcessingBuffers processingBuffers()
   {
-    return bouncer;
+    return new ProcessingBuffers(
+        new StupidPool<>("testProcessing", () -> ByteBuffer.allocate(1_000_000)),
+        new Bouncer(1)
+    );
   }
 
   @Override
diff --git a/processing/src/main/java/org/apache/druid/collections/QueueNonBlockingPool.java b/processing/src/main/java/org/apache/druid/collections/QueueNonBlockingPool.java
new file mode 100644
index 000000000000..1c3309c958fa
--- /dev/null
+++ b/processing/src/main/java/org/apache/druid/collections/QueueNonBlockingPool.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.collections;
+
+import java.util.NoSuchElementException;
+import java.util.concurrent.BlockingQueue;
+
+/**
+ * Implementation of {@link NonBlockingPool} based on a pre-created {@link BlockingQueue} that never actually blocks.
+ * If the pool is empty when {@link #take()} is called, it throws {@link NoSuchElementException}.
+ */
+public class QueueNonBlockingPool<T> implements NonBlockingPool<T>
+{
+  private final BlockingQueue<T> queue;
+
+  public QueueNonBlockingPool(final BlockingQueue<T> queue)
+  {
+    this.queue = queue;
+  }
+
+  @Override
+  public ResourceHolder<T> take()
+  {
+    final T item = queue.poll();
+    if (item == null) {
+      throw new NoSuchElementException("No items available");
+    }
+
+    return new ReferenceCountingResourceHolder<>(item, () -> queue.add(item));
+  }
+}
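To make the contract above concrete, here is a minimal usage sketch for QueueNonBlockingPool (illustrative code, not part of the patch; the buffer count and size are arbitrary):

    import java.nio.ByteBuffer;
    import java.util.concurrent.ArrayBlockingQueue;
    import java.util.concurrent.BlockingQueue;
    import org.apache.druid.collections.NonBlockingPool;
    import org.apache.druid.collections.QueueNonBlockingPool;
    import org.apache.druid.collections.ResourceHolder;

    class QueueNonBlockingPoolExample
    {
      static void example()
      {
        // The pool never allocates: every pooled object must be added to the queue up front.
        final BlockingQueue<ByteBuffer> queue = new ArrayBlockingQueue<>(2);
        queue.add(ByteBuffer.allocate(1_000_000));
        queue.add(ByteBuffer.allocate(1_000_000));
        final NonBlockingPool<ByteBuffer> pool = new QueueNonBlockingPool<>(queue);

        // take() throws NoSuchElementException instead of blocking when the queue is empty;
        // closing the holder runs the closer passed to ReferenceCountingResourceHolder,
        // which returns the buffer to the queue.
        try (final ResourceHolder<ByteBuffer> holder = pool.take()) {
          final ByteBuffer buffer = holder.get();
          buffer.clear();
          // ... use the buffer ...
        }
      }
    }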
diff --git a/processing/src/main/java/org/apache/druid/frame/processor/manager/ConcurrencyLimitedProcessorManager.java b/processing/src/main/java/org/apache/druid/frame/processor/manager/ConcurrencyLimitedProcessorManager.java
new file mode 100644
index 000000000000..751507423b0b
--- /dev/null
+++ b/processing/src/main/java/org/apache/druid/frame/processor/manager/ConcurrencyLimitedProcessorManager.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.frame.processor.manager;
+
+import com.google.common.util.concurrent.ListenableFuture;
+import org.apache.druid.common.guava.FutureUtils;
+import org.apache.druid.frame.processor.Bouncer;
+
+import java.util.Optional;
+
+/**
+ * Manager that limits the number of processors that may run concurrently.
+ */
+public class ConcurrencyLimitedProcessorManager<T, R> implements ProcessorManager<T, R>
+{
+  private final ProcessorManager<T, R> delegate;
+  private final Bouncer bouncer;
+
+  public ConcurrencyLimitedProcessorManager(ProcessorManager<T, R> delegate, int limit)
+  {
+    this.delegate = delegate;
+    this.bouncer = new Bouncer(limit);
+  }
+
+  @Override
+  public ListenableFuture<Optional<ProcessorAndCallback<T>>> next()
+  {
+    final ListenableFuture<Bouncer.Ticket> ticket = bouncer.ticket();
+    return FutureUtils.transformAsync(
+        ticket,
+        t -> FutureUtils.transform(
+            delegate.next(),
+            nextProcessor -> nextProcessor.map(
+                retVal -> new ProcessorAndCallback<>(
+                    retVal.processor(),
+                    r -> {
+                      FutureUtils.getUncheckedImmediately(ticket).giveBack();
+                      retVal.onComplete(r);
+                    }
+                )
+            )
+        )
+    );
+  }
+
+  @Override
+  public R result()
+  {
+    return delegate.result();
+  }
+
+  @Override
+  public void close()
+  {
+    delegate.close();
+  }
+}
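The Bouncer is the key mechanism here: next() does not resolve until a ticket is available, and the ticket is given back inside the completion callback, so at most `limit` processors from the delegate are ever outstanding. A brief usage sketch (illustrative, not from the patch; the processor list is assumed to exist):

    import com.google.common.collect.ImmutableList;
    import org.apache.druid.frame.processor.FrameProcessor;
    import org.apache.druid.frame.processor.manager.ConcurrencyLimitedProcessorManager;
    import org.apache.druid.frame.processor.manager.ProcessorManager;
    import org.apache.druid.frame.processor.manager.ProcessorManagers;

    class ConcurrencyLimitExample
    {
      static ProcessorManager<Object, Long> limitToTwo(ImmutableList<FrameProcessor<Object>> processors)
      {
        // ProcessorManagers.of(...) yields the processors in order and counts them in its result.
        // The wrapper ensures no more than two execute concurrently: a third call to next()
        // returns a future that stays unresolved until an earlier processor gives back its ticket.
        return new ConcurrencyLimitedProcessorManager<>(ProcessorManagers.of(processors), 2);
      }
    }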
diff --git a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryRunnerFactory.java b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryRunnerFactory.java
index d6136ebd9c48..665eb5dd1252 100644
--- a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryRunnerFactory.java
+++ b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryRunnerFactory.java
@@ -21,6 +21,8 @@
 import com.google.common.annotations.VisibleForTesting;
 import com.google.inject.Inject;
+import org.apache.druid.collections.NonBlockingPool;
+import org.apache.druid.guice.annotations.Global;
 import org.apache.druid.java.util.common.ISE;
 import org.apache.druid.java.util.common.guava.Sequence;
 import org.apache.druid.query.Query;
@@ -36,6 +38,8 @@
 import javax.annotation.Nullable;
 
+import java.nio.ByteBuffer;
+
 /**
  *
  */
@@ -43,21 +47,24 @@ public class GroupByQueryRunnerFactory implements QueryRunnerFactory<ResultRow, GroupByQuery>
 {
   private final GroupingEngine groupingEngine;
   private final GroupByQueryQueryToolChest toolChest;
+  private final NonBlockingPool<ByteBuffer> processingBufferPool;
 
   @Inject
   public GroupByQueryRunnerFactory(
       GroupingEngine groupingEngine,
-      GroupByQueryQueryToolChest toolChest
+      GroupByQueryQueryToolChest toolChest,
+      @Global NonBlockingPool<ByteBuffer> processingBufferPool
   )
   {
     this.groupingEngine = groupingEngine;
     this.toolChest = toolChest;
+    this.processingBufferPool = processingBufferPool;
   }
 
   @Override
   public QueryRunner<ResultRow> createRunner(final Segment segment)
   {
-    return new GroupByQueryRunner(segment, groupingEngine);
+    return new GroupByQueryRunner(segment, groupingEngine, processingBufferPool);
   }
 
   /**
@@ -69,14 +76,9 @@ public QueryRunner<ResultRow> mergeRunners(
       final Iterable<QueryRunner<ResultRow>> queryRunners
   )
   {
-    return new QueryRunner<ResultRow>()
-    {
-      @Override
-      public Sequence<ResultRow> run(QueryPlus<ResultRow> queryPlus, ResponseContext responseContext)
-      {
-        QueryRunner<ResultRow> rowQueryRunner = groupingEngine.mergeRunners(queryProcessingPool, queryRunners);
-        return rowQueryRunner.run(queryPlus, responseContext);
-      }
+    return (queryPlus, responseContext) -> {
+      QueryRunner<ResultRow> rowQueryRunner = groupingEngine.mergeRunners(queryProcessingPool, queryRunners);
+      return rowQueryRunner.run(queryPlus, responseContext);
     };
   }
 
@@ -92,12 +94,18 @@ private static class GroupByQueryRunner implements QueryRunner<ResultRow>
     @Nullable
     private final TimeBoundaryInspector timeBoundaryInspector;
     private final GroupingEngine groupingEngine;
+    private final NonBlockingPool<ByteBuffer> processingBufferPool;
 
-    public GroupByQueryRunner(Segment segment, final GroupingEngine groupingEngine)
+    public GroupByQueryRunner(
+        Segment segment,
+        final GroupingEngine groupingEngine,
+        final NonBlockingPool<ByteBuffer> processingBufferPool
+    )
     {
       this.cursorFactory = segment.asCursorFactory();
       this.timeBoundaryInspector = segment.as(TimeBoundaryInspector.class);
       this.groupingEngine = groupingEngine;
+      this.processingBufferPool = processingBufferPool;
     }
 
     @Override
@@ -112,6 +120,7 @@ public Sequence<ResultRow> run(QueryPlus<ResultRow> queryPlus, ResponseContext r
           (GroupByQuery) query,
           cursorFactory,
           timeBoundaryInspector,
+          processingBufferPool,
           (GroupByQueryMetrics) queryPlus.getQueryMetrics()
       );
     }
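With this change the processing buffer pool is owned by the factory rather than by GroupingEngine, and createRunner() threads it into each per-segment GroupByQueryRunner. A wiring sketch without Guice (illustrative; groupingEngine, toolChest, and segment are assumed to already exist, and the hand-built pool stands in for the injected @Global binding):

    import java.nio.ByteBuffer;
    import org.apache.druid.collections.NonBlockingPool;
    import org.apache.druid.collections.StupidPool;
    import org.apache.druid.query.QueryRunner;
    import org.apache.druid.query.groupby.GroupByQueryQueryToolChest;
    import org.apache.druid.query.groupby.GroupByQueryRunnerFactory;
    import org.apache.druid.query.groupby.GroupingEngine;
    import org.apache.druid.query.groupby.ResultRow;
    import org.apache.druid.segment.Segment;

    class GroupByFactoryWiringExample
    {
      static QueryRunner<ResultRow> wire(GroupingEngine groupingEngine, GroupByQueryQueryToolChest toolChest, Segment segment)
      {
        // Stand-in for the @Global processing buffer pool that Guice normally injects.
        final NonBlockingPool<ByteBuffer> processingPool =
            new StupidPool<>("processing", () -> ByteBuffer.allocate(10_000_000));

        // The factory now owns the pool and hands it to each per-segment runner.
        final GroupByQueryRunnerFactory factory =
            new GroupByQueryRunnerFactory(groupingEngine, toolChest, processingPool);
        return factory.createRunner(segment);
      }
    }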
diff --git a/processing/src/main/java/org/apache/druid/query/groupby/GroupingEngine.java b/processing/src/main/java/org/apache/druid/query/groupby/GroupingEngine.java
index ce63050a7e61..67583fc1fc0a 100644
--- a/processing/src/main/java/org/apache/druid/query/groupby/GroupingEngine.java
+++ b/processing/src/main/java/org/apache/druid/query/groupby/GroupingEngine.java
@@ -32,7 +32,6 @@
 import org.apache.druid.collections.ReferenceCountingResourceHolder;
 import org.apache.druid.collections.ResourceHolder;
 import org.apache.druid.common.config.NullHandling;
-import org.apache.druid.guice.annotations.Global;
 import org.apache.druid.guice.annotations.Json;
 import org.apache.druid.guice.annotations.Merging;
 import org.apache.druid.guice.annotations.Smile;
@@ -118,8 +117,7 @@ public class GroupingEngine
   private final DruidProcessingConfig processingConfig;
   private final Supplier<GroupByQueryConfig> configSupplier;
-  private final NonBlockingPool<ByteBuffer> bufferPool;
-  GroupByResourcesReservationPool groupByResourcesReservationPool;
+  private final GroupByResourcesReservationPool groupByResourcesReservationPool;
   private final ObjectMapper jsonMapper;
   private final ObjectMapper spillMapper;
   private final QueryWatcher queryWatcher;
@@ -128,7 +126,6 @@ public GroupingEngine(
       DruidProcessingConfig processingConfig,
       Supplier<GroupByQueryConfig> configSupplier,
-      @Global NonBlockingPool<ByteBuffer> bufferPool,
       @Merging GroupByResourcesReservationPool groupByResourcesReservationPool,
       @Json ObjectMapper jsonMapper,
       @Smile ObjectMapper spillMapper,
@@ -137,7 +134,6 @@ public GroupingEngine(
   {
     this.processingConfig = processingConfig;
     this.configSupplier = configSupplier;
-    this.bufferPool = bufferPool;
     this.groupByResourcesReservationPool = groupByResourcesReservationPool;
     this.jsonMapper = jsonMapper;
     this.spillMapper = spillMapper;
@@ -470,6 +466,8 @@ public QueryRunner<ResultRow> mergeRunners(
    * @param query                 the groupBy query
    * @param cursorFactory         cursor factory for the segment in question
    * @param timeBoundaryInspector time boundary inspector for the segment in question
+   * @param bufferPool            processing buffer pool
+   * @param groupByQueryMetrics   metrics instance, will be populated if nonnull
    *
    * @return result sequence for the cursor factory
    */
@@ -477,6 +475,7 @@ public Sequence<ResultRow> process(
       GroupByQuery query,
       CursorFactory cursorFactory,
       @Nullable TimeBoundaryInspector timeBoundaryInspector,
+      NonBlockingPool<ByteBuffer> bufferPool,
       @Nullable GroupByQueryMetrics groupByQueryMetrics
   )
   {
diff --git a/processing/src/test/java/org/apache/druid/collections/QueueNonBlockingPoolTest.java b/processing/src/test/java/org/apache/druid/collections/QueueNonBlockingPoolTest.java
new file mode 100644
index 000000000000..ae20bce41c95
--- /dev/null
+++ b/processing/src/test/java/org/apache/druid/collections/QueueNonBlockingPoolTest.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.collections;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.NoSuchElementException;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+
+public class QueueNonBlockingPoolTest
+{
+  @Test
+  public void testTakeAllTwice()
+  {
+    final BlockingQueue<String> queue = new ArrayBlockingQueue<>(2);
+    queue.add("foo");
+    queue.add("bar");
+
+    final QueueNonBlockingPool<String> pool = new QueueNonBlockingPool<>(queue);
+
+    // Take everything from pool
+    final ResourceHolder<String> obj1 = pool.take();
+    Assert.assertEquals("foo", obj1.get());
+    Assert.assertEquals(1, queue.size());
+
+    final ResourceHolder<String> obj2 = pool.take();
+    Assert.assertEquals("bar", obj2.get());
+    Assert.assertEquals(0, queue.size());
+
+    Assert.assertThrows(
+        NoSuchElementException.class,
+        pool::take
+    );
+
+    // Re-fill pool in reverse order
+    obj2.close();
+    Assert.assertEquals(1, queue.size());
+
+    obj1.close();
+    Assert.assertEquals(2, queue.size());
+
+    // Re-take everything from pool
+
+    final ResourceHolder<String> obj1b = pool.take();
+    Assert.assertEquals("bar", obj1b.get());
+    Assert.assertEquals(1, queue.size());
+
+    final ResourceHolder<String> obj2b = pool.take();
+    Assert.assertEquals("foo", obj2b.get());
+    Assert.assertEquals(0, queue.size());
+
+    Assert.assertThrows(
+        NoSuchElementException.class,
+        pool::take
+    );
+  }
+}
diff --git a/processing/src/test/java/org/apache/druid/frame/processor/manager/ConcurrencyLimitedProcessorManagerTest.java b/processing/src/test/java/org/apache/druid/frame/processor/manager/ConcurrencyLimitedProcessorManagerTest.java
new file mode 100644
index 000000000000..85739efda8a3
--- /dev/null
+++ b/processing/src/test/java/org/apache/druid/frame/processor/manager/ConcurrencyLimitedProcessorManagerTest.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.frame.processor.manager;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.util.concurrent.ListenableFuture;
+import org.apache.druid.java.util.common.Unit;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.Optional;
+
+public class ConcurrencyLimitedProcessorManagerTest
+{
+  @Test
+  public void test_empty() throws Exception
+  {
+    try (final ConcurrencyLimitedProcessorManager<Unit, Long> manager =
+             new ConcurrencyLimitedProcessorManager<>(ProcessorManagers.none(), 1)) {
+      final ListenableFuture<Optional<ProcessorAndCallback<Unit>>> future = manager.next();
+      Assert.assertTrue(future.isDone());
+      Assert.assertFalse(future.get().isPresent());
+      Assert.assertEquals(0, (long) manager.result());
+    }
+  }
+
+  @Test
+  public void test_one_limitOne() throws Exception
+  {
+    final NilFrameProcessor<Unit> processor = new NilFrameProcessor<>();
+
+    try (final ConcurrencyLimitedProcessorManager<Unit, Long> manager =
+             new ConcurrencyLimitedProcessorManager<>(ProcessorManagers.of(ImmutableList.of(processor)), 1)) {
+      // First element.
+      ListenableFuture<Optional<ProcessorAndCallback<Unit>>> future = manager.next();
+      Assert.assertTrue(future.isDone());
+      Assert.assertTrue(future.get().isPresent());
+      Assert.assertSame(processor, future.get().get().processor());
+
+      // Simulate processor finishing.
+      future.get().get().onComplete(Unit.instance());
+
+      // End of sequence.
+      future = manager.next();
+      Assert.assertTrue(future.isDone());
+      Assert.assertFalse(future.get().isPresent());
+    }
+  }
+
+  @Test
+  public void test_two_limitOne() throws Exception
+  {
+    final NilFrameProcessor<Unit> processor0 = new NilFrameProcessor<>();
+    final NilFrameProcessor<Unit> processor1 = new NilFrameProcessor<>();
+    final ImmutableList<NilFrameProcessor<Unit>> processors = ImmutableList.of(processor0, processor1);
+
+    try (final ConcurrencyLimitedProcessorManager<Unit, Long> manager =
+             new ConcurrencyLimitedProcessorManager<>(ProcessorManagers.of(processors), 1)) {
+      // First element.
+      ListenableFuture<Optional<ProcessorAndCallback<Unit>>> future0 = manager.next();
+      Assert.assertTrue(future0.isDone());
+      Assert.assertTrue(future0.get().isPresent());
+      Assert.assertSame(processors.get(0), future0.get().get().processor());
+
+      // Second element. Not yet ready to run due to the limit.
+      ListenableFuture<Optional<ProcessorAndCallback<Unit>>> future1 = manager.next();
+      Assert.assertFalse(future1.isDone());
+
+      // Simulate processor0 finishing.
+      future0.get().get().onComplete(Unit.instance());
+
+      // processor1 is now ready to run.
+      Assert.assertTrue(future1.isDone());
+      Assert.assertTrue(future1.get().isPresent());
+      Assert.assertSame(processors.get(1), future1.get().get().processor());
+
+      // Simulate processor1 finishing.
+      future1.get().get().onComplete(Unit.instance());
+
+      // End of sequence.
+ future1 = manager.next(); + Assert.assertTrue(future1.isDone()); + Assert.assertFalse(future1.get().isPresent()); + } + } +} diff --git a/processing/src/test/java/org/apache/druid/frame/processor/manager/NilFrameProcessor.java b/processing/src/test/java/org/apache/druid/frame/processor/manager/NilFrameProcessor.java new file mode 100644 index 000000000000..3cd749bbcc20 --- /dev/null +++ b/processing/src/test/java/org/apache/druid/frame/processor/manager/NilFrameProcessor.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.frame.processor.manager; + +import it.unimi.dsi.fastutil.ints.IntSet; +import org.apache.druid.frame.channel.ReadableFrameChannel; +import org.apache.druid.frame.channel.WritableFrameChannel; +import org.apache.druid.frame.processor.FrameProcessor; +import org.apache.druid.frame.processor.ReturnOrAwait; + +import java.util.Collections; +import java.util.List; + +/** + * Frame processor that throws an exception from {@link #runIncrementally(IntSet)}. Used as a dummy processor + * by tests of {@link ProcessorManager}. + */ +public class NilFrameProcessor implements FrameProcessor +{ + @Override + public List inputChannels() + { + return Collections.emptyList(); + } + + @Override + public List outputChannels() + { + return Collections.emptyList(); + } + + @Override + public ReturnOrAwait runIncrementally(IntSet readableInputs) + { + throw new UnsupportedOperationException(); + } + + @Override + public void cleanup() + { + // Do nothing. 
+ } +} diff --git a/processing/src/test/java/org/apache/druid/frame/processor/manager/SequenceProcessorManagerTest.java b/processing/src/test/java/org/apache/druid/frame/processor/manager/SequenceProcessorManagerTest.java index a1ce465540aa..7bbbfca36173 100644 --- a/processing/src/test/java/org/apache/druid/frame/processor/manager/SequenceProcessorManagerTest.java +++ b/processing/src/test/java/org/apache/druid/frame/processor/manager/SequenceProcessorManagerTest.java @@ -21,18 +21,13 @@ import com.google.common.collect.ImmutableList; import com.google.common.util.concurrent.ListenableFuture; -import it.unimi.dsi.fastutil.ints.IntSet; -import org.apache.druid.frame.channel.ReadableFrameChannel; -import org.apache.druid.frame.channel.WritableFrameChannel; import org.apache.druid.frame.processor.FrameProcessor; -import org.apache.druid.frame.processor.ReturnOrAwait; import org.apache.druid.java.util.common.Unit; import org.apache.druid.java.util.common.guava.Sequences; import org.junit.Assert; import org.junit.Test; import java.util.Collections; -import java.util.List; import java.util.NoSuchElementException; import java.util.Optional; import java.util.concurrent.atomic.AtomicLong; @@ -59,7 +54,7 @@ public void test_empty() throws Exception @Test public void test_one() throws Exception { - final NilFrameProcessor processor = new NilFrameProcessor(); + final NilFrameProcessor processor = new NilFrameProcessor<>(); final AtomicLong closed = new AtomicLong(); try (final SequenceProcessorManager> manager = @@ -84,8 +79,8 @@ public void test_one() throws Exception @Test public void test_two() throws Exception { - final NilFrameProcessor processor0 = new NilFrameProcessor(); - final NilFrameProcessor processor1 = new NilFrameProcessor(); + final NilFrameProcessor processor0 = new NilFrameProcessor<>(); + final NilFrameProcessor processor1 = new NilFrameProcessor<>(); final AtomicLong closed = new AtomicLong(); try (final SequenceProcessorManager> manager = @@ -139,31 +134,4 @@ public void test_empty_closeThenNext() // Sequence is not closed because it never started iterating. Assert.assertEquals(0, closed.get()); } - - public static class NilFrameProcessor implements FrameProcessor - { - @Override - public List inputChannels() - { - return Collections.emptyList(); - } - - @Override - public List outputChannels() - { - return Collections.emptyList(); - } - - @Override - public ReturnOrAwait runIncrementally(IntSet readableInputs) - { - throw new UnsupportedOperationException(); - } - - @Override - public void cleanup() - { - // Do nothing. 
- } - } } diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByLimitPushDownInsufficientBufferTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByLimitPushDownInsufficientBufferTest.java index dcbcb79724f5..1eb8774c207f 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/GroupByLimitPushDownInsufficientBufferTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/GroupByLimitPushDownInsufficientBufferTest.java @@ -336,7 +336,6 @@ public String getFormatString() final GroupingEngine groupingEngine = new GroupingEngine( druidProcessingConfig, configSupplier, - bufferPool, groupByResourcesReservationPool, TestHelper.makeJsonMapper(), new ObjectMapper(new SmileFactory()), @@ -346,7 +345,6 @@ public String getFormatString() final GroupingEngine tooSmallEngine = new GroupingEngine( tooSmallDruidProcessingConfig, configSupplier, - bufferPool2, tooSmallGroupByResourcesReservationPool, TestHelper.makeJsonMapper(), new ObjectMapper(new SmileFactory()), @@ -355,12 +353,14 @@ public String getFormatString() groupByFactory = new GroupByQueryRunnerFactory( groupingEngine, - new GroupByQueryQueryToolChest(groupingEngine, groupByResourcesReservationPool) + new GroupByQueryQueryToolChest(groupingEngine, groupByResourcesReservationPool), + bufferPool ); tooSmallGroupByFactory = new GroupByQueryRunnerFactory( tooSmallEngine, - new GroupByQueryQueryToolChest(tooSmallEngine, tooSmallGroupByResourcesReservationPool) + new GroupByQueryQueryToolChest(tooSmallEngine, tooSmallGroupByResourcesReservationPool), + bufferPool2 ); } diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByLimitPushDownMultiNodeMergeTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByLimitPushDownMultiNodeMergeTest.java index 9b9fea03231e..987d9a03f2dc 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/GroupByLimitPushDownMultiNodeMergeTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/GroupByLimitPushDownMultiNodeMergeTest.java @@ -588,7 +588,6 @@ public String getFormatString() final GroupingEngine groupingEngineBroker = new GroupingEngine( druidProcessingConfig, configSupplier, - bufferPool, groupByResourcesReservationPoolBroker, TestHelper.makeJsonMapper(), new ObjectMapper(new SmileFactory()), @@ -597,7 +596,6 @@ public String getFormatString() final GroupingEngine groupingEngineHistorical = new GroupingEngine( druidProcessingConfig, configSupplier, - bufferPool, groupByResourcesReservationPoolHistorical, TestHelper.makeJsonMapper(), new ObjectMapper(new SmileFactory()), @@ -606,7 +604,6 @@ public String getFormatString() final GroupingEngine groupingEngineHistorical2 = new GroupingEngine( druidProcessingConfig, configSupplier, - bufferPool, groupByResourcesReservationPoolHistorical2, TestHelper.makeJsonMapper(), new ObjectMapper(new SmileFactory()), @@ -615,17 +612,20 @@ public String getFormatString() groupByFactoryBroker = new GroupByQueryRunnerFactory( groupingEngineBroker, - new GroupByQueryQueryToolChest(groupingEngineBroker, groupByResourcesReservationPoolBroker) + new GroupByQueryQueryToolChest(groupingEngineBroker, groupByResourcesReservationPoolBroker), + bufferPool ); groupByFactoryHistorical = new GroupByQueryRunnerFactory( groupingEngineHistorical, - new GroupByQueryQueryToolChest(groupingEngineHistorical, groupByResourcesReservationPoolHistorical) + new GroupByQueryQueryToolChest(groupingEngineHistorical, groupByResourcesReservationPoolHistorical), + bufferPool 
); groupByFactoryHistorical2 = new GroupByQueryRunnerFactory( groupingEngineHistorical2, - new GroupByQueryQueryToolChest(groupingEngineHistorical2, groupByResourcesReservationPoolHistorical2) + new GroupByQueryQueryToolChest(groupingEngineHistorical2, groupByResourcesReservationPoolHistorical2), + bufferPool ); } diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByMultiSegmentTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByMultiSegmentTest.java index d461876c9bdf..9632ceba0b17 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/GroupByMultiSegmentTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/GroupByMultiSegmentTest.java @@ -245,7 +245,6 @@ public String getFormatString() final GroupingEngine groupingEngine = new GroupingEngine( druidProcessingConfig, configSupplier, - bufferPool, groupByResourcesReservationPool, TestHelper.makeJsonMapper(), new ObjectMapper(new SmileFactory()), @@ -254,7 +253,8 @@ public String getFormatString() groupByFactory = new GroupByQueryRunnerFactory( groupingEngine, - new GroupByQueryQueryToolChest(groupingEngine, groupByResourcesReservationPool) + new GroupByQueryQueryToolChest(groupingEngine, groupByResourcesReservationPool), + bufferPool ); } diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryMergeBufferTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryMergeBufferTest.java index 5de602b87720..77f116aa6e40 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryMergeBufferTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryMergeBufferTest.java @@ -130,7 +130,6 @@ private static GroupByQueryRunnerFactory makeQueryRunnerFactory( final GroupingEngine groupingEngine = new GroupingEngine( PROCESSING_CONFIG, configSupplier, - BUFFER_POOL, groupByResourcesReservationPool, TestHelper.makeJsonMapper(), mapper, @@ -140,7 +139,7 @@ private static GroupByQueryRunnerFactory makeQueryRunnerFactory( groupingEngine, groupByResourcesReservationPool ); - return new GroupByQueryRunnerFactory(groupingEngine, toolChest); + return new GroupByQueryRunnerFactory(groupingEngine, toolChest, BUFFER_POOL); } private static final CloseableStupidPool BUFFER_POOL = new CloseableStupidPool<>( diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryQueryToolChestTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryQueryToolChestTest.java index d9aefd5f55e2..15c127c68cbb 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryQueryToolChestTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryQueryToolChestTest.java @@ -28,9 +28,7 @@ import com.google.common.collect.Lists; import org.apache.druid.collections.BlockingPool; import org.apache.druid.collections.DefaultBlockingPool; -import org.apache.druid.collections.NonBlockingPool; import org.apache.druid.collections.SerializablePair; -import org.apache.druid.collections.StupidPool; import org.apache.druid.common.config.NullHandling; import org.apache.druid.data.input.Row; import org.apache.druid.jackson.AggregatorsModule; @@ -1293,10 +1291,6 @@ public String getFormatString() final Supplier bufferSupplier = () -> ByteBuffer.allocateDirect(processingConfig.intermediateComputeSizeBytes()); - final NonBlockingPool bufferPool = new StupidPool<>( - "GroupByQueryEngine-bufferPool", - bufferSupplier - ); final BlockingPool mergeBufferPool 
= new DefaultBlockingPool<>( bufferSupplier, processingConfig.getNumMergeBuffers() @@ -1305,7 +1299,6 @@ public String getFormatString() final GroupingEngine groupingEngine = new GroupingEngine( processingConfig, queryConfigSupplier, - bufferPool, groupByResourcesReservationPool, TestHelper.makeJsonMapper(), new ObjectMapper(new SmileFactory()), diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerFailureTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerFailureTest.java index 69debeb902cb..a3eb5ef724d5 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerFailureTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerFailureTest.java @@ -97,18 +97,19 @@ private static GroupByQueryRunnerFactory makeQueryRunnerFactory( ) { final Supplier configSupplier = Suppliers.ofInstance(config); - GroupByResourcesReservationPool groupByResourcesReservationPool = new GroupByResourcesReservationPool(MERGE_BUFFER_POOL, config); + GroupByResourcesReservationPool groupByResourcesReservationPool = + new GroupByResourcesReservationPool(MERGE_BUFFER_POOL, config); final GroupingEngine groupingEngine = new GroupingEngine( DEFAULT_PROCESSING_CONFIG, configSupplier, - BUFFER_POOL, groupByResourcesReservationPool, TestHelper.makeJsonMapper(), mapper, QueryRunnerTestHelper.NOOP_QUERYWATCHER ); - final GroupByQueryQueryToolChest toolChest = new GroupByQueryQueryToolChest(groupingEngine, groupByResourcesReservationPool); - return new GroupByQueryRunnerFactory(groupingEngine, toolChest); + final GroupByQueryQueryToolChest toolChest = + new GroupByQueryQueryToolChest(groupingEngine, groupByResourcesReservationPool); + return new GroupByQueryRunnerFactory(groupingEngine, toolChest, BUFFER_POOL); } private static final CloseableStupidPool BUFFER_POOL = new CloseableStupidPool<>( diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java index 461cb99b6c7f..cc36d0009570 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java @@ -361,7 +361,6 @@ public static GroupByQueryRunnerFactory makeQueryRunnerFactory( final GroupingEngine groupingEngine = new GroupingEngine( processingConfig, configSupplier, - bufferPools.getProcessingPool(), groupByResourcesReservationPool, mapper, mapper, @@ -373,7 +372,7 @@ public static GroupByQueryRunnerFactory makeQueryRunnerFactory( DefaultGroupByQueryMetricsFactory.instance(), groupByResourcesReservationPool ); - return new GroupByQueryRunnerFactory(groupingEngine, toolChest); + return new GroupByQueryRunnerFactory(groupingEngine, toolChest, bufferPools.getProcessingPool()); } @Parameterized.Parameters(name = "{0}") diff --git a/processing/src/test/java/org/apache/druid/query/groupby/NestedQueryPushDownTest.java b/processing/src/test/java/org/apache/druid/query/groupby/NestedQueryPushDownTest.java index b75616c4593c..a28e782bf654 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/NestedQueryPushDownTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/NestedQueryPushDownTest.java @@ -292,7 +292,6 @@ public String getFormatString() final GroupingEngine engine1 = new GroupingEngine( druidProcessingConfig, configSupplier, - bufferPool, groupByResourcesReservationPool, 
TestHelper.makeJsonMapper(), new ObjectMapper(new SmileFactory()), @@ -301,7 +300,6 @@ public String getFormatString() final GroupingEngine engine2 = new GroupingEngine( druidProcessingConfig, configSupplier, - bufferPool, groupByResourcesReservationPool2, TestHelper.makeJsonMapper(), new ObjectMapper(new SmileFactory()), @@ -310,12 +308,14 @@ public String getFormatString() groupByFactory = new GroupByQueryRunnerFactory( engine1, - new GroupByQueryQueryToolChest(engine1, groupByResourcesReservationPool) + new GroupByQueryQueryToolChest(engine1, groupByResourcesReservationPool), + bufferPool ); groupByFactory2 = new GroupByQueryRunnerFactory( engine2, - new GroupByQueryQueryToolChest(engine2, groupByResourcesReservationPool2) + new GroupByQueryQueryToolChest(engine2, groupByResourcesReservationPool2), + bufferPool ); } diff --git a/processing/src/test/java/org/apache/druid/query/groupby/UnnestGroupByQueryRunnerTest.java b/processing/src/test/java/org/apache/druid/query/groupby/UnnestGroupByQueryRunnerTest.java index 3976a20bd2d0..02bae02eb6f0 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/UnnestGroupByQueryRunnerTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/UnnestGroupByQueryRunnerTest.java @@ -178,7 +178,6 @@ public static GroupByQueryRunnerFactory makeQueryRunnerFactory( final GroupingEngine groupingEngine = new GroupingEngine( processingConfig, configSupplier, - bufferPools.getProcessingPool(), groupByResourcesReservationPool, TestHelper.makeJsonMapper(), mapper, @@ -186,7 +185,7 @@ public static GroupByQueryRunnerFactory makeQueryRunnerFactory( ); final GroupByQueryQueryToolChest toolChest = new GroupByQueryQueryToolChest(groupingEngine, groupByResourcesReservationPool); - return new GroupByQueryRunnerFactory(groupingEngine, toolChest); + return new GroupByQueryRunnerFactory(groupingEngine, toolChest, bufferPools.getProcessingPool()); } @Parameterized.Parameters(name = "{0}") diff --git a/processing/src/test/java/org/apache/druid/segment/CursorHolderPreaggTest.java b/processing/src/test/java/org/apache/druid/segment/CursorHolderPreaggTest.java index 82bba60821ce..52c129379fc5 100644 --- a/processing/src/test/java/org/apache/druid/segment/CursorHolderPreaggTest.java +++ b/processing/src/test/java/org/apache/druid/segment/CursorHolderPreaggTest.java @@ -62,6 +62,7 @@ public class CursorHolderPreaggTest extends InitializedNullHandlingTest { + private CloseableStupidPool bufferPool; private GroupingEngine groupingEngine; private TopNQueryEngine topNQueryEngine; private TimeseriesQueryEngine timeseriesQueryEngine; @@ -75,18 +76,17 @@ public class CursorHolderPreaggTest extends InitializedNullHandlingTest @Before public void setup() { - final CloseableStupidPool pool = closer.closeLater( + bufferPool = closer.closeLater( new CloseableStupidPool<>( "CursorHolderPreaggTest-bufferPool", () -> ByteBuffer.allocate(50000) ) ); - topNQueryEngine = new TopNQueryEngine(pool); - timeseriesQueryEngine = new TimeseriesQueryEngine(pool); + topNQueryEngine = new TopNQueryEngine(bufferPool); + timeseriesQueryEngine = new TimeseriesQueryEngine(bufferPool); groupingEngine = new GroupingEngine( new DruidProcessingConfig(), GroupByQueryConfig::new, - pool, new GroupByResourcesReservationPool( closer.closeLater( new CloseableDefaultBlockingPool<>( @@ -235,6 +235,7 @@ public void testGroupBy() query, cursorFactory, null, + bufferPool, null ); List rows = results.toList(); From d1fd1c6e2c0c39aa0acb5d06863c4635c2e06557 Mon Sep 17 00:00:00 2001 From: Gian Merlino 
Date: Sat, 14 Sep 2024 18:10:56 -0700 Subject: [PATCH 18/47] Fix call to MemoryIntrospector in IndexerControllerContext. (#17066) This was a logical conflict between #17057 and #17048. --- .../org/apache/druid/msq/indexing/IndexerControllerContext.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerControllerContext.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerControllerContext.java index e60f1c5c9622..42808f647426 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerControllerContext.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/IndexerControllerContext.java @@ -206,7 +206,7 @@ public int defaultTargetPartitionsPerWorker() { // Assume tasks are symmetric: workers have the same number of processors available as a controller. // Create one partition per processor per task, for maximum parallelism. - return memoryIntrospector.numProcessorsInJvm(); + return memoryIntrospector.numProcessingThreads(); } /** From 8453b8a69d903c89d1ed730dac46938a5a592f97 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Sun, 15 Sep 2024 01:22:28 -0700 Subject: [PATCH 19/47] MSQ: Improved worker cancellation. (#17046) * MSQ: Improved worker cancellation. Four changes: 1) FrameProcessorExecutor now requires that cancellationIds be registered with "registerCancellationId" prior to being used in "runFully" or "runAllFully". 2) FrameProcessorExecutor gains an "asExecutor" method, which allows that executor to be used as an executor for future callbacks in such a way that respects cancellationId. 3) RunWorkOrder gains a "stop" method, which cancels the current cancellationId and closes the current FrameContext. It blocks until both operations are complete. 4) Fixes a bug in RunAllFullyWidget where "processorManager.result()" was called outside "runAllFullyLock", which could cause it to be called out-of-order with "cleanup()" in case of cancellation or other error. Together, these changes help ensure cancellation does not have races. Once "cancel" is called for a given cancellationId, all existing processors and running callbacks are canceled and exit in an orderly manner. Future processors and callbacks with the same cancellationId are rejected before being executed. * Fix test. * Use execute, which doesn't return, to avoid errorprone complaints. * Fix some style stuff. * Further enhancements. * Fix style. 
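For illustration, a minimal end-to-end sketch of the lifecycle these changes
establish. This snippet is not part of the patch: the class name, executor
name, and println bodies are illustrative, and it assumes RunnableFrameProcessor
(added below) is parameterized on Object.

    import com.google.common.util.concurrent.ListenableFuture;
    import com.google.common.util.concurrent.MoreExecutors;
    import org.apache.druid.frame.processor.FrameProcessorExecutor;
    import org.apache.druid.frame.processor.RunnableFrameProcessor;
    import org.apache.druid.java.util.common.concurrent.Execs;

    public class CancellationIdSketch
    {
      public static void main(String[] args) throws Exception
      {
        final FrameProcessorExecutor exec = new FrameProcessorExecutor(
            MoreExecutors.listeningDecorator(Execs.singleThreaded("sketch-exec"))
        );
        final String cancellationId = "sketch";

        // Registration must now happen before runFully/runAllFully; submitting
        // with an unregistered id returns an already-canceled future.
        exec.registerCancellationId(cancellationId);

        final ListenableFuture<Object> future = exec.runFully(
            new RunnableFrameProcessor(() -> System.out.println("ran")),
            cancellationId
        );

        // asExecutor ties future callbacks to the same cancellationId.
        future.addListener(() -> System.out.println("done"), exec.asExecutor(cancellationId));
        future.get();

        // cancel deregisters the id and blocks until any still-running
        // processors with that id have exited.
        exec.cancel(cancellationId);
        exec.shutdownNow();
      }
    }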
--- .../frame/FrameChannelMergerBenchmark.java | 8 +- .../apache/druid/msq/exec/ControllerImpl.java | 54 ++++--- .../apache/druid/msq/exec/RunWorkOrder.java | 137 +++++++++++++++--- .../druid/msq/exec/RunWorkOrderListener.java | 2 +- .../org/apache/druid/msq/exec/WorkerImpl.java | 45 +++--- .../msq/querykit/FrameProcessorTestBase.java | 9 +- .../processor/FrameProcessorExecutor.java | 83 ++++++++--- .../frame/processor/RunAllFullyWidget.java | 6 +- .../processor/RunnableFrameProcessor.java | 65 +++++++++ .../druid/frame/processor/SuperSorter.java | 4 +- .../processor/FrameProcessorExecutorTest.java | 23 +++ .../processor/RunAllFullyWidgetTest.java | 4 +- 12 files changed, 353 insertions(+), 87 deletions(-) create mode 100644 processing/src/main/java/org/apache/druid/frame/processor/RunnableFrameProcessor.java diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/frame/FrameChannelMergerBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/frame/FrameChannelMergerBenchmark.java index a57b7a116c4e..25f9015de2b9 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/frame/FrameChannelMergerBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/frame/FrameChannelMergerBenchmark.java @@ -21,6 +21,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.util.concurrent.ListenableFuture; +import com.google.common.util.concurrent.ListeningExecutorService; import com.google.common.util.concurrent.MoreExecutors; import org.apache.druid.common.config.NullHandling; import org.apache.druid.common.guava.FutureUtils; @@ -203,6 +204,7 @@ public int getChannelNumber(int rowNumber, int numRows, int numChannels) private final List sortKey = ImmutableList.of(new KeyColumn(KEY, KeyOrder.ASCENDING)); private List> channelFrames; + private ListeningExecutorService innerExec; private FrameProcessorExecutor exec; private List channels; @@ -226,7 +228,7 @@ public void setupTrial() frameReader = FrameReader.create(signature); exec = new FrameProcessorExecutor( - MoreExecutors.listeningDecorator( + innerExec = MoreExecutors.listeningDecorator( Execs.singleThreaded(StringUtils.encodeForFormat(getClass().getSimpleName())) ) ); @@ -335,8 +337,8 @@ public void setupInvocation() throws IOException @TearDown(Level.Trial) public void tearDown() throws Exception { - exec.getExecutorService().shutdownNow(); - if (!exec.getExecutorService().awaitTermination(1, TimeUnit.MINUTES)) { + innerExec.shutdownNow(); + if (!innerExec.awaitTermination(1, TimeUnit.MINUTES)) { throw new ISE("Could not terminate executor after 1 minute"); } } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java index 4b63d85cda7b..6d1ef21abbf2 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java @@ -224,6 +224,7 @@ public class ControllerImpl implements Controller { private static final Logger log = new Logger(ControllerImpl.class); + private static final String RESULT_READER_CANCELLATION_ID = "result-reader"; private final String queryId; private final MSQSpec querySpec; @@ -2189,6 +2190,34 @@ private static void logKernelStatus(final String queryId, final ControllerQueryK } } + /** + * Create a result-reader executor for {@link RunQueryUntilDone#readQueryResults()}. 
+ */ + private static FrameProcessorExecutor createResultReaderExec(final String queryId) + { + return new FrameProcessorExecutor( + MoreExecutors.listeningDecorator( + Execs.singleThreaded(StringUtils.encodeForFormat("msq-result-reader[" + queryId + "]"))) + ); + } + + /** + * Cancel any currently-running work and shut down a result-reader executor, like one created by + * {@link #createResultReaderExec(String)}. + */ + private static void closeResultReaderExec(final FrameProcessorExecutor exec) + { + try { + exec.cancel(RESULT_READER_CANCELLATION_ID); + } + catch (Exception e) { + throw new RuntimeException(e); + } + finally { + exec.shutdownNow(); + } + } + private void stopExternalFetchers() { if (workerSketchFetcher != null) { @@ -2698,12 +2727,9 @@ private void startQueryResultsReader() inputChannelFactory = new WorkerInputChannelFactory(netClient, () -> taskIds); } - final FrameProcessorExecutor resultReaderExec = new FrameProcessorExecutor( - MoreExecutors.listeningDecorator( - Execs.singleThreaded(StringUtils.encodeForFormat("msq-result-reader[" + queryId() + "]"))) - ); + final FrameProcessorExecutor resultReaderExec = createResultReaderExec(queryId()); + resultReaderExec.registerCancellationId(RESULT_READER_CANCELLATION_ID); - final String cancellationId = "results-reader"; ReadableConcatFrameChannel resultsChannel = null; try { @@ -2713,7 +2739,7 @@ private void startQueryResultsReader() inputChannelFactory, () -> ArenaMemoryAllocator.createOnHeap(5_000_000), resultReaderExec, - cancellationId, + RESULT_READER_CANCELLATION_ID, null, MultiStageQueryContext.removeNullBytes(querySpec.getQuery().context()) ); @@ -2747,7 +2773,7 @@ private void startQueryResultsReader() queryListener ); - queryResultsReaderFuture = resultReaderExec.runFully(resultsReader, cancellationId); + queryResultsReaderFuture = resultReaderExec.runFully(resultsReader, RESULT_READER_CANCELLATION_ID); // When results are done being read, kick the main thread. // Important: don't use FutureUtils.futureWithBaggage, because we need queryResultsReaderFuture to resolve @@ -2764,23 +2790,13 @@ private void startQueryResultsReader() e, () -> CloseableUtils.closeAll( finalResultsChannel, - () -> resultReaderExec.getExecutorService().shutdownNow() + () -> closeResultReaderExec(resultReaderExec) ) ); } // Result reader is set up. Register with the query-wide closer. 
- closer.register(() -> { - try { - resultReaderExec.cancel(cancellationId); - } - catch (Exception e) { - throw new RuntimeException(e); - } - finally { - resultReaderExec.getExecutorService().shutdownNow(); - } - }); + closer.register(() -> closeResultReaderExec(resultReaderExec)); } /** diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/RunWorkOrder.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/RunWorkOrder.java index 4d028147af02..3d31d7e2c3ee 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/RunWorkOrder.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/RunWorkOrder.java @@ -21,6 +21,7 @@ import com.google.common.base.Function; import com.google.common.base.Preconditions; +import com.google.common.base.Throwables; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Iterables; @@ -56,6 +57,7 @@ import org.apache.druid.frame.processor.manager.ProcessorManagers; import org.apache.druid.frame.util.DurableStorageUtils; import org.apache.druid.frame.write.FrameWriters; +import org.apache.druid.java.util.common.Either; import org.apache.druid.java.util.common.FileUtils; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.StringUtils; @@ -67,6 +69,8 @@ import org.apache.druid.msq.indexing.CountingOutputChannelFactory; import org.apache.druid.msq.indexing.InputChannelFactory; import org.apache.druid.msq.indexing.InputChannelsImpl; +import org.apache.druid.msq.indexing.error.CanceledFault; +import org.apache.druid.msq.indexing.error.MSQException; import org.apache.druid.msq.indexing.processor.KeyStatisticsCollectionProcessor; import org.apache.druid.msq.input.InputSlice; import org.apache.druid.msq.input.InputSliceReader; @@ -94,7 +98,6 @@ import org.apache.druid.msq.shuffle.output.DurableStorageOutputChannelFactory; import org.apache.druid.msq.statistics.ClusterByStatisticsCollector; import org.apache.druid.msq.statistics.ClusterByStatisticsSnapshot; -import org.apache.druid.utils.CloseableUtils; import org.checkerframework.checker.nullness.qual.MonotonicNonNull; import javax.annotation.Nullable; @@ -104,7 +107,8 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Collectors; /** @@ -112,7 +116,29 @@ */ public class RunWorkOrder { - private final String controllerTaskId; + enum State + { + /** + * Initial state. Must be in this state to call {@link #startAsync()}. + */ + INIT, + + /** + * State entered upon calling {@link #startAsync()}. + */ + STARTED, + + /** + * State entered upon calling {@link #stop()}. + */ + STOPPING, + + /** + * State entered when a call to {@link #stop()} concludes. 
+ */ + STOPPED + } + private final WorkOrder workOrder; private final InputChannelFactory inputChannelFactory; private final CounterTracker counterTracker; @@ -125,7 +151,9 @@ public class RunWorkOrder private final boolean reindex; private final boolean removeNullBytes; private final ByteTracker intermediateSuperSorterLocalStorageTracker; - private final AtomicBoolean started = new AtomicBoolean(); + private final AtomicReference state = new AtomicReference<>(State.INIT); + private final CountDownLatch stopLatch = new CountDownLatch(1); + private final AtomicReference> resultForListener = new AtomicReference<>(); @MonotonicNonNull private InputSliceReader inputSliceReader; @@ -141,7 +169,6 @@ public class RunWorkOrder private ListenableFuture stageOutputChannelsFuture; public RunWorkOrder( - final String controllerTaskId, final WorkOrder workOrder, final InputChannelFactory inputChannelFactory, final CounterTracker counterTracker, @@ -154,7 +181,6 @@ public RunWorkOrder( final boolean removeNullBytes ) { - this.controllerTaskId = controllerTaskId; this.workOrder = workOrder; this.inputChannelFactory = inputChannelFactory; this.counterTracker = counterTracker; @@ -180,15 +206,16 @@ public RunWorkOrder( * Execution proceeds asynchronously after this method returns. The {@link RunWorkOrderListener} passed to the * constructor of this instance can be used to track progress. */ - public void start() throws IOException + public void startAsync() { - if (started.getAndSet(true)) { - throw new ISE("Already started"); + if (!state.compareAndSet(State.INIT, State.STARTED)) { + throw new ISE("Cannot start from state[%s]", state); } final StageDefinition stageDef = workOrder.getStageDefinition(); try { + exec.registerCancellationId(cancellationId); makeInputSliceReader(); makeWorkOutputChannelFactory(); makeShuffleOutputChannelFactory(); @@ -205,16 +232,78 @@ public void start() throws IOException setUpCompletionCallbacks(); } catch (Throwable t) { - // If start() has problems, cancel anything that was already kicked off, and close the FrameContext. + stopUnchecked(); + } + } + + /** + * Stops an execution that was previously initiated through {@link #startAsync()} and closes the {@link FrameContext}. + * May be called to cancel execution. Must also be called after successful execution in order to ensure that resources + * are all properly cleaned up. + * + * Blocks until execution is fully stopped. + */ + public void stop() throws InterruptedException + { + if (state.compareAndSet(State.INIT, State.STOPPING) + || state.compareAndSet(State.STARTED, State.STOPPING)) { + // Initiate stopping. + Throwable e = null; + try { exec.cancel(cancellationId); } - catch (Throwable t2) { - t.addSuppressed(t2); + catch (Throwable e2) { + e = e2; } - CloseableUtils.closeAndSuppressExceptions(frameContext, t::addSuppressed); - throw t; + try { + frameContext.close(); + } + catch (Throwable e2) { + if (e == null) { + e = e2; + } else { + e.addSuppressed(e2); + } + } + + try { + // notifyListener will ignore this cancellation error if work has already succeeded. + notifyListener(Either.error(new MSQException(CanceledFault.instance()))); + } + catch (Throwable e2) { + if (e == null) { + e = e2; + } else { + e.addSuppressed(e2); + } + } + + stopLatch.countDown(); + + if (e != null) { + Throwables.throwIfInstanceOf(e, InterruptedException.class); + Throwables.throwIfUnchecked(e); + throw new RuntimeException(e); + } + } + + stopLatch.await(); + } + + /** + * Calls {@link #stop()}. 
If the call to {@link #stop()} throws {@link InterruptedException}, this method sets + * the interrupt flag and throws an unchecked exception. + */ + public void stopUnchecked() + { + try { + stop(); + } + catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException(e); } } @@ -459,19 +548,33 @@ public void onSuccess(final List workerResultAndOutputChannelsResolved) writeDurableStorageSuccessFile(); } - listener.onSuccess(resultObject); + notifyListener(Either.value(resultObject)); } @Override public void onFailure(final Throwable t) { - listener.onFailure(t); + notifyListener(Either.error(t)); } }, Execs.directExecutor() ); } + /** + * Notify {@link RunWorkOrderListener} that the job is done, if not already notified. + */ + private void notifyListener(final Either result) + { + if (resultForListener.compareAndSet(null, result)) { + if (result.isError()) { + listener.onFailure(result.error()); + } else { + listener.onSuccess(result.valueOrThrow()); + } + } + } + /** * Write {@link DurableStorageUtils#SUCCESS_MARKER_FILENAME} for a particular stage, if durable storage is enabled. */ @@ -561,7 +664,7 @@ private DurableStorageOutputChannelFactory makeDurableStorageOutputChannelFactor ) { return DurableStorageOutputChannelFactory.createStandardImplementation( - controllerTaskId, + workerContext.queryId(), workOrder.getWorkerNumber(), workOrder.getStageNumber(), workerContext.workerId(), diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/RunWorkOrderListener.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/RunWorkOrderListener.java index 19c3c6570fe9..8bffd6f8179f 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/RunWorkOrderListener.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/RunWorkOrderListener.java @@ -25,7 +25,7 @@ import javax.annotation.Nullable; /** - * Listener for various things that may happen during execution of {@link RunWorkOrder#start()}. Listener methods are + * Listener for various things that may happen during execution of {@link RunWorkOrder#startAsync()}. Listener methods are * fired in processing threads, so they must be thread-safe, and it is important that they run quickly. */ public interface RunWorkOrderListener diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerImpl.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerImpl.java index 5d9f9b9db541..74e3850c6e96 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerImpl.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerImpl.java @@ -367,28 +367,19 @@ private void handleNewWorkOrder( final WorkerStageKernel kernel = kernelHolder.kernel; final WorkOrder workOrder = kernel.getWorkOrder(); final StageDefinition stageDefinition = workOrder.getStageDefinition(); - final String cancellationId = cancellationIdFor(stageDefinition.getId()); + final String cancellationId = cancellationIdFor(stageDefinition.getId(), workOrder.getWorkerNumber()); log.info( - "Processing work order for stage[%s]%s", + "Starting work order for stage[%s], workerNumber[%d]%s", stageDefinition.getId(), + workOrder.getWorkerNumber(), (log.isDebugEnabled() ? 
StringUtils.format(", payload[%s]", context.jsonMapper().writeValueAsString(workOrder)) : "") ); - final FrameContext frameContext = kernelHolder.processorCloser.register(context.frameContext(workOrder)); - kernelHolder.processorCloser.register(() -> { - try { - workerExec.cancel(cancellationId); - } - catch (InterruptedException e) { - // Strange that cancellation would itself be interrupted. Log and suppress. - log.warn(e, "Cancellation interrupted for stage[%s]", stageDefinition.getId()); - Thread.currentThread().interrupt(); - } - }); + final FrameContext frameContext = context.frameContext(workOrder); - // Set up cleanup functions for this work order. + // Set up resultsCloser (called when we are done reading results). kernelHolder.resultsCloser.register(() -> FileUtils.deleteDirectory(frameContext.tempDir())); kernelHolder.resultsCloser.register(() -> removeStageOutputChannels(stageDefinition.getId())); @@ -397,13 +388,9 @@ private void handleNewWorkOrder( final InputChannelFactory inputChannelFactory = makeBaseInputChannelFactory(workOrder, controllerClient, kernelHolder.processorCloser); - // Start working on this stage immediately. - kernel.startReading(); - final QueryContext queryContext = task != null ? QueryContext.of(task.getContext()) : QueryContext.empty(); final boolean includeAllCounters = context.includeAllCounters(); final RunWorkOrder runWorkOrder = new RunWorkOrder( - task.getControllerTaskId(), workOrder, inputChannelFactory, stageCounters.computeIfAbsent( @@ -419,7 +406,12 @@ private void handleNewWorkOrder( MultiStageQueryContext.removeNullBytes(queryContext) ); - runWorkOrder.start(); + // Set up processorCloser (called when processing is done). + kernelHolder.processorCloser.register(runWorkOrder::stopUnchecked); + + // Start working on this stage immediately. + kernel.startReading(); + runWorkOrder.startAsync(); kernelHolder.partitionBoundariesFuture = runWorkOrder.getStagePartitionBoundariesFuture(); } @@ -987,9 +979,9 @@ private StageOutputHolder getOrCreateStageOutputHolder(final StageId stageId, fi /** * Returns cancellation ID for a particular stage, to be used in {@link FrameProcessorExecutor#cancel(String)}. */ - private static String cancellationIdFor(final StageId stageId) + private static String cancellationIdFor(final StageId stageId, final int workerNumber) { - return stageId.toString(); + return StringUtils.format("%s_%s", stageId, workerNumber); } /** @@ -1244,9 +1236,18 @@ public void setDone() private static class KernelHolder { private final WorkerStageKernel kernel; + private SettableFuture partitionBoundariesFuture; + + /** + * Closer for processing. This is closed when all processing for a stage has completed. + */ private final Closer processorCloser; + + /** + * Closer for results. This is closed when results for a stage are no longer needed. Always closed + * *after* {@link #processorCloser} is done closing. 
+ */ private final Closer resultsCloser; - private SettableFuture partitionBoundariesFuture; public KernelHolder(WorkerStageKernel kernel) { diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/querykit/FrameProcessorTestBase.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/querykit/FrameProcessorTestBase.java index 439aa148a84c..cde2b0ea4e9d 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/querykit/FrameProcessorTestBase.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/querykit/FrameProcessorTestBase.java @@ -20,6 +20,7 @@ package org.apache.druid.msq.querykit; import com.google.common.collect.Iterables; +import com.google.common.util.concurrent.ListeningExecutorService; import com.google.common.util.concurrent.MoreExecutors; import org.apache.druid.frame.Frame; import org.apache.druid.frame.FrameType; @@ -47,19 +48,21 @@ public class FrameProcessorTestBase extends InitializedNullHandlingTest { protected static final StagePartition STAGE_PARTITION = new StagePartition(new StageId("q", 0), 0); + private ListeningExecutorService innerExec; protected FrameProcessorExecutor exec; @Before public void setUp() { - exec = new FrameProcessorExecutor(MoreExecutors.listeningDecorator(Execs.singleThreaded("test-exec"))); + innerExec = MoreExecutors.listeningDecorator(Execs.singleThreaded("test-exec")); + exec = new FrameProcessorExecutor(innerExec); } @After public void tearDown() throws Exception { - exec.getExecutorService().shutdownNow(); - exec.getExecutorService().awaitTermination(10, TimeUnit.MINUTES); + innerExec.shutdownNow(); + innerExec.awaitTermination(10, TimeUnit.MINUTES); } protected ReadableInput makeChannelFromCursorFactory( diff --git a/processing/src/main/java/org/apache/druid/frame/processor/FrameProcessorExecutor.java b/processing/src/main/java/org/apache/druid/frame/processor/FrameProcessorExecutor.java index c0f79d30e581..f255fbe13a6b 100644 --- a/processing/src/main/java/org/apache/druid/frame/processor/FrameProcessorExecutor.java +++ b/processing/src/main/java/org/apache/druid/frame/processor/FrameProcessorExecutor.java @@ -46,12 +46,14 @@ import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.IdentityHashMap; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.concurrent.CancellationException; +import java.util.concurrent.Executor; import java.util.concurrent.ExecutorService; import java.util.stream.Collectors; @@ -61,7 +63,6 @@ * If you want single threaded execution, use {@code Execs.singleThreaded()}. It is not a good idea to use this with a * same-thread executor like {@code Execs.directExecutor()}, because it will lead to deep call stacks. */ -@SuppressWarnings("CheckReturnValue") public class FrameProcessorExecutor { private static final Logger log = new Logger(FrameProcessorExecutor.class); @@ -70,6 +71,10 @@ public class FrameProcessorExecutor private final Object lock = new Object(); + // Currently-active cancellationIds. + @GuardedBy("lock") + private final Set activeCancellationIds = new HashSet<>(); + // Futures that are active and therefore cancelable. // Does not include return futures: those are in cancelableReturnFutures. 
@GuardedBy("lock") @@ -96,19 +101,12 @@ public FrameProcessorExecutor(final ListeningExecutorService exec) this.exec = exec; } - /** - * Returns the underlying executor service used by this executor. - */ - public ListeningExecutorService getExecutorService() - { - return exec; - } - /** * Runs a processor until it is done, and returns a future that resolves when execution is complete. * - * If "cancellationId" is provided, it can be used with the {@link #cancel(String)} method to cancel all processors - * currently running with the same cancellationId. + * If "cancellationId" is provided, it must have previously been registered with {@link #registerCancellationId}. + * Then, it can be used with the {@link #cancel(String)} method to cancel all processors with that + * same cancellationId. */ public ListenableFuture runFully(final FrameProcessor processor, @Nullable final String cancellationId) { @@ -116,6 +114,11 @@ public ListenableFuture runFully(final FrameProcessor processor, @Null final List outputChannels = processor.outputChannels(); final SettableFuture finished = registerCancelableFuture(SettableFuture.create(), true, cancellationId); + if (finished.isDone()) { + // Possibly due to starting life out being canceled. + return finished; + } + class ExecutorRunnable implements Runnable { private final AwaitAnyWidget awaitAnyWidget = new AwaitAnyWidget(inputChannels); @@ -152,7 +155,7 @@ public void run() final IntSet await = result.awaitSet(); if (await.isEmpty()) { - exec.submit(ExecutorRunnable.this); + exec.execute(ExecutorRunnable.this); } else if (result.isAwaitAll() || await.size() == 1) { final List> readabilityFutures = new ArrayList<>(); @@ -164,7 +167,7 @@ public void run() } if (readabilityFutures.isEmpty()) { - exec.submit(ExecutorRunnable.this); + exec.execute(ExecutorRunnable.this); } else { runProcessorAfterFutureResolves(Futures.allAsList(readabilityFutures)); } @@ -272,7 +275,7 @@ private void runProcessorAfterFutureResolves(final ListenableFuture futur public void onSuccess(final V ignored) { try { - exec.submit(ExecutorRunnable.this); + exec.execute(ExecutorRunnable.this); } catch (Throwable e) { fail(e); @@ -390,7 +393,7 @@ void doProcessorCleanup() throws IOException logProcessorStatusString(processor, finished, null); registerCancelableProcessor(processor, cancellationId); - exec.submit(runnable); + exec.execute(runnable); return finished; } @@ -423,8 +426,20 @@ public ListenableFuture runAllFully( } /** - * Cancels all processors associated with a given cancellationId. Waits for the processors to exit before - * returning. + * Registers a cancellationId, so it can be provided to {@link #runFully} or {@link #runAllFully}. To avoid the + * set of active cancellationIds growing without bound, callers must also call {@link #cancel(String)} on the + * same cancellationId when done using it. + */ + public void registerCancellationId(final String cancellationId) + { + synchronized (lock) { + activeCancellationIds.add(cancellationId); + } + } + + /** + * Deregisters a cancellationId and cancels any currently-running processors associated with that cancellationId. + * Waits for any canceled processors to exit before returning. 
*/ public void cancel(final String cancellationId) throws InterruptedException { @@ -435,6 +450,7 @@ public void cancel(final String cancellationId) throws InterruptedException final Set> returnFuturesToCancel; synchronized (lock) { + activeCancellationIds.remove(cancellationId); futuresToCancel = cancelableFutures.removeAll(cancellationId); processorsToCancel = cancelableProcessors.removeAll(cancellationId); returnFuturesToCancel = cancelableReturnFutures.removeAll(cancellationId); @@ -457,6 +473,33 @@ public void cancel(final String cancellationId) throws InterruptedException } } + /** + * Returns an {@link Executor} that executes using the same underlying service, and that is also connected to + * cancellation through {@link #cancel(String)}. + * + * @param cancellationId cancellation ID for the executor + */ + public Executor asExecutor(@Nullable final String cancellationId) + { + return command -> runFully(new RunnableFrameProcessor(command), cancellationId); + } + + /** + * Shuts down the underlying executor service immediately. + */ + public void shutdownNow() + { + exec.shutdownNow(); + } + + /** + * Returns the underlying executor service used by this executor. + */ + ListeningExecutorService getExecutorService() + { + return exec; + } + /** * Register a future that will be canceled when the provided {@code cancellationId} is canceled. * @@ -472,6 +515,12 @@ > FutureType registerCancelableFuture( { if (cancellationId != null) { synchronized (lock) { + if (!activeCancellationIds.contains(cancellationId)) { + // Cancel and return immediately. + future.cancel(true); + return future; + } + final SetMultimap> map = isReturn ? cancelableReturnFutures : cancelableFutures; map.put(cancellationId, future); future.addListener( diff --git a/processing/src/main/java/org/apache/druid/frame/processor/RunAllFullyWidget.java b/processing/src/main/java/org/apache/druid/frame/processor/RunAllFullyWidget.java index a1a1c0f87120..7f79a319c280 100644 --- a/processing/src/main/java/org/apache/druid/frame/processor/RunAllFullyWidget.java +++ b/processing/src/main/java/org/apache/druid/frame/processor/RunAllFullyWidget.java @@ -306,9 +306,11 @@ public void onSuccess(T result) } if (isDone) { - finished.compareAndSet(null, Either.value(processorManager.result())); - synchronized (runAllFullyLock) { + if (finished.get() == null) { + finished.compareAndSet(null, Either.value(processorManager.result())); + } + cleanupIfNoMoreProcessors(); } } else { diff --git a/processing/src/main/java/org/apache/druid/frame/processor/RunnableFrameProcessor.java b/processing/src/main/java/org/apache/druid/frame/processor/RunnableFrameProcessor.java new file mode 100644 index 000000000000..697879490e1e --- /dev/null +++ b/processing/src/main/java/org/apache/druid/frame/processor/RunnableFrameProcessor.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.frame.processor;
+
+import it.unimi.dsi.fastutil.ints.IntSet;
+import org.apache.druid.frame.channel.ReadableFrameChannel;
+import org.apache.druid.frame.channel.WritableFrameChannel;
+
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Frame processor that simply runs a {@link Runnable}, once.
+ */
+public class RunnableFrameProcessor implements FrameProcessor<Object>
+{
+  private final Runnable runnable;
+
+  public RunnableFrameProcessor(Runnable runnable)
+  {
+    this.runnable = runnable;
+  }
+
+  @Override
+  public List<ReadableFrameChannel> inputChannels()
+  {
+    return Collections.emptyList();
+  }
+
+  @Override
+  public List<WritableFrameChannel> outputChannels()
+  {
+    return Collections.emptyList();
+  }
+
+  @Override
+  public ReturnOrAwait<Object> runIncrementally(IntSet readableInputs)
+  {
+    runnable.run();
+    return ReturnOrAwait.returnObject(null);
+  }
+
+  @Override
+  public void cleanup()
+  {
+    // Nothing to do.
+  }
+}
diff --git a/processing/src/main/java/org/apache/druid/frame/processor/SuperSorter.java b/processing/src/main/java/org/apache/druid/frame/processor/SuperSorter.java
index e30f2e77b02b..b8b74a2b797e 100644
--- a/processing/src/main/java/org/apache/druid/frame/processor/SuperSorter.java
+++ b/processing/src/main/java/org/apache/druid/frame/processor/SuperSorter.java
@@ -297,7 +297,7 @@ public ListenableFuture run()
             setAllDoneIfPossible();
           }
         },
-        exec.getExecutorService()
+        exec.asExecutor(cancellationId)
     );
 
     return FutureUtils.futureWithBaggage(
@@ -813,7 +813,7 @@ public void onFailure(Throwable t)
         },
         // Must run in exec, instead of in the same thread, to avoid running callback immediately if the
         // worker happens to finish super-quickly.
-        exec.getExecutorService()
+        exec.asExecutor(cancellationId)
     );
   }
 
diff --git a/processing/src/test/java/org/apache/druid/frame/processor/FrameProcessorExecutorTest.java b/processing/src/test/java/org/apache/druid/frame/processor/FrameProcessorExecutorTest.java
index 0f50624078b4..4ed2c610525e 100644
--- a/processing/src/test/java/org/apache/druid/frame/processor/FrameProcessorExecutorTest.java
+++ b/processing/src/test/java/org/apache/druid/frame/processor/FrameProcessorExecutorTest.java
@@ -222,6 +222,7 @@ public void test_registerCancelableFuture() throws InterruptedException
     final SettableFuture future = SettableFuture.create();
     final String cancellationId = "xyzzy";
+    exec.registerCancellationId(cancellationId);
 
     Assert.assertSame(future, exec.registerCancelableFuture(future, false, cancellationId));
     exec.cancel(cancellationId);
 
@@ -236,6 +237,8 @@ public void test_cancel_sleepy() throws Exception
   {
     final SleepyFrameProcessor processor = new SleepyFrameProcessor();
     final String cancellationId = "xyzzy";
+
+    exec.registerCancellationId(cancellationId);
     final ListenableFuture future = exec.runFully(processor, cancellationId);
 
     processor.awaitRun();
@@ -254,6 +257,8 @@ public void test_futureCancel_sleepy() throws Exception
   {
     final SleepyFrameProcessor processor = new SleepyFrameProcessor();
     final String cancellationId = "xyzzy";
+
+    exec.registerCancellationId(cancellationId);
     final ListenableFuture future = exec.runFully(processor, cancellationId);
 
     processor.awaitRun();
@@ -314,6 +319,8 @@ public void test_cancel_concurrency() throws Exception
 
     // Start up all systems at once.
for (final String systemId : systemGeneratorsMap.keySet()) { + exec.registerCancellationId(systemId); + for (InfiniteFrameProcessor generator : systemGeneratorsMap.get(systemId)) { processorFutureMap.put(generator, exec.runFully(generator, systemId)); } @@ -391,6 +398,22 @@ public void test_cancel_nonexistentCancellationId() throws InterruptedException // Just making sure no error is thrown when we refer to a nonexistent cancellationId. exec.cancel("nonexistent"); } + + @Test + public void test_runFully_nonexistentCancellationId() + { + final SleepyFrameProcessor processor = new SleepyFrameProcessor(); + final String cancellationId = "xyzzy"; + + // Don't registerCancellationId(cancellationId). + final ListenableFuture future = exec.runFully(processor, cancellationId); + + // Future should be immediately canceled, without running the processor. + Assert.assertTrue(future.isDone()); + Assert.assertTrue(future.isCancelled()); + Assert.assertFalse(processor.didGetInterrupt()); + Assert.assertFalse(processor.didCleanup()); + } } public abstract static class BaseFrameProcessorExecutorTestSuite extends InitializedNullHandlingTest diff --git a/processing/src/test/java/org/apache/druid/frame/processor/RunAllFullyWidgetTest.java b/processing/src/test/java/org/apache/druid/frame/processor/RunAllFullyWidgetTest.java index 7cd1e980428e..d0ae5a986a00 100644 --- a/processing/src/test/java/org/apache/druid/frame/processor/RunAllFullyWidgetTest.java +++ b/processing/src/test/java/org/apache/druid/frame/processor/RunAllFullyWidgetTest.java @@ -409,6 +409,8 @@ public void test_runAllFully_futureCancel() throws InterruptedException .mapToObj(i -> new SleepyFrameProcessor()) .collect(Collectors.toList()); + final String cancellationId = "xyzzy"; + exec.registerCancellationId(cancellationId); final ListenableFuture future = exec.runAllFully( possiblyDelay( ensureClose( @@ -418,7 +420,7 @@ public void test_runAllFully_futureCancel() throws InterruptedException ), maxOutstandingProcessors, bouncer, - "xyzzy" + cancellationId ); for (int i = 0; i < expectedRunningProcessors; i++) { From d3c071c9d7a4f3dd743315ea1ea18ab0d5b292f8 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Sun, 15 Sep 2024 02:10:58 -0700 Subject: [PATCH 20/47] BaseWorkerClientImpl: Don't attempt to recover from a closed channel. (#17052) * BaseWorkerClientImpl: Don't attempt to recover from a closed channel. This patch introduces an exception type "ChannelClosedForWritesException", which allows the BaseWorkerClientImpl to avoid retrying when the local channel has been closed. This can happen in cases of cancellation. * Add some test coverage. * wip * Add test coverage. * Style. 
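For illustration, a minimal sketch of the failure mode this patch makes
non-recoverable. This snippet is not part of the patch: the channel id, class
name, and payload are illustrative, and it assumes addChunk throws as described
in the new exception's javadoc.

    import org.apache.druid.frame.channel.ChannelClosedForWritesException;
    import org.apache.druid.frame.channel.ReadableByteChunksFrameChannel;

    public class ClosedChannelSketch
    {
      public static void main(String[] args)
      {
        final ReadableByteChunksFrameChannel channel =
            ReadableByteChunksFrameChannel.create("sketch", false);

        // Reader-side close: the channel no longer accepts writes.
        channel.close();

        try {
          channel.addChunk(new byte[]{0x01});
        }
        catch (ChannelClosedForWritesException e) {
          // With this patch, BaseWorkerClientImpl treats this exception on the
          // fetch path as fatal and fails the fetch future instead of retrying.
          System.out.println(e.getMessage());
        }
      }
    }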
--- .../druid/msq/rpc/BaseWorkerClientImpl.java | 19 +- .../msq/rpc/BaseWorkerClientImplTest.java | 383 ++++++++++++++++++ .../ChannelClosedForWritesException.java | 33 ++ .../ReadableByteChunksFrameChannel.java | 4 +- .../ReadableByteChunksFrameChannelTest.java | 13 + .../apache/druid/rpc/MockServiceClient.java | 4 +- 6 files changed, 448 insertions(+), 8 deletions(-) create mode 100644 extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/rpc/BaseWorkerClientImplTest.java create mode 100644 processing/src/main/java/org/apache/druid/frame/channel/ChannelClosedForWritesException.java diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/rpc/BaseWorkerClientImpl.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/rpc/BaseWorkerClientImpl.java index fd1a0323d0fb..d6e7d412acad 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/rpc/BaseWorkerClientImpl.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/rpc/BaseWorkerClientImpl.java @@ -26,6 +26,7 @@ import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.SettableFuture; import org.apache.druid.common.guava.FutureUtils; +import org.apache.druid.frame.channel.ChannelClosedForWritesException; import org.apache.druid.frame.channel.ReadableByteChunksFrameChannel; import org.apache.druid.frame.file.FrameFileHttpResponseHandler; import org.apache.druid.frame.file.FrameFilePartialFetch; @@ -219,12 +220,18 @@ public ListenableFuture fetchChannelData( public void onSuccess(FrameFilePartialFetch partialFetch) { if (partialFetch.isExceptionCaught()) { - // Exception while reading channel. Recoverable. - log.noStackTrace().info( - partialFetch.getExceptionCaught(), - "Encountered exception while reading channel [%s]", - channel.getId() - ); + if (partialFetch.getExceptionCaught() instanceof ChannelClosedForWritesException) { + // Channel was closed. Stop trying. + retVal.setException(partialFetch.getExceptionCaught()); + return; + } else { + // Exception while reading channel. Recoverable. + log.noStackTrace().warn( + partialFetch.getExceptionCaught(), + "Attempting recovery after exception while reading channel[%s]", + channel.getId() + ); + } } // Empty fetch means this is the last fetch for the channel. diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/rpc/BaseWorkerClientImplTest.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/rpc/BaseWorkerClientImplTest.java new file mode 100644 index 000000000000..dd8633c886f6 --- /dev/null +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/rpc/BaseWorkerClientImplTest.java @@ -0,0 +1,383 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.msq.rpc; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.ImmutableMap; +import it.unimi.dsi.fastutil.bytes.ByteArrays; +import org.apache.druid.common.guava.FutureUtils; +import org.apache.druid.frame.Frame; +import org.apache.druid.frame.FrameType; +import org.apache.druid.frame.channel.ByteTracker; +import org.apache.druid.frame.channel.ChannelClosedForWritesException; +import org.apache.druid.frame.channel.ReadableByteChunksFrameChannel; +import org.apache.druid.frame.channel.ReadableFrameChannel; +import org.apache.druid.frame.file.FrameFile; +import org.apache.druid.frame.file.FrameFileHttpResponseHandler; +import org.apache.druid.frame.file.FrameFileWriter; +import org.apache.druid.frame.read.FrameReader; +import org.apache.druid.frame.testutil.FrameSequenceBuilder; +import org.apache.druid.frame.testutil.FrameTestUtil; +import org.apache.druid.jackson.DefaultObjectMapper; +import org.apache.druid.java.util.common.ISE; +import org.apache.druid.java.util.common.StringUtils; +import org.apache.druid.java.util.common.concurrent.Execs; +import org.apache.druid.java.util.common.guava.Sequence; +import org.apache.druid.java.util.common.guava.Sequences; +import org.apache.druid.msq.exec.WorkerClient; +import org.apache.druid.msq.kernel.StageId; +import org.apache.druid.rpc.MockServiceClient; +import org.apache.druid.rpc.RequestBuilder; +import org.apache.druid.rpc.ServiceClient; +import org.apache.druid.segment.QueryableIndexCursorFactory; +import org.apache.druid.segment.TestIndex; +import org.apache.druid.testing.InitializedNullHandlingTest; +import org.apache.druid.utils.CloseableUtils; +import org.hamcrest.CoreMatchers; +import org.hamcrest.MatcherAssert; +import org.jboss.netty.handler.codec.http.HttpMethod; +import org.jboss.netty.handler.codec.http.HttpResponseStatus; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.internal.matchers.ThrowableMessageMatcher; + +import javax.ws.rs.core.HttpHeaders; +import javax.ws.rs.core.MediaType; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.channels.Channels; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +public class BaseWorkerClientImplTest extends InitializedNullHandlingTest +{ + private static final String WORKER_ID = "w0"; + /** + * Bytes for a {@link FrameFile} with no frames. (Not an empty array.) + */ + private static byte[] NIL_FILE_BYTES; + /** + * Bytes for a {@link FrameFile} holding {@link TestIndex#getMMappedTestIndex()}. 
+ */ + private static byte[] FILE_BYTES; + private static FrameReader FRAME_READER; + + private ObjectMapper jsonMapper; + private MockServiceClient workerServiceClient; + private WorkerClient workerClient; + private ExecutorService exec; + + @BeforeClass + public static void setupClass() + { + final QueryableIndexCursorFactory cursorFactory = new QueryableIndexCursorFactory(TestIndex.getMMappedTestIndex()); + + NIL_FILE_BYTES = toFileBytes(Sequences.empty()); + FILE_BYTES = toFileBytes( + FrameSequenceBuilder.fromCursorFactory(cursorFactory) + .frameType(FrameType.COLUMNAR) + .maxRowsPerFrame(10) + .frames() + ); + FRAME_READER = FrameReader.create(cursorFactory.getRowSignature()); + } + + @AfterClass + public static void afterClass() + { + NIL_FILE_BYTES = null; + FILE_BYTES = null; + FRAME_READER = null; + } + + @Before + public void setup() + { + jsonMapper = new DefaultObjectMapper(); + workerServiceClient = new MockServiceClient(); + workerClient = new TestWorkerClient(jsonMapper, workerServiceClient); + exec = Execs.singleThreaded(StringUtils.encodeForFormat("exec-for-" + getClass().getName()) + "-%s"); + } + + @After + public void tearDown() throws InterruptedException + { + workerServiceClient.verify(); + exec.shutdownNow(); + if (!exec.awaitTermination(1, TimeUnit.MINUTES)) { + throw new ISE("Timed out waiting for exec to finish"); + } + } + + @Test + public void test_fetchChannelData_empty() throws Exception + { + workerServiceClient.expectAndRespond( + new RequestBuilder(HttpMethod.GET, "/channels/xyz/1/2?offset=0") + .header(HttpHeaders.ACCEPT_ENCODING, "identity"), + HttpResponseStatus.OK, + fetchChannelDataResponseHeaders(false), + NIL_FILE_BYTES + ).expectAndRespond( + new RequestBuilder(HttpMethod.GET, "/channels/xyz/1/2?offset=" + NIL_FILE_BYTES.length) + .header(HttpHeaders.ACCEPT_ENCODING, "identity"), + HttpResponseStatus.OK, + fetchChannelDataResponseHeaders(true), + ByteArrays.EMPTY_ARRAY + ); + + // Perform the test. + final StageId stageId = new StageId("xyz", 1); + final ReadableByteChunksFrameChannel channel = ReadableByteChunksFrameChannel.create("testChannel", false); + final Future>> framesFuture = readChannelAsync(channel); + + Assert.assertFalse(workerClient.fetchChannelData(WORKER_ID, stageId, 2, 0, channel).get()); + Assert.assertTrue(workerClient.fetchChannelData(WORKER_ID, stageId, 2, NIL_FILE_BYTES.length, channel).get()); + channel.doneWriting(); // Caller is expected to call doneWriting after fetchChannelData returns true. + + Assert.assertEquals( + 0, + framesFuture.get().size() + ); + } + + @Test + public void test_fetchChannelData_empty_intoClosedChannel() + { + workerServiceClient.expectAndRespond( + new RequestBuilder(HttpMethod.GET, "/channels/xyz/1/2?offset=0") + .header(HttpHeaders.ACCEPT_ENCODING, "identity"), + HttpResponseStatus.OK, + fetchChannelDataResponseHeaders(false), + NIL_FILE_BYTES + ); + + // Perform the test. + final StageId stageId = new StageId("xyz", 1); + final ReadableByteChunksFrameChannel channel = ReadableByteChunksFrameChannel.create("testChannel", false); + channel.close(); // ReadableFrameChannel's close() method. 
+ + final ExecutionException e = Assert.assertThrows( + ExecutionException.class, + () -> workerClient.fetchChannelData(WORKER_ID, stageId, 2, 0, channel).get() + ); + + MatcherAssert.assertThat( + e.getCause(), + CoreMatchers.instanceOf(ChannelClosedForWritesException.class) + ); + } + + @Test + public void test_fetchChannelData_empty_retry500() throws Exception + { + workerServiceClient.expectAndRespond( + new RequestBuilder(HttpMethod.GET, "/channels/xyz/1/2?offset=0") + .header(HttpHeaders.ACCEPT_ENCODING, "identity"), + HttpResponseStatus.INTERNAL_SERVER_ERROR, + ImmutableMap.of(), + ByteArrays.EMPTY_ARRAY + ).expectAndRespond( + new RequestBuilder(HttpMethod.GET, "/channels/xyz/1/2?offset=0") + .header(HttpHeaders.ACCEPT_ENCODING, "identity"), + HttpResponseStatus.OK, + fetchChannelDataResponseHeaders(false), + NIL_FILE_BYTES + ).expectAndRespond( + new RequestBuilder(HttpMethod.GET, "/channels/xyz/1/2?offset=" + NIL_FILE_BYTES.length) + .header(HttpHeaders.ACCEPT_ENCODING, "identity"), + HttpResponseStatus.OK, + fetchChannelDataResponseHeaders(true), + ByteArrays.EMPTY_ARRAY + ); + + // Perform the test. + final StageId stageId = new StageId("xyz", 1); + final ReadableByteChunksFrameChannel channel = ReadableByteChunksFrameChannel.create("testChannel", false); + final Future>> framesFuture = readChannelAsync(channel); + + Assert.assertFalse(workerClient.fetchChannelData(WORKER_ID, stageId, 2, 0, channel).get()); + Assert.assertFalse(workerClient.fetchChannelData(WORKER_ID, stageId, 2, 0, channel).get()); + Assert.assertTrue(workerClient.fetchChannelData(WORKER_ID, stageId, 2, NIL_FILE_BYTES.length, channel).get()); + channel.doneWriting(); // Caller is expected to call doneWriting after fetchChannelData returns true. + + Assert.assertEquals( + 0, + framesFuture.get().size() + ); + } + + @Test + public void test_fetchChannelData_empty_serviceClientError() + { + workerServiceClient.expectAndThrow( + new RequestBuilder(HttpMethod.GET, "/channels/xyz/1/2?offset=0") + .header(HttpHeaders.ACCEPT_ENCODING, "identity"), + new IOException("Some error") + ); + + // Perform the test. + final StageId stageId = new StageId("xyz", 1); + final ReadableByteChunksFrameChannel channel = ReadableByteChunksFrameChannel.create("testChannel", false); + + final ExecutionException e = Assert.assertThrows( + ExecutionException.class, + () -> workerClient.fetchChannelData(WORKER_ID, stageId, 2, 0, channel).get() + ); + + MatcherAssert.assertThat( + e.getCause(), + CoreMatchers.allOf( + CoreMatchers.instanceOf(IOException.class), + ThrowableMessageMatcher.hasMessage(CoreMatchers.equalTo("Some error")) + ) + ); + + channel.close(); + } + + @Test + public void test_fetchChannelData_nonEmpty() throws Exception + { + workerServiceClient.expectAndRespond( + new RequestBuilder(HttpMethod.GET, "/channels/xyz/1/2?offset=0") + .header(HttpHeaders.ACCEPT_ENCODING, "identity"), + HttpResponseStatus.OK, + ImmutableMap.of(HttpHeaders.CONTENT_TYPE, MediaType.APPLICATION_OCTET_STREAM), + FILE_BYTES + ).expectAndRespond( + new RequestBuilder(HttpMethod.GET, "/channels/xyz/1/2?offset=" + FILE_BYTES.length) + .header(HttpHeaders.ACCEPT_ENCODING, "identity"), + HttpResponseStatus.OK, + fetchChannelDataResponseHeaders(true), + ByteArrays.EMPTY_ARRAY + ); + + // Perform the test. 
+    final StageId stageId = new StageId("xyz", 1);
+    final ReadableByteChunksFrameChannel channel = ReadableByteChunksFrameChannel.create("testChannel", false);
+    final Future<List<List<Object>>> framesFuture = readChannelAsync(channel);
+
+    Assert.assertFalse(workerClient.fetchChannelData(WORKER_ID, stageId, 2, 0, channel).get());
+    Assert.assertTrue(workerClient.fetchChannelData(WORKER_ID, stageId, 2, FILE_BYTES.length, channel).get());
+    channel.doneWriting(); // Caller is expected to call doneWriting after fetchChannelData returns true.
+
+    FrameTestUtil.assertRowsEqual(
+        FrameTestUtil.readRowsFromCursorFactory(new QueryableIndexCursorFactory(TestIndex.getMMappedTestIndex())),
+        Sequences.simple(framesFuture.get())
+    );
+  }
+
+  private Future<List<List<Object>>> readChannelAsync(final ReadableFrameChannel channel)
+  {
+    return exec.submit(() -> {
+      final List<List<Object>> retVal = new ArrayList<>();
+      while (!channel.isFinished()) {
+        FutureUtils.getUnchecked(channel.readabilityFuture(), false);
+
+        if (channel.canRead()) {
+          final Frame frame = channel.read();
+          retVal.addAll(FrameTestUtil.readRowsFromCursorFactory(FRAME_READER.makeCursorFactory(frame)).toList());
+        }
+      }
+      channel.close();
+      return retVal;
+    });
+  }
+
+  /**
+   * Returns a frame file (as bytes) from a sequence of frames.
+   */
+  private static byte[] toFileBytes(final Sequence<Frame> frames)
+  {
+    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    final FrameFileWriter writer =
+        FrameFileWriter.open(Channels.newChannel(baos), null, ByteTracker.unboundedTracker());
+    frames.forEach(frame -> {
+      try {
+        writer.writeFrame(frame, FrameFileWriter.NO_PARTITION);
+      }
+      catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    });
+    CloseableUtils.closeAndWrapExceptions(writer);
+    return baos.toByteArray();
+  }
+
+
+  /**
+   * Expected response headers for the "fetch channel data" API.
+   */
+  private static Map<String, String> fetchChannelDataResponseHeaders(final boolean lastResponse)
+  {
+    final ImmutableMap.Builder<String, String> builder =
+        ImmutableMap.<String, String>builder()
+                    .put(HttpHeaders.CONTENT_TYPE, MediaType.APPLICATION_OCTET_STREAM);
+
+    if (lastResponse) {
+      builder.put(
+          FrameFileHttpResponseHandler.HEADER_LAST_FETCH_NAME,
+          FrameFileHttpResponseHandler.HEADER_LAST_FETCH_VALUE
+      );
+    }
+
+    return builder.build();
+  }
+
+  /**
+   * Worker client that communicates with a single worker named {@link #WORKER_ID}.
+   */
+  private static class TestWorkerClient extends BaseWorkerClientImpl
+  {
+    private final ServiceClient workerServiceClient;
+
+    public TestWorkerClient(ObjectMapper objectMapper, ServiceClient workerServiceClient)
+    {
+      super(objectMapper, MediaType.APPLICATION_JSON);
+      this.workerServiceClient = workerServiceClient;
+    }
+
+    @Override
+    protected ServiceClient getClient(String workerId)
+    {
+      if (WORKER_ID.equals(workerId)) {
+        return workerServiceClient;
+      } else {
+        throw new ISE("Expected workerId[%s], got[%s]", WORKER_ID, workerId);
+      }
+    }
+
+    @Override
+    public void close()
+    {
+      // Nothing to close.
+    }
+  }
+}
diff --git a/processing/src/main/java/org/apache/druid/frame/channel/ChannelClosedForWritesException.java b/processing/src/main/java/org/apache/druid/frame/channel/ChannelClosedForWritesException.java
new file mode 100644
index 000000000000..933790174918
--- /dev/null
+++ b/processing/src/main/java/org/apache/druid/frame/channel/ChannelClosedForWritesException.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.frame.channel;
+
+/**
+ * Exception thrown by {@link ReadableByteChunksFrameChannel#addChunk(byte[])} when the channel has been closed
+ * for writes, i.e., after {@link ReadableByteChunksFrameChannel#doneWriting()} or
+ * {@link ReadableByteChunksFrameChannel#close()} has been called.
+ */
+public class ChannelClosedForWritesException extends RuntimeException
+{
+  public ChannelClosedForWritesException()
+  {
+    super("Channel is no longer accepting writes");
+  }
+}
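The typed exception lets producer-side code tell "the reader went away" apart from genuine failures, instead of matching on the message of the old generic ISE. A minimal sketch of that pattern; the ChunkPusher class and its iterator argument are illustrative, not part of the patch:

import java.util.Iterator;
import org.apache.druid.frame.channel.ChannelClosedForWritesException;
import org.apache.druid.frame.channel.ReadableByteChunksFrameChannel;

class ChunkPusher
{
  /** Feeds chunks until exhausted, or until the reader closes the channel. */
  static void pushChunks(ReadableByteChunksFrameChannel channel, Iterator<byte[]> chunks)
  {
    try {
      while (chunks.hasNext()) {
        // Real callers may also await the ListenableFuture returned by addChunk
        // for backpressure; this sketch ignores it for brevity.
        channel.addChunk(chunks.next());
      }
      channel.doneWriting();
    }
    catch (ChannelClosedForWritesException ignored) {
      // The reader closed the channel, so the producer can simply stop feeding
      // it rather than propagating an error.
    }
  }
}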
diff --git a/processing/src/main/java/org/apache/druid/frame/channel/ReadableByteChunksFrameChannel.java b/processing/src/main/java/org/apache/druid/frame/channel/ReadableByteChunksFrameChannel.java
index a4a40d70a38c..79ad621de280 100644
--- a/processing/src/main/java/org/apache/druid/frame/channel/ReadableByteChunksFrameChannel.java
+++ b/processing/src/main/java/org/apache/druid/frame/channel/ReadableByteChunksFrameChannel.java
@@ -132,13 +132,15 @@ public static ReadableByteChunksFrameChannel create(final String id, boolean fra
    * chunks. (This is not enforced; addChunk will continue to accept new chunks even if the channel is over its limit.)
    *
    * When done adding chunks call {@code doneWriting}.
+   *
+   * @throws ChannelClosedForWritesException if the channel is closed
    */
   @Nullable
   public ListenableFuture<?> addChunk(final byte[] chunk)
   {
     synchronized (lock) {
       if (noMoreWrites) {
-        throw new ISE("Channel is no longer accepting writes");
+        throw new ChannelClosedForWritesException();
       }
 
       try {
diff --git a/processing/src/test/java/org/apache/druid/frame/channel/ReadableByteChunksFrameChannelTest.java b/processing/src/test/java/org/apache/druid/frame/channel/ReadableByteChunksFrameChannelTest.java
index 32faac85276b..a81d3914b23e 100644
--- a/processing/src/test/java/org/apache/druid/frame/channel/ReadableByteChunksFrameChannelTest.java
+++ b/processing/src/test/java/org/apache/druid/frame/channel/ReadableByteChunksFrameChannelTest.java
@@ -118,6 +118,19 @@ public void testEmptyFrameFile() throws IOException
     channel.close();
   }
 
+  @Test
+  public void testAddChunkAfterDoneWriting()
+  {
+    try (final ReadableByteChunksFrameChannel channel = ReadableByteChunksFrameChannel.create("test", false)) {
+      channel.doneWriting();
+
+      Assert.assertThrows(
+          ChannelClosedForWritesException.class,
+          () -> channel.addChunk(new byte[]{})
+      );
+    }
+  }
+
   @Test
   public void testTruncatedFrameFile() throws IOException
   {
diff --git a/server/src/test/java/org/apache/druid/rpc/MockServiceClient.java b/server/src/test/java/org/apache/druid/rpc/MockServiceClient.java
index da817c3da3b1..021db219d963 100644
--- a/server/src/test/java/org/apache/druid/rpc/MockServiceClient.java
+++ b/server/src/test/java/org/apache/druid/rpc/MockServiceClient.java
@@ -41,6 +41,7 @@ public class MockServiceClient implements ServiceClient
 {
   private final Queue<Expectation> expectations = new ArrayDeque<>(16);
+  private int requestNumber = -1;
 
   @Override
   public <IntermediateType, FinalType> ListenableFuture<FinalType> asyncRequest(
@@ -50,8 +51,9 @@ public <IntermediateType, FinalType> ListenableFuture<FinalType> asyncRequest(
   {
     final Expectation expectation = expectations.poll();
 
+    requestNumber++;
     Assert.assertEquals(
-        "request",
+        "request[" + requestNumber + "]",
         expectation == null ? null : expectation.request,
        requestBuilder
     );
 
From e392d5c14bd20700211c16a820a1ac80396b5536 Mon Sep 17 00:00:00 2001
From: Akshat Jain
Date: Sun, 15 Sep 2024 23:47:25 +0530
Subject: [PATCH 21/47] Handle memory leaks from Mockito inline mocks (#17070)

---
 .../src/test/java/org/apache/druid/msq/test/MSQTestBase.java | 1 +
 pom.xml                                                      | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestBase.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestBase.java
index 0bba94f05f9c..be05a0fcc8a9 100644
--- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestBase.java
+++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestBase.java
@@ -381,6 +381,7 @@ public List<? extends Module> getJacksonModules()
   @AfterEach
   public void tearDown2()
   {
+    Mockito.framework().clearInlineMocks();
     groupByBuffers.close();
   }
 
diff --git a/pom.xml b/pom.xml
index de7543867610..d8e96ed859dd 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1765,7 +1765,7 @@
                         @{jacocoArgLine}
                         ${jdk.strong.encapsulation.argLine}
                         ${jdk.security.manager.allow.argLine}
-                        -Xmx2500m
+                        -Xmx2048m
                         -XX:MaxDirectMemorySize=2500m
                         -XX:+ExitOnOutOfMemoryError
                         -XX:+HeapDumpOnOutOfMemoryError
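The one-line test change above is the standard cleanup for Mockito's inline mock maker, which records every created mock in a JVM-wide registry until it is cleared; in large suites that registry retains mocked state and leaks memory. A self-contained sketch of the pattern, with a hypothetical test class for illustration:

import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;

class InlineMockCleanupExample
{
  @Test
  void exercisesAnInlineMock()
  {
    // Inline mocks (needed for final classes and methods) go into the registry.
    Runnable task = Mockito.mock(Runnable.class);
    task.run();
    Mockito.verify(task).run();
  }

  @AfterEach
  void clearMockRegistry()
  {
    // Releases the references held by the inline mock maker after each test.
    Mockito.framework().clearInlineMocks();
  }
}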
From 7db3580adcfa74fead40d116ca53d80634c3d5a8 Mon Sep 17 00:00:00 2001
From: Clint Wylie
Date: Sun, 15 Sep 2024 11:18:34 -0700
Subject: [PATCH 22/47] add DataSchema.Builder to tidy stuff up a bit (#17065)

* add DataSchema.Builder to tidy stuff up a bit
* fixes
* fixes
* more style fixes
* review stuff
---
 .../k8s/overlord/common/K8sTestUtils.java     |  26 +-
 .../MaterializedViewSupervisorSpec.java       |  16 +-
 .../MaterializedViewSupervisorTest.java       |  13 +-
 .../RabbitStreamSupervisorTest.java           |  25 +-
 .../indexing/kafka/KafkaIndexTaskTest.java    |  94 ++---
 .../indexing/kafka/KafkaSamplerSpecTest.java  |  87 ++--
 .../kafka/supervisor/KafkaSupervisorTest.java |  27 +-
 .../kinesis/KinesisIndexTaskSerdeTest.java    |   3 +-
 .../kinesis/KinesisIndexTaskTest.java         |  11 +-
 .../kinesis/KinesisSamplerSpecTest.java       |  62 +--
 .../supervisor/KinesisSupervisorTest.java     |  27 +-
 .../destination/SegmentGenerationUtils.java   |  17 +-
 .../msq/indexing/MSQCompactionRunnerTest.java |  28 +-
 .../indexer/BatchDeltaIngestionTest.java      |  51 +--
 .../DetermineHashedPartitionsJobTest.java     |  80 ++--
 .../indexer/DeterminePartitionsJobTest.java   |  58 +--
 .../DetermineRangePartitionsJobTest.java      |  60 +--
 .../indexer/HadoopDruidIndexerConfigTest.java |  25 +-
 .../indexer/HadoopDruidIndexerMapperTest.java |  45 +-
 ...cUpdateDatasourcePathSpecSegmentsTest.java |  24 +-
 .../indexer/IndexGeneratorCombinerTest.java   |  51 +--
 .../druid/indexer/IndexGeneratorJobTest.java  |  24 +-
 .../apache/druid/indexer/JobHelperTest.java   |  95 +++--
 .../indexer/path/DatasourcePathSpecTest.java  |  56 +--
 .../indexer/path/GranularityPathSpecTest.java |  47 +--
 .../indexer/path/StaticPathSpecTest.java      |   3 +-
 .../overlord/sampler/InputSourceSampler.java  |  15 +-
 .../druid/indexing/common/TestIndexTask.java  |   3 +-
 .../task/CompactionTaskParallelRunTest.java   |  25 +-
 .../common/task/HadoopIndexTaskTest.java      |  23 +-
 .../common/task/IndexIngestionSpecTest.java   |  27 +-
 .../indexing/common/task/IndexTaskTest.java   | 135 +++---
 .../indexing/common/task/TaskSerdeTest.java   |  98 ++---
 ...bstractMultiPhaseParallelIndexingTest.java |  30 +-
 .../parallel/HashPartitionTaskKillTest.java   |  33 +-
 ...aseParallelIndexingWithNullColumnTest.java | 110 ++---
 .../ParallelIndexSupervisorTaskKillTest.java  |  28 +-
 ...rallelIndexSupervisorTaskResourceTest.java |  28 +-
 .../ParallelIndexSupervisorTaskSerdeTest.java |  24 +-
 .../ParallelIndexSupervisorTaskTest.java      |  63 ++-
 .../parallel/ParallelIndexTestingFactory.java |  19 +-
 .../parallel/RangePartitionTaskKillTest.java  |  16 +-
 .../SinglePhaseParallelIndexingTest.java      | 173 ++++----
 .../parallel/SinglePhaseSubTaskSpecTest.java  |  14 +-
 .../batch/parallel/TombstoneHelperTest.java   |   6 +-
 .../indexing/input/InputRowSchemasTest.java   |  38 +-
 .../indexing/overlord/TaskLifecycleTest.java  | 100 ++---
 .../indexing/overlord/TaskQueueTest.java      |  17 +-
 .../sampler/CsvInputSourceSamplerTest.java    |  13 +-
 .../InputSourceSamplerDiscoveryTest.java      |  71 ++--
 .../sampler/InputSourceSamplerTest.java       |  32 +-
 ...SeekableStreamIndexTaskRunnerAuthTest.java |  19 +-
 .../SeekableStreamIndexTaskTestBase.java      | 100 +++--
 .../SeekableStreamSamplerSpecTest.java        |  62 +--
 .../SeekableStreamSupervisorSpecTest.java     |  27 +-
 .../SeekableStreamSupervisorStateTest.java    |  27 +-
 .../indexing/worker/TaskAnnouncementTest.java |   3 +-
 .../druid/segment/indexing/DataSchema.java    | 178 +++++---
 .../segment/indexing/DataSchemaTest.java      | 384 +++++++++---------
 .../appenderator/BatchAppenderatorTester.java |  26 +-
 .../StreamAppenderatorTester.java             |  22 +-
 ...nifiedIndexerAppenderatorsManagerTest.java |  13 +-
 .../druid/segment/realtime/sink/SinkTest.java |  40 +-
 .../cli/validate/DruidJsonValidatorTest.java  |  14 +-
 64 files changed, 1528 insertions(+), 1583 deletions(-)
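The refactor in this patch is mechanical: positional `new DataSchema(...)` calls become named builder chains. A representative before and after, sketched with illustrative field values (the builder methods are the ones introduced by the patch; the example class and data source name are hypothetical):

import org.apache.druid.data.input.impl.TimestampSpec;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.query.aggregation.CountAggregatorFactory;
import org.apache.druid.segment.indexing.DataSchema;
import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec;

class DataSchemaBuilderExample
{
  // Before: positional constructor; call sites often passed long runs of nulls.
  static DataSchema oldStyle()
  {
    return new DataSchema(
        "example_ds",                                 // dataSource
        new TimestampSpec("timestamp", "iso", null),  // timestampSpec
        null,                                         // dimensionsSpec
        new AggregatorFactory[]{new CountAggregatorFactory("rows")},
        new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null),
        null                                          // transformSpec
    );
  }

  // After: only the fields that matter are named; unset fields take defaults.
  static DataSchema newStyle()
  {
    return DataSchema.builder()
                     .withDataSource("example_ds")
                     .withTimestamp(new TimestampSpec("timestamp", "iso", null))
                     .withAggregators(new CountAggregatorFactory("rows"))
                     .withGranularity(new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null))
                     .build();
  }
}

There is also a copying form, DataSchema.builder(existingSchema), used in the diffs below to clone a schema while overriding a single field such as the ObjectMapper.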
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/common/K8sTestUtils.java b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/common/K8sTestUtils.java
index b3fda99b222e..161e80569521 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/common/K8sTestUtils.java
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/common/K8sTestUtils.java
@@ -34,7 +34,6 @@
 import org.apache.druid.indexing.common.task.Tasks;
 import org.apache.druid.java.util.common.Intervals;
 import org.apache.druid.java.util.common.granularity.Granularities;
-import org.apache.druid.query.aggregation.AggregatorFactory;
 import org.apache.druid.query.aggregation.DoubleSumAggregatorFactory;
 import org.apache.druid.segment.IndexSpec;
 import org.apache.druid.segment.indexing.DataSchema;
@@ -66,18 +65,19 @@ public static Task getTask()
         null,
         null,
         new IndexTask.IndexIngestionSpec(
-            new DataSchema(
-                "foo",
-                new TimestampSpec(null, null, null),
-                DimensionsSpec.EMPTY,
-                new AggregatorFactory[]{new DoubleSumAggregatorFactory("met", "met")},
-                new UniformGranularitySpec(
-                    Granularities.DAY,
-                    null,
-                    ImmutableList.of(Intervals.of("2010-01-01/P2D"))
-                ),
-                null
-            ),
+            DataSchema.builder()
+                      .withDataSource("foo")
+                      .withTimestamp(new TimestampSpec(null, null, null))
+                      .withDimensions(DimensionsSpec.EMPTY)
+                      .withAggregators(new DoubleSumAggregatorFactory("met", "met"))
+                      .withGranularity(
+                          new UniformGranularitySpec(
+                              Granularities.DAY,
+                              null,
+                              ImmutableList.of(Intervals.of("2010-01-01/P2D"))
+                          )
+                      )
+                      .build(),
             new IndexTask.IndexIOConfig(
                 new LocalInputSource(new File("lol"), "rofl"),
                 new NoopInputFormat(),
diff --git a/extensions-contrib/materialized-view-maintenance/src/main/java/org/apache/druid/indexing/materializedview/MaterializedViewSupervisorSpec.java b/extensions-contrib/materialized-view-maintenance/src/main/java/org/apache/druid/indexing/materializedview/MaterializedViewSupervisorSpec.java
index 01039375259e..b23af62f6309 100644
--- a/extensions-contrib/materialized-view-maintenance/src/main/java/org/apache/druid/indexing/materializedview/MaterializedViewSupervisorSpec.java
+++ b/extensions-contrib/materialized-view-maintenance/src/main/java/org/apache/druid/indexing/materializedview/MaterializedViewSupervisorSpec.java
@@ -48,7 +48,6 @@
 import org.apache.druid.segment.indexing.DataSchema;
 import org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec;
 import org.apache.druid.segment.realtime.ChatHandlerProvider;
-import org.apache.druid.segment.transform.TransformSpec;
 import org.apache.druid.server.security.AuthorizerMapper;
 import org.apache.druid.timeline.DataSegment;
 import org.joda.time.Interval;
@@ -211,14 +210,13 @@ public HadoopIndexTask createTask(Interval interval, String version, List
   Map<Interval, HadoopIndexTask> runningTasks = runningTasksPair.lhs;
   Map<Interval, String> runningVersion = runningTasksPair.rhs;
-    DataSchema dataSchema = new DataSchema(
-        "test_datasource",
-        null,
-        null,
-        null,
-        TransformSpec.NONE,
-        objectMapper
-    );
+    DataSchema dataSchema = DataSchema.builder()
+                                      .withDataSource("test_datasource")
+                                      .withObjectMapper(objectMapper)
+                                      .build();
     HadoopIOConfig hadoopIOConfig = new HadoopIOConfig(new HashMap<>(), null, null);
     HadoopIngestionSpec spec = new HadoopIngestionSpec(dataSchema, hadoopIOConfig, null);
     HadoopIndexTask task1 = new HadoopIndexTask(
diff --git a/extensions-contrib/rabbit-stream-indexing-service/src/test/java/org/apache/druid/indexing/rabbitstream/supervisor/RabbitStreamSupervisorTest.java b/extensions-contrib/rabbit-stream-indexing-service/src/test/java/org/apache/druid/indexing/rabbitstream/supervisor/RabbitStreamSupervisorTest.java
index a5b9b597afa5..e52ca2f29b64 100644
--- a/extensions-contrib/rabbit-stream-indexing-service/src/test/java/org/apache/druid/indexing/rabbitstream/supervisor/RabbitStreamSupervisorTest.java
+++ b/extensions-contrib/rabbit-stream-indexing-service/src/test/java/org/apache/druid/indexing/rabbitstream/supervisor/RabbitStreamSupervisorTest.java
@@ -24,7 +24,6 @@
 import com.google.common.collect.ImmutableMap;
 import org.apache.druid.data.input.InputFormat;
 import org.apache.druid.data.input.impl.DimensionSchema;
-import org.apache.druid.data.input.impl.DimensionsSpec;
 import org.apache.druid.data.input.impl.JsonInputFormat;
 import org.apache.druid.data.input.impl.StringDimensionSchema;
 import org.apache.druid.data.input.impl.TimestampSpec;
@@ -44,7 +43,6 @@
 import org.apache.druid.java.util.emitter.EmittingLogger;
 import org.apache.druid.java.util.metrics.DruidMonitorSchedulerConfig;
 import org.apache.druid.java.util.metrics.StubServiceEmitter;
-import org.apache.druid.query.aggregation.AggregatorFactory;
 import org.apache.druid.query.aggregation.CountAggregatorFactory;
 import org.apache.druid.segment.TestHelper;
 import org.apache.druid.segment.incremental.RowIngestionMetersFactory;
@@ -102,16 +100,19 @@ private static DataSchema getDataSchema(String dataSource)
     dimensions.add(StringDimensionSchema.create("dim1"));
     dimensions.add(StringDimensionSchema.create("dim2"));
 
-    return new DataSchema(
-        dataSource,
-        new TimestampSpec("timestamp", "iso", null),
-        new DimensionsSpec(dimensions),
-        new AggregatorFactory[] {new CountAggregatorFactory("rows")},
-        new UniformGranularitySpec(
-            Granularities.HOUR,
-            Granularities.NONE,
-            ImmutableList.of()),
-        null);
+    return DataSchema.builder()
+
.withDataSource(dataSource) + .withTimestamp(new TimestampSpec("timestamp", "iso", null)) + .withDimensions(dimensions) + .withAggregators(new CountAggregatorFactory("rows")) + .withGranularity( + new UniformGranularitySpec( + Granularities.HOUR, + Granularities.NONE, + ImmutableList.of() + ) + ) + .build(); } @BeforeClass diff --git a/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/KafkaIndexTaskTest.java b/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/KafkaIndexTaskTest.java index dd04c2309f6e..23bdeb14acb8 100644 --- a/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/KafkaIndexTaskTest.java +++ b/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/KafkaIndexTaskTest.java @@ -92,7 +92,6 @@ import org.apache.druid.query.QueryRunnerFactory; import org.apache.druid.query.QueryRunnerFactoryConglomerate; import org.apache.druid.query.SegmentDescriptor; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.query.aggregation.DoubleSumAggregatorFactory; import org.apache.druid.query.filter.SelectorDimFilter; @@ -1262,28 +1261,27 @@ public void testKafkaRecordEntityInputFormat() throws Exception final KafkaIndexTask task = createTask( null, - new DataSchema( - "test_ds", - new TimestampSpec("timestamp", "iso", null), - new DimensionsSpec( - Arrays.asList( - new StringDimensionSchema("dim1"), - new StringDimensionSchema("dim1t"), - new StringDimensionSchema("dim2"), - new LongDimensionSchema("dimLong"), - new FloatDimensionSchema("dimFloat"), - new StringDimensionSchema("kafka.topic"), - new LongDimensionSchema("kafka.offset"), - new StringDimensionSchema("kafka.header.encoding") - ) - ), - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("met1sum", "met1"), - new CountAggregatorFactory("rows") - }, - new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null), - null - ), + DataSchema.builder() + .withDataSource("test_ds") + .withTimestamp(new TimestampSpec("timestamp", "iso", null)) + .withDimensions( + new StringDimensionSchema("dim1"), + new StringDimensionSchema("dim1t"), + new StringDimensionSchema("dim2"), + new LongDimensionSchema("dimLong"), + new FloatDimensionSchema("dimFloat"), + new StringDimensionSchema("kafka.topic"), + new LongDimensionSchema("kafka.offset"), + new StringDimensionSchema("kafka.header.encoding") + ) + .withAggregators( + new DoubleSumAggregatorFactory("met1sum", "met1"), + new CountAggregatorFactory("rows") + ) + .withGranularity( + new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null) + ) + .build(), new KafkaIndexTaskIOConfig( 0, "sequence0", @@ -1337,26 +1335,25 @@ public void testKafkaInputFormat() throws Exception final KafkaIndexTask task = createTask( null, - new DataSchema( - "test_ds", - new TimestampSpec("timestamp", "iso", null), - new DimensionsSpec( - Arrays.asList( - new StringDimensionSchema("dim1"), - new StringDimensionSchema("dim1t"), - new StringDimensionSchema("dim2"), - new LongDimensionSchema("dimLong"), - new FloatDimensionSchema("dimFloat"), - new StringDimensionSchema("kafka.testheader.encoding") - ) - ), - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("met1sum", "met1"), - new CountAggregatorFactory("rows") - }, - new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null), - null - ), + DataSchema.builder() + .withDataSource("test_ds") + 
.withTimestamp(new TimestampSpec("timestamp", "iso", null)) + .withDimensions( + new StringDimensionSchema("dim1"), + new StringDimensionSchema("dim1t"), + new StringDimensionSchema("dim2"), + new LongDimensionSchema("dimLong"), + new FloatDimensionSchema("dimFloat"), + new StringDimensionSchema("kafka.testheader.encoding") + ) + .withAggregators( + new DoubleSumAggregatorFactory("met1sum", "met1"), + new CountAggregatorFactory("rows") + ) + .withGranularity( + new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null) + ) + .build(), new KafkaIndexTaskIOConfig( 0, "sequence0", @@ -2888,16 +2885,7 @@ private KafkaIndexTask createTask( private static DataSchema cloneDataSchema(final DataSchema dataSchema) { - return new DataSchema( - dataSchema.getDataSource(), - dataSchema.getTimestampSpec(), - dataSchema.getDimensionsSpec(), - dataSchema.getAggregators(), - dataSchema.getGranularitySpec(), - dataSchema.getTransformSpec(), - dataSchema.getParserMap(), - OBJECT_MAPPER - ); + return DataSchema.builder(dataSchema).withObjectMapper(OBJECT_MAPPER).build(); } @Override diff --git a/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/KafkaSamplerSpecTest.java b/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/KafkaSamplerSpecTest.java index 0a0b64396a66..9cdc0ac0edcd 100644 --- a/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/KafkaSamplerSpecTest.java +++ b/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/KafkaSamplerSpecTest.java @@ -46,7 +46,6 @@ import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.parsers.JSONPathSpec; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.query.aggregation.DoubleSumAggregatorFactory; import org.apache.druid.segment.TestHelper; @@ -81,45 +80,30 @@ public class KafkaSamplerSpecTest extends InitializedNullHandlingTest private static final ObjectMapper OBJECT_MAPPER = TestHelper.makeJsonMapper(); private static final String TOPIC = "sampling"; - private static final DataSchema DATA_SCHEMA = new DataSchema( - "test_ds", - new TimestampSpec("timestamp", "iso", null), - new DimensionsSpec( - Arrays.asList( - new StringDimensionSchema("dim1"), - new StringDimensionSchema("dim1t"), - new StringDimensionSchema("dim2"), - new LongDimensionSchema("dimLong"), - new FloatDimensionSchema("dimFloat") - ) - ), - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("met1sum", "met1"), - new CountAggregatorFactory("rows") - }, - new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null), - null - ); - - private static final DataSchema DATA_SCHEMA_KAFKA_TIMESTAMP = new DataSchema( - "test_ds", - new TimestampSpec("kafka.timestamp", "iso", null), - new DimensionsSpec( - Arrays.asList( - new StringDimensionSchema("dim1"), - new StringDimensionSchema("dim1t"), - new StringDimensionSchema("dim2"), - new LongDimensionSchema("dimLong"), - new FloatDimensionSchema("dimFloat") - ) - ), - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("met1sum", "met1"), - new CountAggregatorFactory("rows") - }, - new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null), - null - ); + private static final DataSchema DATA_SCHEMA = + DataSchema.builder() + .withDataSource("test_ds") + .withTimestamp(new 
TimestampSpec("timestamp", "iso", null)) + .withDimensions( + new StringDimensionSchema("dim1"), + new StringDimensionSchema("dim1t"), + new StringDimensionSchema("dim2"), + new LongDimensionSchema("dimLong"), + new FloatDimensionSchema("dimFloat") + ) + .withAggregators( + new DoubleSumAggregatorFactory("met1sum", "met1"), + new CountAggregatorFactory("rows") + ) + .withGranularity( + new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null) + ) + .build(); + + private static final DataSchema DATA_SCHEMA_KAFKA_TIMESTAMP = + DataSchema.builder(DATA_SCHEMA) + .withTimestamp(new TimestampSpec("kafka.timestamp", "iso", null)) + .build(); private static TestingCluster zkServer; private static TestBroker kafkaServer; @@ -364,17 +348,18 @@ public void testWithInputRowParser() throws IOException ); InputRowParser parser = new StringInputRowParser(new JSONParseSpec(timestampSpec, dimensionsSpec, JSONPathSpec.DEFAULT, null, null), "UTF8"); - DataSchema dataSchema = new DataSchema( - "test_ds", - objectMapper.readValue(objectMapper.writeValueAsBytes(parser), Map.class), - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("met1sum", "met1"), - new CountAggregatorFactory("rows") - }, - new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null), - null, - objectMapper - ); + DataSchema dataSchema = DataSchema.builder() + .withDataSource("test_ds") + .withParserMap( + objectMapper.readValue(objectMapper.writeValueAsBytes(parser), Map.class) + ) + .withAggregators( + new DoubleSumAggregatorFactory("met1sum", "met1"), + new CountAggregatorFactory("rows") + ) + .withGranularity(new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null)) + .withObjectMapper(objectMapper) + .build(); KafkaSupervisorSpec supervisorSpec = new KafkaSupervisorSpec( null, diff --git a/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorTest.java b/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorTest.java index 127cb72efcb1..e436b8cd56a5 100644 --- a/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorTest.java +++ b/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorTest.java @@ -30,7 +30,6 @@ import org.apache.curator.test.TestingCluster; import org.apache.druid.data.input.InputFormat; import org.apache.druid.data.input.impl.DimensionSchema; -import org.apache.druid.data.input.impl.DimensionsSpec; import org.apache.druid.data.input.impl.JsonInputFormat; import org.apache.druid.data.input.impl.StringDimensionSchema; import org.apache.druid.data.input.impl.TimestampSpec; @@ -83,7 +82,6 @@ import org.apache.druid.java.util.emitter.service.AlertEvent; import org.apache.druid.java.util.metrics.DruidMonitorSchedulerConfig; import org.apache.druid.java.util.metrics.StubServiceEmitter; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.segment.TestHelper; import org.apache.druid.segment.incremental.ParseExceptionReport; @@ -5135,18 +5133,19 @@ private static DataSchema getDataSchema(String dataSource) dimensions.add(StringDimensionSchema.create("dim1")); dimensions.add(StringDimensionSchema.create("dim2")); - return new DataSchema( - dataSource, - new TimestampSpec("timestamp", "iso", null), - new DimensionsSpec(dimensions), - new AggregatorFactory[]{new 
CountAggregatorFactory("rows")}, - new UniformGranularitySpec( - Granularities.HOUR, - Granularities.NONE, - ImmutableList.of() - ), - null - ); + return DataSchema.builder() + .withDataSource(dataSource) + .withTimestamp(new TimestampSpec("timestamp", "iso", null)) + .withDimensions(dimensions) + .withAggregators(new CountAggregatorFactory("rows")) + .withGranularity( + new UniformGranularitySpec( + Granularities.HOUR, + Granularities.NONE, + ImmutableList.of() + ) + ) + .build(); } private KafkaIndexTask createKafkaIndexTask( diff --git a/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/KinesisIndexTaskSerdeTest.java b/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/KinesisIndexTaskSerdeTest.java index ed2758ddaefd..e84581af6013 100644 --- a/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/KinesisIndexTaskSerdeTest.java +++ b/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/KinesisIndexTaskSerdeTest.java @@ -50,7 +50,8 @@ public class KinesisIndexTaskSerdeTest { - private static final DataSchema DATA_SCHEMA = new DataSchema("dataSource", null, null, null, null, null, null, null); + private static final DataSchema DATA_SCHEMA = + DataSchema.builder().withDataSource("dataSource").build(); private static final KinesisIndexTaskTuningConfig TUNING_CONFIG = new KinesisIndexTaskTuningConfig( null, null, diff --git a/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/KinesisIndexTaskTest.java b/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/KinesisIndexTaskTest.java index d69e43ca660c..2ef391484008 100644 --- a/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/KinesisIndexTaskTest.java +++ b/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/KinesisIndexTaskTest.java @@ -2412,16 +2412,7 @@ private KinesisIndexTask createTask( private static DataSchema cloneDataSchema(final DataSchema dataSchema) { - return new DataSchema( - dataSchema.getDataSource(), - dataSchema.getTimestampSpec(), - dataSchema.getDimensionsSpec(), - dataSchema.getAggregators(), - dataSchema.getGranularitySpec(), - dataSchema.getTransformSpec(), - dataSchema.getParserMap(), - OBJECT_MAPPER - ); + return DataSchema.builder(dataSchema).withObjectMapper(OBJECT_MAPPER).build(); } @Override diff --git a/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/KinesisSamplerSpecTest.java b/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/KinesisSamplerSpecTest.java index 63144c6a9353..102e2d8929f2 100644 --- a/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/KinesisSamplerSpecTest.java +++ b/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/KinesisSamplerSpecTest.java @@ -49,7 +49,6 @@ import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.parsers.JSONPathSpec; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.query.aggregation.DoubleSumAggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; @@ -75,25 +74,25 @@ public class KinesisSamplerSpecTest 
extends EasyMockSupport { private static final String STREAM = "sampling"; private static final String SHARD_ID = "1"; - private static final DataSchema DATA_SCHEMA = new DataSchema( - "test_ds", - new TimestampSpec("timestamp", "iso", null), - new DimensionsSpec( - Arrays.asList( - new StringDimensionSchema("dim1"), - new StringDimensionSchema("dim1t"), - new StringDimensionSchema("dim2"), - new LongDimensionSchema("dimLong"), - new FloatDimensionSchema("dimFloat") - ) - ), - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("met1sum", "met1"), - new CountAggregatorFactory("rows") - }, - new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null), - null - ); + private static final DataSchema DATA_SCHEMA = + DataSchema.builder() + .withDataSource("test_ds") + .withTimestamp(new TimestampSpec("timestamp", "iso", null)) + .withDimensions( + new StringDimensionSchema("dim1"), + new StringDimensionSchema("dim1t"), + new StringDimensionSchema("dim2"), + new LongDimensionSchema("dimLong"), + new FloatDimensionSchema("dimFloat") + ) + .withAggregators( + new DoubleSumAggregatorFactory("met1sum", "met1"), + new CountAggregatorFactory("rows") + ) + .withGranularity( + new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null) + ) + .build(); static { NullHandling.initializeForTests(); @@ -192,17 +191,18 @@ public void testSampleWithInputRowParser() throws IOException, InterruptedExcept ); InputRowParser parser = new StringInputRowParser(new JSONParseSpec(timestampSpec, dimensionsSpec, JSONPathSpec.DEFAULT, null, null), "UTF8"); - DataSchema dataSchema = new DataSchema( - "test_ds", - objectMapper.readValue(objectMapper.writeValueAsBytes(parser), Map.class), - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("met1sum", "met1"), - new CountAggregatorFactory("rows") - }, - new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null), - null, - objectMapper - ); + DataSchema dataSchema = DataSchema.builder() + .withDataSource("test_ds") + .withParserMap( + objectMapper.readValue(objectMapper.writeValueAsBytes(parser), Map.class) + ) + .withAggregators( + new DoubleSumAggregatorFactory("met1sum", "met1"), + new CountAggregatorFactory("rows") + ) + .withGranularity(new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null)) + .withObjectMapper(objectMapper) + .build(); KinesisSupervisorSpec supervisorSpec = new KinesisSupervisorSpec( null, diff --git a/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/supervisor/KinesisSupervisorTest.java b/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/supervisor/KinesisSupervisorTest.java index 50b7203c629c..24d919918f47 100644 --- a/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/supervisor/KinesisSupervisorTest.java +++ b/extensions-core/kinesis-indexing-service/src/test/java/org/apache/druid/indexing/kinesis/supervisor/KinesisSupervisorTest.java @@ -28,7 +28,6 @@ import com.google.common.util.concurrent.ListenableFuture; import org.apache.druid.data.input.InputFormat; import org.apache.druid.data.input.impl.DimensionSchema; -import org.apache.druid.data.input.impl.DimensionsSpec; import org.apache.druid.data.input.impl.JsonInputFormat; import org.apache.druid.data.input.impl.StringDimensionSchema; import org.apache.druid.data.input.impl.TimestampSpec; @@ -79,7 +78,6 @@ import org.apache.druid.java.util.emitter.service.AlertEvent; import 
org.apache.druid.java.util.metrics.DruidMonitorSchedulerConfig; import org.apache.druid.java.util.metrics.StubServiceEmitter; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.segment.TestHelper; import org.apache.druid.segment.incremental.RowIngestionMetersFactory; @@ -5482,18 +5480,19 @@ private static DataSchema getDataSchema(String dataSource) dimensions.add(StringDimensionSchema.create("dim1")); dimensions.add(StringDimensionSchema.create("dim2")); - return new DataSchema( - dataSource, - new TimestampSpec("timestamp", "iso", null), - new DimensionsSpec(dimensions), - new AggregatorFactory[]{new CountAggregatorFactory("rows")}, - new UniformGranularitySpec( - Granularities.HOUR, - Granularities.NONE, - ImmutableList.of() - ), - null - ); + return DataSchema.builder() + .withDataSource(dataSource) + .withTimestamp(new TimestampSpec("timestamp", "iso", null)) + .withDimensions(dimensions) + .withAggregators(new CountAggregatorFactory("rows")) + .withGranularity( + new UniformGranularitySpec( + Granularities.HOUR, + Granularities.NONE, + ImmutableList.of() + ) + ) + .build(); } diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/destination/SegmentGenerationUtils.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/destination/SegmentGenerationUtils.java index 09d79534337c..b37a29f53e20 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/destination/SegmentGenerationUtils.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/destination/SegmentGenerationUtils.java @@ -53,14 +53,12 @@ import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec; import org.apache.druid.segment.indexing.granularity.GranularitySpec; -import org.apache.druid.segment.transform.TransformSpec; import org.apache.druid.sql.calcite.planner.ColumnMappings; import org.apache.druid.sql.calcite.rel.DruidQuery; import org.apache.druid.utils.CollectionUtils; import javax.annotation.Nullable; import java.util.ArrayList; -import java.util.Collections; import java.util.HashMap; import java.util.LinkedHashSet; import java.util.List; @@ -96,14 +94,13 @@ public static DataSchema makeDataSchemaForIngestion( destination.getDimensionSchemas() ); - return new DataSchema( - destination.getDataSource(), - new TimestampSpec(ColumnHolder.TIME_COLUMN_NAME, "millis", null), - dimensionsAndAggregators.lhs, - dimensionsAndAggregators.rhs.toArray(new AggregatorFactory[0]), - makeGranularitySpecForIngestion(querySpec.getQuery(), querySpec.getColumnMappings(), isRollupQuery, jsonMapper), - new TransformSpec(null, Collections.emptyList()) - ); + return DataSchema.builder() + .withDataSource(destination.getDataSource()) + .withTimestamp(new TimestampSpec(ColumnHolder.TIME_COLUMN_NAME, "millis", null)) + .withDimensions(dimensionsAndAggregators.lhs) + .withAggregators(dimensionsAndAggregators.rhs.toArray(new AggregatorFactory[0])) + .withGranularity(makeGranularitySpecForIngestion(querySpec.getQuery(), querySpec.getColumnMappings(), isRollupQuery, jsonMapper)) + .build(); } private static GranularitySpec makeGranularitySpecForIngestion( diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/MSQCompactionRunnerTest.java 
b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/MSQCompactionRunnerTest.java
index 4088d5cecb10..15b12be15753 100644
--- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/MSQCompactionRunnerTest.java
+++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/MSQCompactionRunnerTest.java
@@ -259,19 +259,21 @@ public void testMSQControllerTaskSpecWithScanIsValid() throws JsonProcessingExce
         null
     );
 
-    DataSchema dataSchema = new DataSchema(
-        DATA_SOURCE,
-        new TimestampSpec(TIMESTAMP_COLUMN, null, null),
-        new DimensionsSpec(DIMENSIONS),
-        new AggregatorFactory[]{},
-        new UniformGranularitySpec(
-            SEGMENT_GRANULARITY.getDefaultGranularity(),
-            null,
-            false,
-            Collections.singletonList(COMPACTION_INTERVAL)
-        ),
-        new TransformSpec(dimFilter, Collections.emptyList())
-    );
+    DataSchema dataSchema =
+        DataSchema.builder()
+                  .withDataSource(DATA_SOURCE)
+                  .withTimestamp(new TimestampSpec(TIMESTAMP_COLUMN, null, null))
+                  .withDimensions(DIMENSIONS)
+                  .withGranularity(
+                      new UniformGranularitySpec(
+                          SEGMENT_GRANULARITY.getDefaultGranularity(),
+                          null,
+                          false,
+                          Collections.singletonList(COMPACTION_INTERVAL)
+                      )
+                  )
+                  .withTransform(new TransformSpec(dimFilter, Collections.emptyList()))
+                  .build();
 
     List<MSQControllerTask> msqControllerTasks = MSQ_COMPACTION_RUNNER.createMsqControllerTasks(
diff --git a/indexing-hadoop/src/test/java/org/apache/druid/indexer/BatchDeltaIngestionTest.java b/indexing-hadoop/src/test/java/org/apache/druid/indexer/BatchDeltaIngestionTest.java
index 1e4f62ca6e1f..41dae9c9e65e 100644
--- a/indexing-hadoop/src/test/java/org/apache/druid/indexer/BatchDeltaIngestionTest.java
+++ b/indexing-hadoop/src/test/java/org/apache/druid/indexer/BatchDeltaIngestionTest.java
@@ -433,30 +433,33 @@ private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig(
   {
     HadoopDruidIndexerConfig config = new HadoopDruidIndexerConfig(
         new HadoopIngestionSpec(
-            new DataSchema(
-                "website",
-                MAPPER.convertValue(
-                    new StringInputRowParser(
-                        new CSVParseSpec(
-                            new TimestampSpec("timestamp", "yyyyMMddHH", null),
-                            new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host"))),
-                            null,
-                            ImmutableList.of("timestamp", "host", "host2", "visited_num"),
-                            false,
-                            0
-                        ),
-                        null
-                    ),
-                    Map.class
-                ),
-                aggregators != null ? aggregators : new AggregatorFactory[]{
-                    new LongSumAggregatorFactory("visited_sum", "visited_num"),
-                    new HyperUniquesAggregatorFactory("unique_hosts", "host2")
-                },
-                new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, ImmutableList.of(INTERVAL_FULL)),
-                null,
-                MAPPER
-            ),
+            DataSchema.builder()
+                      .withDataSource("website")
+                      .withParserMap(MAPPER.convertValue(
+                          new StringInputRowParser(
+                              new CSVParseSpec(
+                                  new TimestampSpec("timestamp", "yyyyMMddHH", null),
+                                  new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host"))),
+                                  null,
+                                  ImmutableList.of("timestamp", "host", "host2", "visited_num"),
+                                  false,
+                                  0
+                              ),
+                              null
+                          ),
+                          Map.class
+                      ))
+                      .withAggregators(aggregators != null ?
aggregators : new AggregatorFactory[]{ + new LongSumAggregatorFactory("visited_sum", "visited_num"), + new HyperUniquesAggregatorFactory("unique_hosts", "host2") + }) + .withGranularity(new UniformGranularitySpec( + Granularities.DAY, + Granularities.NONE, + ImmutableList.of(INTERVAL_FULL) + )) + .withObjectMapper(MAPPER) + .build(), new HadoopIOConfig( inputSpec, null, diff --git a/indexing-hadoop/src/test/java/org/apache/druid/indexer/DetermineHashedPartitionsJobTest.java b/indexing-hadoop/src/test/java/org/apache/druid/indexer/DetermineHashedPartitionsJobTest.java index 24a8ee0ef7eb..dd22a95083ce 100644 --- a/indexing-hadoop/src/test/java/org/apache/druid/indexer/DetermineHashedPartitionsJobTest.java +++ b/indexing-hadoop/src/test/java/org/apache/druid/indexer/DetermineHashedPartitionsJobTest.java @@ -32,7 +32,6 @@ import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.granularity.Granularity; import org.apache.druid.java.util.common.granularity.PeriodGranularity; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.DoubleSumAggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec; @@ -158,46 +157,45 @@ public DetermineHashedPartitionsJobTest( } HadoopIngestionSpec ingestionSpec = new HadoopIngestionSpec( - new DataSchema( - "test_schema", - HadoopDruidIndexerConfig.JSON_MAPPER.convertValue( - new StringInputRowParser( - new DelimitedParseSpec( - new TimestampSpec("ts", null, null), - new DimensionsSpec( - DimensionsSpec.getDefaultSchemas(ImmutableList.of( - "market", - "quality", - "placement", - "placementish" - )) - ), - "\t", - null, - Arrays.asList( - "ts", - "market", - "quality", - "placement", - "placementish", - "index" - ), - false, - 0 - ), - null - ), - Map.class - ), - new AggregatorFactory[]{new DoubleSumAggregatorFactory("index", "index")}, - new UniformGranularitySpec( - segmentGranularity, - Granularities.NONE, - intervals - ), - null, - HadoopDruidIndexerConfig.JSON_MAPPER - ), + DataSchema.builder() + .withDataSource("test_schema") + .withParserMap(HadoopDruidIndexerConfig.JSON_MAPPER.convertValue( + new StringInputRowParser( + new DelimitedParseSpec( + new TimestampSpec("ts", null, null), + new DimensionsSpec( + DimensionsSpec.getDefaultSchemas(ImmutableList.of( + "market", + "quality", + "placement", + "placementish" + )) + ), + "\t", + null, + Arrays.asList( + "ts", + "market", + "quality", + "placement", + "placementish", + "index" + ), + false, + 0 + ), + null + ), + Map.class + )) + .withAggregators(new DoubleSumAggregatorFactory("index", "index")) + .withGranularity(new UniformGranularitySpec( + segmentGranularity, + Granularities.NONE, + intervals + )) + .withObjectMapper(HadoopDruidIndexerConfig.JSON_MAPPER) + .build(), new HadoopIOConfig( ImmutableMap.of( "paths", diff --git a/indexing-hadoop/src/test/java/org/apache/druid/indexer/DeterminePartitionsJobTest.java b/indexing-hadoop/src/test/java/org/apache/druid/indexer/DeterminePartitionsJobTest.java index a3c98f29565b..bfd28d2cfca0 100644 --- a/indexing-hadoop/src/test/java/org/apache/druid/indexer/DeterminePartitionsJobTest.java +++ b/indexing-hadoop/src/test/java/org/apache/druid/indexer/DeterminePartitionsJobTest.java @@ -29,7 +29,6 @@ import org.apache.druid.java.util.common.FileUtils; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.granularity.Granularities; -import 
org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.LongSumAggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec; @@ -280,33 +279,36 @@ public DeterminePartitionsJobTest( config = new HadoopDruidIndexerConfig( new HadoopIngestionSpec( - new DataSchema( - "website", - HadoopDruidIndexerConfig.JSON_MAPPER.convertValue( - new StringInputRowParser( - new CSVParseSpec( - new TimestampSpec("timestamp", "yyyyMMddHH", null), - new DimensionsSpec( - DimensionsSpec.getDefaultSchemas(ImmutableList.of("host", "country")) - ), - null, - ImmutableList.of("timestamp", "host", "country", "visited_num"), - false, - 0 - ), - null - ), - Map.class - ), - new AggregatorFactory[]{new LongSumAggregatorFactory("visited_num", "visited_num")}, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.NONE, - ImmutableList.of(Intervals.of(interval)) - ), - null, - HadoopDruidIndexerConfig.JSON_MAPPER - ), + DataSchema.builder() + .withDataSource("website") + .withParserMap( + HadoopDruidIndexerConfig.JSON_MAPPER.convertValue( + new StringInputRowParser( + new CSVParseSpec( + new TimestampSpec("timestamp", "yyyyMMddHH", null), + new DimensionsSpec( + DimensionsSpec.getDefaultSchemas(ImmutableList.of("host", "country")) + ), + null, + ImmutableList.of("timestamp", "host", "country", "visited_num"), + false, + 0 + ), + null + ), + Map.class + ) + ) + .withAggregators(new LongSumAggregatorFactory("visited_num", "visited_num")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.NONE, + ImmutableList.of(Intervals.of(interval)) + ) + ) + .withObjectMapper(HadoopDruidIndexerConfig.JSON_MAPPER) + .build(), new HadoopIOConfig( ImmutableMap.of( "paths", diff --git a/indexing-hadoop/src/test/java/org/apache/druid/indexer/DetermineRangePartitionsJobTest.java b/indexing-hadoop/src/test/java/org/apache/druid/indexer/DetermineRangePartitionsJobTest.java index e79d066ab55c..3ff525c8b433 100644 --- a/indexing-hadoop/src/test/java/org/apache/druid/indexer/DetermineRangePartitionsJobTest.java +++ b/indexing-hadoop/src/test/java/org/apache/druid/indexer/DetermineRangePartitionsJobTest.java @@ -29,7 +29,6 @@ import org.apache.druid.java.util.common.FileUtils; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.LongSumAggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec; @@ -328,35 +327,36 @@ public DetermineRangePartitionsJobTest( config = new HadoopDruidIndexerConfig( new HadoopIngestionSpec( - new DataSchema( - "website", - null, - null, - new AggregatorFactory[]{new LongSumAggregatorFactory("visited_num", "visited_num")}, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.NONE, - ImmutableList.of(Intervals.of(interval)) - ), - null, - HadoopDruidIndexerConfig.JSON_MAPPER.convertValue( - new StringInputRowParser( - new CSVParseSpec( - new TimestampSpec("timestamp", "yyyyMMddHH", null), - new DimensionsSpec( - DimensionsSpec.getDefaultSchemas(ImmutableList.of("host", "country")) - ), - null, - ImmutableList.of("timestamp", "host", "country", "visited_num"), - false, - 0 - ), - null - ), - Map.class - ), - HadoopDruidIndexerConfig.JSON_MAPPER - ), + DataSchema.builder() + 
.withDataSource("website") + .withAggregators(new LongSumAggregatorFactory("visited_num", "visited_num")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.NONE, + ImmutableList.of(Intervals.of(interval)) + ) + ) + .withParserMap( + HadoopDruidIndexerConfig.JSON_MAPPER.convertValue( + new StringInputRowParser( + new CSVParseSpec( + new TimestampSpec("timestamp", "yyyyMMddHH", null), + new DimensionsSpec( + DimensionsSpec.getDefaultSchemas(ImmutableList.of("host", "country")) + ), + null, + ImmutableList.of("timestamp", "host", "country", "visited_num"), + false, + 0 + ), + null + ), + Map.class + ) + ) + .withObjectMapper(HadoopDruidIndexerConfig.JSON_MAPPER) + .build(), new HadoopIOConfig( ImmutableMap.of( "paths", diff --git a/indexing-hadoop/src/test/java/org/apache/druid/indexer/HadoopDruidIndexerConfigTest.java b/indexing-hadoop/src/test/java/org/apache/druid/indexer/HadoopDruidIndexerConfigTest.java index 8aead05d625b..ed47d180b432 100644 --- a/indexing-hadoop/src/test/java/org/apache/druid/indexer/HadoopDruidIndexerConfigTest.java +++ b/indexing-hadoop/src/test/java/org/apache/druid/indexer/HadoopDruidIndexerConfigTest.java @@ -37,7 +37,6 @@ import org.apache.druid.java.util.common.DateTimes; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec; import org.apache.druid.timeline.partition.HashBasedNumberedShardSpec; @@ -217,18 +216,18 @@ public void testGetTargetPartitionSizeWithSingleDimensionPartitionsMaxRowsPerSeg private static class HadoopIngestionSpecBuilder { - private static final DataSchema DATA_SCHEMA = new DataSchema( - "foo", - null, - new AggregatorFactory[0], - new UniformGranularitySpec( - Granularities.MINUTE, - Granularities.MINUTE, - ImmutableList.of(Intervals.of("2010-01-01/P1D")) - ), - null, - HadoopDruidIndexerConfigTest.JSON_MAPPER - ); + private static final DataSchema DATA_SCHEMA = + DataSchema.builder() + .withDataSource("foo") + .withGranularity( + new UniformGranularitySpec( + Granularities.MINUTE, + Granularities.MINUTE, + ImmutableList.of(Intervals.of("2010-01-01/P1D")) + ) + ) + .withObjectMapper(HadoopDruidIndexerConfigTest.JSON_MAPPER) + .build(); private static final HadoopIOConfig HADOOP_IO_CONFIG = new HadoopIOConfig( ImmutableMap.of("paths", "bar", "type", "static"), diff --git a/indexing-hadoop/src/test/java/org/apache/druid/indexer/HadoopDruidIndexerMapperTest.java b/indexing-hadoop/src/test/java/org/apache/druid/indexer/HadoopDruidIndexerMapperTest.java index db20ed8a1847..da57b8ccf4a3 100644 --- a/indexing-hadoop/src/test/java/org/apache/druid/indexer/HadoopDruidIndexerMapperTest.java +++ b/indexing-hadoop/src/test/java/org/apache/druid/indexer/HadoopDruidIndexerMapperTest.java @@ -31,7 +31,6 @@ import org.apache.druid.java.util.common.jackson.JacksonUtils; import org.apache.druid.java.util.common.parsers.JSONPathSpec; import org.apache.druid.math.expr.ExprMacroTable; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.query.filter.SelectorDimFilter; import org.apache.druid.segment.TestHelper; @@ -58,27 +57,29 @@ public class HadoopDruidIndexerMapperTest { private static final ObjectMapper JSON_MAPPER = TestHelper.makeJsonMapper(); - private static final 
DataSchema DATA_SCHEMA = new DataSchema( - "test_ds", - JSON_MAPPER.convertValue( - new HadoopyStringInputRowParser( - new JSONParseSpec( - new TimestampSpec("t", "auto", null), - new DimensionsSpec( - DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim1t", "dim2")) - ), - new JSONPathSpec(true, ImmutableList.of()), - ImmutableMap.of(), - null - ) - ), - JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT - ), - new AggregatorFactory[]{new CountAggregatorFactory("rows")}, - new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null), - null, - JSON_MAPPER - ); + private static final DataSchema DATA_SCHEMA = + DataSchema.builder() + .withDataSource("test_ds") + .withParserMap( + JSON_MAPPER.convertValue( + new HadoopyStringInputRowParser( + new JSONParseSpec( + new TimestampSpec("t", "auto", null), + new DimensionsSpec( + DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim1t", "dim2")) + ), + new JSONPathSpec(true, ImmutableList.of()), + ImmutableMap.of(), + null + ) + ), + JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT + ) + ) + .withAggregators(new CountAggregatorFactory("rows")) + .withGranularity(new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null)) + .withObjectMapper(JSON_MAPPER) + .build(); private static final HadoopIOConfig IO_CONFIG = new HadoopIOConfig( JSON_MAPPER.convertValue( diff --git a/indexing-hadoop/src/test/java/org/apache/druid/indexer/HadoopIngestionSpecUpdateDatasourcePathSpecSegmentsTest.java b/indexing-hadoop/src/test/java/org/apache/druid/indexer/HadoopIngestionSpecUpdateDatasourcePathSpecSegmentsTest.java index 6402721e73c6..afcfb4023595 100644 --- a/indexing-hadoop/src/test/java/org/apache/druid/indexer/HadoopIngestionSpecUpdateDatasourcePathSpecSegmentsTest.java +++ b/indexing-hadoop/src/test/java/org/apache/druid/indexer/HadoopIngestionSpecUpdateDatasourcePathSpecSegmentsTest.java @@ -34,7 +34,6 @@ import org.apache.druid.jackson.DefaultObjectMapper; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec; import org.apache.druid.timeline.DataSegment; @@ -274,18 +273,17 @@ private HadoopDruidIndexerConfig testRunUpdateSegmentListIfDatasourcePathSpecIsU throws Exception { HadoopIngestionSpec spec = new HadoopIngestionSpec( - new DataSchema( - "foo", - null, - new AggregatorFactory[0], - new UniformGranularitySpec( - Granularities.DAY, - null, - ImmutableList.of(Intervals.of("2010-01-01/P1D")) - ), - null, - jsonMapper - ), + DataSchema.builder() + .withDataSource("foo") + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + null, + ImmutableList.of(Intervals.of("2010-01-01/P1D")) + ) + ) + .withObjectMapper(jsonMapper) + .build(), new HadoopIOConfig( jsonMapper.convertValue(datasourcePathSpec, Map.class), null, diff --git a/indexing-hadoop/src/test/java/org/apache/druid/indexer/IndexGeneratorCombinerTest.java b/indexing-hadoop/src/test/java/org/apache/druid/indexer/IndexGeneratorCombinerTest.java index 385c28ff0fb0..37cdbb7300d7 100644 --- a/indexing-hadoop/src/test/java/org/apache/druid/indexer/IndexGeneratorCombinerTest.java +++ b/indexing-hadoop/src/test/java/org/apache/druid/indexer/IndexGeneratorCombinerTest.java @@ -64,30 +64,33 @@ public void setUp() throws Exception { HadoopDruidIndexerConfig config = new HadoopDruidIndexerConfig( new 
HadoopIngestionSpec( - new DataSchema( - "website", - HadoopDruidIndexerConfig.JSON_MAPPER.convertValue( - new StringInputRowParser( - new TimeAndDimsParseSpec( - new TimestampSpec("timestamp", "yyyyMMddHH", null), - new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host", "keywords"))) - ), - null - ), - Map.class - ), - new AggregatorFactory[]{ - new LongSumAggregatorFactory("visited_sum", "visited"), - new HyperUniquesAggregatorFactory("unique_hosts", "host") - }, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.NONE, - ImmutableList.of(Intervals.of("2010/2011")) - ), - null, - HadoopDruidIndexerConfig.JSON_MAPPER - ), + DataSchema.builder() + .withDataSource("website") + .withParserMap( + HadoopDruidIndexerConfig.JSON_MAPPER.convertValue( + new StringInputRowParser( + new TimeAndDimsParseSpec( + new TimestampSpec("timestamp", "yyyyMMddHH", null), + new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host", "keywords"))) + ), + null + ), + Map.class + ) + ) + .withAggregators( + new LongSumAggregatorFactory("visited_sum", "visited"), + new HyperUniquesAggregatorFactory("unique_hosts", "host") + ) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.NONE, + ImmutableList.of(Intervals.of("2010/2011")) + ) + ) + .withObjectMapper(HadoopDruidIndexerConfig.JSON_MAPPER) + .build(), new HadoopIOConfig( ImmutableMap.of( "paths", diff --git a/indexing-hadoop/src/test/java/org/apache/druid/indexer/IndexGeneratorJobTest.java b/indexing-hadoop/src/test/java/org/apache/druid/indexer/IndexGeneratorJobTest.java index e14ade454f4c..241746ca58d6 100644 --- a/indexing-hadoop/src/test/java/org/apache/druid/indexer/IndexGeneratorJobTest.java +++ b/indexing-hadoop/src/test/java/org/apache/druid/indexer/IndexGeneratorJobTest.java @@ -506,17 +506,19 @@ public void setUp() throws Exception config = new HadoopDruidIndexerConfig( new HadoopIngestionSpec( - new DataSchema( - datasourceName, - mapper.convertValue( - inputRowParser, - Map.class - ), - aggs, - new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, ImmutableList.of(this.interval)), - null, - mapper - ), + DataSchema.builder() + .withDataSource(datasourceName) + .withParserMap(mapper.convertValue(inputRowParser, Map.class)) + .withAggregators(aggs) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.NONE, + ImmutableList.of(interval) + ) + ) + .withObjectMapper(mapper) + .build(), new HadoopIOConfig( ImmutableMap.copyOf(inputSpec), null, diff --git a/indexing-hadoop/src/test/java/org/apache/druid/indexer/JobHelperTest.java b/indexing-hadoop/src/test/java/org/apache/druid/indexer/JobHelperTest.java index 7069e9a78de3..530c0f657a51 100644 --- a/indexing-hadoop/src/test/java/org/apache/druid/indexer/JobHelperTest.java +++ b/indexing-hadoop/src/test/java/org/apache/druid/indexer/JobHelperTest.java @@ -34,7 +34,6 @@ import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.jackson.JacksonUtils; import org.apache.druid.java.util.common.parsers.JSONPathSpec; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.query.aggregation.LongSumAggregatorFactory; import org.apache.druid.segment.TestHelper; @@ -68,27 +67,30 @@ public class JobHelperTest { private static final ObjectMapper JSON_MAPPER = TestHelper.makeJsonMapper(); - private static final DataSchema DATA_SCHEMA = new 
DataSchema( - "test_ds", - JSON_MAPPER.convertValue( - new HadoopyStringInputRowParser( - new JSONParseSpec( - new TimestampSpec("t", "auto", null), - new DimensionsSpec( - DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim1t", "dim2")) - ), - new JSONPathSpec(true, ImmutableList.of()), - ImmutableMap.of(), - null - ) - ), - JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT - ), - new AggregatorFactory[]{new CountAggregatorFactory("rows")}, - new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null), - null, - JSON_MAPPER - ); + + private static final DataSchema DATA_SCHEMA = + DataSchema.builder() + .withDataSource("test_ds") + .withParserMap( + JSON_MAPPER.convertValue( + new HadoopyStringInputRowParser( + new JSONParseSpec( + new TimestampSpec("t", "auto", null), + new DimensionsSpec( + DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim1t", "dim2")) + ), + new JSONPathSpec(true, ImmutableList.of()), + ImmutableMap.of(), + null + ) + ), + JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT + ) + ) + .withAggregators(new CountAggregatorFactory("rows")) + .withGranularity(new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null)) + .withObjectMapper(JSON_MAPPER) + .build(); private static final HadoopIOConfig IO_CONFIG = new HadoopIOConfig( JSON_MAPPER.convertValue( @@ -123,27 +125,34 @@ public void setup() throws Exception dataFile = temporaryFolder.newFile(); config = new HadoopDruidIndexerConfig( new HadoopIngestionSpec( - new DataSchema( - "website", - HadoopDruidIndexerConfig.JSON_MAPPER.convertValue( - new StringInputRowParser( - new CSVParseSpec( - new TimestampSpec("timestamp", "yyyyMMddHH", null), - new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host"))), - null, - ImmutableList.of("timestamp", "host", "visited_num"), - false, - 0 - ), - null - ), - Map.class - ), - new AggregatorFactory[]{new LongSumAggregatorFactory("visited_num", "visited_num")}, - new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, ImmutableList.of(this.interval)), - null, - HadoopDruidIndexerConfig.JSON_MAPPER - ), + DataSchema.builder() + .withDataSource("website") + .withParserMap( + HadoopDruidIndexerConfig.JSON_MAPPER.convertValue( + new StringInputRowParser( + new CSVParseSpec( + new TimestampSpec("timestamp", "yyyyMMddHH", null), + new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host"))), + null, + ImmutableList.of("timestamp", "host", "visited_num"), + false, + 0 + ), + null + ), + Map.class + ) + ) + .withAggregators(new LongSumAggregatorFactory("visited_num", "visited_num")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.NONE, + ImmutableList.of(this.interval) + ) + ) + .withObjectMapper(HadoopDruidIndexerConfig.JSON_MAPPER) + .build(), new HadoopIOConfig( ImmutableMap.of( "paths", diff --git a/indexing-hadoop/src/test/java/org/apache/druid/indexer/path/DatasourcePathSpecTest.java b/indexing-hadoop/src/test/java/org/apache/druid/indexer/path/DatasourcePathSpecTest.java index e8caea0256e0..75a4fe45eee6 100644 --- a/indexing-hadoop/src/test/java/org/apache/druid/indexer/path/DatasourcePathSpecTest.java +++ b/indexing-hadoop/src/test/java/org/apache/druid/indexer/path/DatasourcePathSpecTest.java @@ -44,7 +44,6 @@ import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import 
org.apache.druid.query.aggregation.LongSumAggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec; @@ -308,33 +307,34 @@ private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig() { return new HadoopDruidIndexerConfig( new HadoopIngestionSpec( - new DataSchema( - ingestionSpec1.getDataSource(), - HadoopDruidIndexerConfig.JSON_MAPPER.convertValue( - new StringInputRowParser( - new CSVParseSpec( - new TimestampSpec("timestamp", "yyyyMMddHH", null), - DimensionsSpec.EMPTY, - null, - ImmutableList.of("timestamp", "host", "visited"), - false, - 0 - ), - null - ), - Map.class - ), - new AggregatorFactory[]{ - new LongSumAggregatorFactory("visited_sum", "visited") - }, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.NONE, - ImmutableList.of(Intervals.of("2000/3000")) - ), - null, - HadoopDruidIndexerConfig.JSON_MAPPER - ), + DataSchema.builder() + .withDataSource(ingestionSpec1.getDataSource()) + .withParserMap( + HadoopDruidIndexerConfig.JSON_MAPPER.convertValue( + new StringInputRowParser( + new CSVParseSpec( + new TimestampSpec("timestamp", "yyyyMMddHH", null), + DimensionsSpec.EMPTY, + null, + ImmutableList.of("timestamp", "host", "visited"), + false, + 0 + ), + null + ), + Map.class + ) + ) + .withAggregators(new LongSumAggregatorFactory("visited_sum", "visited")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.NONE, + ImmutableList.of(Intervals.of("2000/3000")) + ) + ) + .withObjectMapper(HadoopDruidIndexerConfig.JSON_MAPPER) + .build(), new HadoopIOConfig( ImmutableMap.of( "paths", diff --git a/indexing-hadoop/src/test/java/org/apache/druid/indexer/path/GranularityPathSpecTest.java b/indexing-hadoop/src/test/java/org/apache/druid/indexer/path/GranularityPathSpecTest.java index 8af77ca0e4fd..92bd8595560c 100644 --- a/indexing-hadoop/src/test/java/org/apache/druid/indexer/path/GranularityPathSpecTest.java +++ b/indexing-hadoop/src/test/java/org/apache/druid/indexer/path/GranularityPathSpecTest.java @@ -34,7 +34,6 @@ import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.granularity.Granularity; import org.apache.druid.java.util.common.granularity.PeriodGranularity; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec; import org.apache.hadoop.mapreduce.Job; @@ -152,18 +151,17 @@ public void testAddInputPath() throws Exception { UserGroupInformation.setLoginUser(UserGroupInformation.createUserForTesting("test", new String[]{"testGroup"})); HadoopIngestionSpec spec = new HadoopIngestionSpec( - new DataSchema( - "foo", - null, - new AggregatorFactory[0], - new UniformGranularitySpec( - Granularities.DAY, - Granularities.MINUTE, - ImmutableList.of(Intervals.of("2015-11-06T00:00Z/2015-11-07T00:00Z")) - ), - null, - jsonMapper - ), + DataSchema.builder() + .withDataSource("foo") + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.MINUTE, + ImmutableList.of(Intervals.of("2015-11-06T00:00Z/2015-11-07T00:00Z")) + ) + ) + .withObjectMapper(jsonMapper) + .build(), new HadoopIOConfig(null, null, null), DEFAULT_TUNING_CONFIG ); @@ -204,18 +202,17 @@ public void testIntervalTrimming() throws Exception { UserGroupInformation.setLoginUser(UserGroupInformation.createUserForTesting("test", new String[]{"testGroup"})); 
HadoopIngestionSpec spec = new HadoopIngestionSpec( - new DataSchema( - "foo", - null, - new AggregatorFactory[0], - new UniformGranularitySpec( - Granularities.DAY, - Granularities.ALL, - ImmutableList.of(Intervals.of("2015-01-01T11Z/2015-01-02T05Z")) - ), - null, - jsonMapper - ), + DataSchema.builder() + .withDataSource("foo") + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.ALL, + ImmutableList.of(Intervals.of("2015-01-01T11Z/2015-01-02T05Z")) + ) + ) + .withObjectMapper(jsonMapper) + .build(), new HadoopIOConfig(null, null, null), DEFAULT_TUNING_CONFIG ); diff --git a/indexing-hadoop/src/test/java/org/apache/druid/indexer/path/StaticPathSpecTest.java b/indexing-hadoop/src/test/java/org/apache/druid/indexer/path/StaticPathSpecTest.java index 06a1416ad83f..fa1b2f59f48e 100644 --- a/indexing-hadoop/src/test/java/org/apache/druid/indexer/path/StaticPathSpecTest.java +++ b/indexing-hadoop/src/test/java/org/apache/druid/indexer/path/StaticPathSpecTest.java @@ -24,7 +24,6 @@ import org.apache.druid.indexer.HadoopIOConfig; import org.apache.druid.indexer.HadoopIngestionSpec; import org.apache.druid.jackson.DefaultObjectMapper; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.MultipleInputs; @@ -54,7 +53,7 @@ public void testAddingPaths() throws Exception Job job = new Job(); StaticPathSpec pathSpec = new StaticPathSpec("/a/c,/a/b/{c,d}", null); - DataSchema schema = new DataSchema("ds", null, new AggregatorFactory[0], null, null, jsonMapper); + DataSchema schema = DataSchema.builder().withDataSource("ds").withObjectMapper(jsonMapper).build(); HadoopIOConfig io = new HadoopIOConfig(null, null, null); pathSpec.addInputPaths(new HadoopDruidIndexerConfig(new HadoopIngestionSpec(schema, io, null)), job); diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java index f98287124ed0..8b4795d75f51 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/sampler/InputSourceSampler.java @@ -67,14 +67,13 @@ public class InputSourceSampler { private static final String SAMPLER_DATA_SOURCE = "sampler"; - private static final DataSchema DEFAULT_DATA_SCHEMA = new DataSchema( - SAMPLER_DATA_SOURCE, - new TimestampSpec(null, null, null), - new DimensionsSpec(null), - null, - null, - null - ); + + private static final DataSchema DEFAULT_DATA_SCHEMA = + DataSchema.builder() + .withDataSource(SAMPLER_DATA_SOURCE) + .withTimestamp(new TimestampSpec(null, null, null)) + .withDimensions(DimensionsSpec.builder().build()) + .build(); // We want to be able to sort the list of processed results back into the same order that we read them from the input // source so that the rows in the data loader are not always changing. 
To do this, we add a temporary column to the diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/TestIndexTask.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/TestIndexTask.java index f6732f68a6c4..ff0aa674ef3f 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/TestIndexTask.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/TestIndexTask.java @@ -33,7 +33,6 @@ import org.apache.druid.indexing.common.task.TaskResource; import org.apache.druid.indexing.common.task.TuningConfigBuilder; import org.apache.druid.indexing.overlord.SegmentPublishResult; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.IndexSpec; import org.apache.druid.segment.SegmentSchemaMapping; import org.apache.druid.segment.indexing.DataSchema; @@ -62,7 +61,7 @@ public TestIndexTask( id, taskResource, new IndexIngestionSpec( - new DataSchema(dataSource, null, new AggregatorFactory[]{}, null, null, mapper), + DataSchema.builder().withDataSource(dataSource).withObjectMapper(mapper).build(), new IndexTask.IndexIOConfig( new LocalInputSource(new File("lol"), "rofl"), new JsonInputFormat(null, null, null, null, null), diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskParallelRunTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskParallelRunTest.java index 1b742971eb95..377f4ece0657 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskParallelRunTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskParallelRunTest.java @@ -939,18 +939,19 @@ private void runIndexTask(@Nullable PartitionsSpec partitionsSpec, boolean appen null, null, new ParallelIndexIngestionSpec( - new DataSchema( - DATA_SOURCE, - new TimestampSpec("ts", "auto", null), - new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList("ts", "dim"))), - new AggregatorFactory[]{new LongSumAggregatorFactory("val", "val")}, - new UniformGranularitySpec( - Granularities.HOUR, - Granularities.MINUTE, - ImmutableList.of(INTERVAL_TO_INDEX) - ), - null - ), + DataSchema.builder() + .withDataSource(DATA_SOURCE) + .withTimestamp(new TimestampSpec("ts", "auto", null)) + .withDimensions(DimensionsSpec.getDefaultSchemas(Arrays.asList("ts", "dim"))) + .withAggregators(new LongSumAggregatorFactory("val", "val")) + .withGranularity( + new UniformGranularitySpec( + Granularities.HOUR, + Granularities.MINUTE, + ImmutableList.of(INTERVAL_TO_INDEX) + ) + ) + .build(), ioConfig, tuningConfig ), diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/HadoopIndexTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/HadoopIndexTaskTest.java index ff828f16789d..0a72b77e1db7 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/HadoopIndexTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/HadoopIndexTaskTest.java @@ -27,7 +27,6 @@ import org.apache.druid.jackson.DefaultObjectMapper; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec; import org.apache.druid.server.security.Action; @@ -50,15 +49,19 
@@ public void testCorrectInputSourceResources() final HadoopIndexTask task = new HadoopIndexTask( null, new HadoopIngestionSpec( - new DataSchema( - "foo", null, new AggregatorFactory[0], new UniformGranularitySpec( - Granularities.DAY, - null, - ImmutableList.of(Intervals.of("2010-01-01/P1D")) - ), - null, - jsonMapper - ), new HadoopIOConfig(ImmutableMap.of("paths", "bar"), null, null), null + DataSchema.builder() + .withDataSource("foo") + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + null, + ImmutableList.of(Intervals.of("2010-01-01/P1D")) + ) + ) + .withObjectMapper(jsonMapper) + .build(), + new HadoopIOConfig(ImmutableMap.of("paths", "bar"), null, null), + null ), null, null, diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IndexIngestionSpecTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IndexIngestionSpecTest.java index ab953ba954ac..d84aa154fd26 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IndexIngestionSpecTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IndexIngestionSpecTest.java @@ -25,7 +25,6 @@ import org.apache.druid.indexing.common.task.IndexTask.IndexIOConfig; import org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec; import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec; import org.junit.Rule; @@ -45,14 +44,11 @@ public void testParserAndInputFormat() "Cannot use parser and inputSource together. Try using inputFormat instead of parser." ); final IndexIngestionSpec spec = new IndexIngestionSpec( - new DataSchema( - "dataSource", - ImmutableMap.of("fake", "parser map"), - new AggregatorFactory[0], - new ArbitraryGranularitySpec(Granularities.NONE, null), - null, - null - ), + DataSchema.builder() + .withDataSource("dataSource") + .withParserMap(ImmutableMap.of("fake", "parser map")) + .withGranularity(new ArbitraryGranularitySpec(Granularities.NONE, null)) + .build(), new IndexIOConfig( new NoopInputSource(), new NoopInputFormat(), @@ -69,14 +65,11 @@ public void testParserAndInputSource() expectedException.expect(IllegalArgumentException.class); expectedException.expectMessage("Cannot use parser and inputSource together."); final IndexIngestionSpec spec = new IndexIngestionSpec( - new DataSchema( - "dataSource", - ImmutableMap.of("fake", "parser map"), - new AggregatorFactory[0], - new ArbitraryGranularitySpec(Granularities.NONE, null), - null, - null - ), + DataSchema.builder() + .withDataSource("dataSource") + .withParserMap(ImmutableMap.of("fake", "parser map")) + .withGranularity(new ArbitraryGranularitySpec(Granularities.NONE, null)) + .build(), new IndexIOConfig( new NoopInputSource(), null, diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IndexTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IndexTaskTest.java index d03ccf465e57..defa2107b9ac 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IndexTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/IndexTaskTest.java @@ -166,6 +166,25 @@ public class IndexTaskTest extends IngestionTestBase 0 ); + private static final DataSchema DATA_SCHEMA = + DataSchema.builder() + .withDataSource("test-json") + 
.withTimestamp(DEFAULT_TIMESTAMP_SPEC) + .withDimensions( + new StringDimensionSchema("ts"), + new StringDimensionSchema("dim"), + new LongDimensionSchema("valDim") + ) + .withAggregators(new LongSumAggregatorFactory("valMet", "val")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.MINUTE, + Collections.singletonList(Intervals.of("2014/P1D")) + ) + ) + .build(); + @Parameterized.Parameters(name = "{0}, useInputFormatApi={1}") public static Iterable constructorFeeder() { @@ -225,24 +244,7 @@ public void testCorrectInputSourceResources() { IndexTask indexTask = createIndexTask( new IndexIngestionSpec( - new DataSchema( - "test-json", - DEFAULT_TIMESTAMP_SPEC, - new DimensionsSpec( - ImmutableList.of( - new StringDimensionSchema("ts"), - new StringDimensionSchema("dim"), - new LongDimensionSchema("valDim") - ) - ), - new AggregatorFactory[]{new LongSumAggregatorFactory("valMet", "val")}, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.MINUTE, - Collections.singletonList(Intervals.of("2014/P1D")) - ), - null - ), + DATA_SCHEMA, new IndexIOConfig( new LocalInputSource(tmpDir, "druid*"), DEFAULT_INPUT_FORMAT, @@ -275,24 +277,7 @@ public void testIngestNullOnlyColumns() throws Exception IndexTask indexTask = createIndexTask( new IndexIngestionSpec( - new DataSchema( - "test-json", - DEFAULT_TIMESTAMP_SPEC, - new DimensionsSpec( - ImmutableList.of( - new StringDimensionSchema("ts"), - new StringDimensionSchema("dim"), - new LongDimensionSchema("valDim") - ) - ), - new AggregatorFactory[]{new LongSumAggregatorFactory("valMet", "val")}, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.MINUTE, - Collections.singletonList(Intervals.of("2014/P1D")) - ), - null - ), + DATA_SCHEMA, new IndexIOConfig( new LocalInputSource(tmpDir, "druid*"), DEFAULT_INPUT_FORMAT, @@ -337,24 +322,7 @@ public void testIngestNullOnlyColumns_storeEmptyColumnsOff_shouldNotStoreEmptyCo IndexTask indexTask = createIndexTask( new IndexIngestionSpec( - new DataSchema( - "test-json", - DEFAULT_TIMESTAMP_SPEC, - new DimensionsSpec( - ImmutableList.of( - new StringDimensionSchema("ts"), - new StringDimensionSchema("dim"), - new LongDimensionSchema("valDim") - ) - ), - new AggregatorFactory[]{new LongSumAggregatorFactory("valMet", "val")}, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.MINUTE, - Collections.singletonList(Intervals.of("2014/P1D")) - ), - null - ), + DATA_SCHEMA, new IndexIOConfig( new LocalInputSource(tmpDir, "druid*"), DEFAULT_INPUT_FORMAT, @@ -2698,20 +2666,20 @@ private static IndexIngestionSpec createIngestionSpec( if (inputFormat != null) { Preconditions.checkArgument(parseSpec == null, "Can't use parseSpec"); return new IndexIngestionSpec( - new DataSchema( - DATASOURCE, - Preconditions.checkNotNull(timestampSpec, "timestampSpec"), - Preconditions.checkNotNull(dimensionsSpec, "dimensionsSpec"), - new AggregatorFactory[]{ - new LongSumAggregatorFactory("val", "val") - }, - granularitySpec != null ? granularitySpec : new UniformGranularitySpec( - Granularities.DAY, - Granularities.MINUTE, - Collections.singletonList(Intervals.of("2014/2015")) - ), - transformSpec - ), + DataSchema.builder() + .withDataSource(DATASOURCE) + .withTimestamp(Preconditions.checkNotNull(timestampSpec, "timestampSpec")) + .withDimensions(Preconditions.checkNotNull(dimensionsSpec, "dimensionsSpec")) + .withAggregators(new LongSumAggregatorFactory("val", "val")) + .withGranularity( + granularitySpec != null ? 
granularitySpec : new UniformGranularitySpec( + Granularities.DAY, + Granularities.MINUTE, + Collections.singletonList(Intervals.of("2014/2015")) + ) + ) + .withTransform(transformSpec) + .build(), new IndexIOConfig( new LocalInputSource(baseDir, "druid*"), inputFormat, @@ -2723,22 +2691,21 @@ private static IndexIngestionSpec createIngestionSpec( } else { parseSpec = parseSpec != null ? parseSpec : DEFAULT_PARSE_SPEC; return new IndexIngestionSpec( - new DataSchema( - DATASOURCE, - parseSpec.getTimestampSpec(), - parseSpec.getDimensionsSpec(), - new AggregatorFactory[]{ - new LongSumAggregatorFactory("val", "val") - }, - granularitySpec != null ? granularitySpec : new UniformGranularitySpec( - Granularities.DAY, - Granularities.MINUTE, - Collections.singletonList(Intervals.of("2014/2015")) - ), - transformSpec, - null, - objectMapper - ), + DataSchema.builder() + .withDataSource(DATASOURCE) + .withTimestamp(parseSpec.getTimestampSpec()) + .withDimensions(parseSpec.getDimensionsSpec()) + .withAggregators(new LongSumAggregatorFactory("val", "val")) + .withGranularity( + granularitySpec != null ? granularitySpec : new UniformGranularitySpec( + Granularities.DAY, + Granularities.MINUTE, + Collections.singletonList(Intervals.of("2014/2015")) + ) + ) + .withTransform(transformSpec) + .withObjectMapper(objectMapper) + .build(), new IndexIOConfig( new LocalInputSource(baseDir, "druid*"), createInputFormatFromParseSpec(parseSpec), diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/TaskSerdeTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/TaskSerdeTest.java index e6ea0e1329ae..99b0f8e7a759 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/TaskSerdeTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/TaskSerdeTest.java @@ -38,7 +38,6 @@ import org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexTuningConfig; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.DoubleSumAggregatorFactory; import org.apache.druid.segment.IndexSpec; import org.apache.druid.segment.indexing.DataSchema; @@ -220,18 +219,19 @@ public void testIndexTaskSerde() throws Exception null, null, new IndexIngestionSpec( - new DataSchema( - "foo", - new TimestampSpec(null, null, null), - DimensionsSpec.EMPTY, - new AggregatorFactory[]{new DoubleSumAggregatorFactory("met", "met")}, - new UniformGranularitySpec( - Granularities.DAY, - null, - ImmutableList.of(Intervals.of("2010-01-01/P2D")) - ), - null - ), + DataSchema.builder() + .withDataSource("foo") + .withTimestamp(new TimestampSpec(null, null, null)) + .withDimensions(DimensionsSpec.EMPTY) + .withAggregators(new DoubleSumAggregatorFactory("met", "met")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + null, + ImmutableList.of(Intervals.of("2010-01-01/P2D")) + ) + ) + .build(), new IndexIOConfig(new LocalInputSource(new File("lol"), "rofl"), new NoopInputFormat(), true, false), TuningConfigBuilder.forIndexTask() .withMaxRowsInMemory(10) @@ -288,18 +288,19 @@ public void testIndexTaskwithResourceSerde() throws Exception null, new TaskResource("rofl", 2), new IndexIngestionSpec( - new DataSchema( - "foo", - new TimestampSpec(null, null, null), - DimensionsSpec.EMPTY, - new AggregatorFactory[]{new DoubleSumAggregatorFactory("met", "met")}, - new 
UniformGranularitySpec( - Granularities.DAY, - null, - ImmutableList.of(Intervals.of("2010-01-01/P2D")) - ), - null - ), + DataSchema.builder() + .withDataSource("foo") + .withTimestamp(new TimestampSpec(null, null, null)) + .withDimensions(DimensionsSpec.EMPTY) + .withAggregators(new DoubleSumAggregatorFactory("met", "met")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + null, + ImmutableList.of(Intervals.of("2010-01-01/P2D")) + ) + ) + .build(), new IndexIOConfig(new LocalInputSource(new File("lol"), "rofl"), new NoopInputFormat(), true, false), TuningConfigBuilder.forIndexTask() .withMaxRowsInMemory(10) @@ -412,15 +413,19 @@ public void testHadoopIndexTaskSerde() throws Exception final HadoopIndexTask task = new HadoopIndexTask( null, new HadoopIngestionSpec( - new DataSchema( - "foo", null, new AggregatorFactory[0], new UniformGranularitySpec( - Granularities.DAY, - null, - ImmutableList.of(Intervals.of("2010-01-01/P1D")) - ), - null, - jsonMapper - ), new HadoopIOConfig(ImmutableMap.of("paths", "bar"), null, null), null + DataSchema.builder() + .withDataSource("foo") + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + null, + ImmutableList.of(Intervals.of("2010-01-01/P1D")) + ) + ) + .withObjectMapper(jsonMapper) + .build(), + new HadoopIOConfig(ImmutableMap.of("paths", "bar"), null, null), + null ), null, null, @@ -454,19 +459,18 @@ public void testHadoopIndexTaskWithContextSerde() throws Exception final HadoopIndexTask task = new HadoopIndexTask( null, new HadoopIngestionSpec( - new DataSchema( - "foo", - null, - null, - new AggregatorFactory[0], - new UniformGranularitySpec( - Granularities.DAY, - null, ImmutableList.of(Intervals.of("2010-01-01/P1D")) - ), - null, - null, - jsonMapper - ), new HadoopIOConfig(ImmutableMap.of("paths", "bar"), null, null), null + DataSchema.builder() + .withDataSource("foo") + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + null, ImmutableList.of(Intervals.of("2010-01-01/P1D")) + ) + ) + .withObjectMapper(jsonMapper) + .build(), + new HadoopIOConfig(ImmutableMap.of("paths", "bar"), null, null), + null ), null, null, diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/AbstractMultiPhaseParallelIndexingTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/AbstractMultiPhaseParallelIndexingTest.java index 44b8284f407d..6caab0a0652e 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/AbstractMultiPhaseParallelIndexingTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/AbstractMultiPhaseParallelIndexingTest.java @@ -221,14 +221,13 @@ protected ParallelIndexSupervisorTask createTask( dropExisting ); ingestionSpec = new ParallelIndexIngestionSpec( - new DataSchema( - DATASOURCE, - timestampSpec, - dimensionsSpec, - DEFAULT_METRICS_SPEC, - granularitySpec, - null - ), + DataSchema.builder() + .withDataSource(DATASOURCE) + .withTimestamp(timestampSpec) + .withDimensions(dimensionsSpec) + .withAggregators(DEFAULT_METRICS_SPEC) + .withGranularity(granularitySpec) + .build(), ioConfig, tuningConfig ); @@ -241,14 +240,13 @@ protected ParallelIndexSupervisorTask createTask( dropExisting ); ingestionSpec = new ParallelIndexIngestionSpec( - new DataSchema( - DATASOURCE, - parseSpec.getTimestampSpec(), - parseSpec.getDimensionsSpec(), - DEFAULT_METRICS_SPEC, - granularitySpec, - null - ), + DataSchema.builder() + 
.withDataSource(DATASOURCE) + .withTimestamp(parseSpec.getTimestampSpec()) + .withDimensions(parseSpec.getDimensionsSpec()) + .withAggregators(DEFAULT_METRICS_SPEC) + .withGranularity(granularitySpec) + .build(), ioConfig, tuningConfig ); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/HashPartitionTaskKillTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/HashPartitionTaskKillTest.java index b8c59d042a31..a21dbb84616b 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/HashPartitionTaskKillTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/HashPartitionTaskKillTest.java @@ -41,7 +41,6 @@ import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.guava.Comparators; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.LongSumAggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.GranularitySpec; @@ -250,14 +249,13 @@ private ParallelIndexSupervisorTask createTestTask( null ); ingestionSpec = new ParallelIndexIngestionSpec( - new DataSchema( - DATASOURCE, - timestampSpec, - dimensionsSpec, - new AggregatorFactory[]{new LongSumAggregatorFactory("val", "val")}, - granularitySpec, - null - ), + DataSchema.builder() + .withDataSource(DATASOURCE) + .withTimestamp(timestampSpec) + .withDimensions(dimensionsSpec) + .withAggregators(new LongSumAggregatorFactory("val", "val")) + .withGranularity(granularitySpec) + .build(), ioConfig, tuningConfig ); @@ -271,16 +269,13 @@ private ParallelIndexSupervisorTask createTestTask( ); //noinspection unchecked ingestionSpec = new ParallelIndexIngestionSpec( - new DataSchema( - "dataSource", - parseSpec.getTimestampSpec(), - parseSpec.getDimensionsSpec(), - new AggregatorFactory[]{ - new LongSumAggregatorFactory("val", "val") - }, - granularitySpec, - null - ), + DataSchema.builder() + .withDataSource("dataSource") + .withTimestamp(parseSpec.getTimestampSpec()) + .withDimensions(parseSpec.getDimensionsSpec()) + .withAggregators(new LongSumAggregatorFactory("val", "val")) + .withGranularity(granularitySpec) + .build(), ioConfig, tuningConfig ); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/MultiPhaseParallelIndexingWithNullColumnTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/MultiPhaseParallelIndexingWithNullColumnTest.java index 0d19cd86e03c..3adc154bb226 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/MultiPhaseParallelIndexingWithNullColumnTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/MultiPhaseParallelIndexingWithNullColumnTest.java @@ -127,18 +127,19 @@ public void testIngestNullColumn() throws JsonProcessingException null, null, new ParallelIndexIngestionSpec( - new DataSchema( - DATASOURCE, - TIMESTAMP_SPEC, - DIMENSIONS_SPEC.withDimensions(dimensionSchemas), - DEFAULT_METRICS_SPEC, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.MINUTE, - INTERVAL_TO_INDEX - ), - null - ), + DataSchema.builder() + .withDataSource(DATASOURCE) + .withTimestamp(TIMESTAMP_SPEC) + 
.withDimensions(DIMENSIONS_SPEC.withDimensions(dimensionSchemas)) + .withAggregators(DEFAULT_METRICS_SPEC) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.MINUTE, + INTERVAL_TO_INDEX + ) + ) + .build(), new ParallelIndexIOConfig( getInputSource(), JSON_FORMAT, @@ -177,18 +178,21 @@ public void testIngestNullColumn_useFieldDiscovery_includeAllDimensions_shouldSt null, null, new ParallelIndexIngestionSpec( - new DataSchema( - DATASOURCE, - TIMESTAMP_SPEC, - new DimensionsSpec.Builder().setDimensions(dimensionSchemas).setIncludeAllDimensions(true).build(), - DEFAULT_METRICS_SPEC, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.MINUTE, - INTERVAL_TO_INDEX - ), - null - ), + DataSchema.builder() + .withDataSource(DATASOURCE) + .withTimestamp(TIMESTAMP_SPEC) + .withDimensions( + DimensionsSpec.builder().setDimensions(dimensionSchemas).setIncludeAllDimensions(true).build() + ) + .withAggregators(DEFAULT_METRICS_SPEC) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.MINUTE, + INTERVAL_TO_INDEX + ) + ) + .build(), new ParallelIndexIOConfig( getInputSource(), new JsonInputFormat( @@ -237,18 +241,21 @@ public void testIngestNullColumn_explicitPathSpec_useFieldDiscovery_includeAllDi null, null, new ParallelIndexIngestionSpec( - new DataSchema( - DATASOURCE, - TIMESTAMP_SPEC, - new DimensionsSpec.Builder().setIncludeAllDimensions(true).build(), - DEFAULT_METRICS_SPEC, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.MINUTE, - null - ), - null - ), + DataSchema.builder() + .withDataSource(DATASOURCE) + .withTimestamp(TIMESTAMP_SPEC) + .withDimensions( + DimensionsSpec.builder().setIncludeAllDimensions(true).build() + ) + .withAggregators(DEFAULT_METRICS_SPEC) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.MINUTE, + null + ) + ) + .build(), new ParallelIndexIOConfig( getInputSource(), new JsonInputFormat( @@ -303,20 +310,23 @@ public void testIngestNullColumn_storeEmptyColumnsOff_shouldNotStoreEmptyColumns null, null, new ParallelIndexIngestionSpec( - new DataSchema( - DATASOURCE, - TIMESTAMP_SPEC, - DIMENSIONS_SPEC.withDimensions( - DimensionsSpec.getDefaultSchemas(Arrays.asList("ts", "unknownDim")) - ), - DEFAULT_METRICS_SPEC, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.MINUTE, - INTERVAL_TO_INDEX - ), - null - ), + DataSchema.builder() + .withDataSource(DATASOURCE) + .withTimestamp(TIMESTAMP_SPEC) + .withDimensions( + DIMENSIONS_SPEC.withDimensions( + DimensionsSpec.getDefaultSchemas(Arrays.asList("ts", "unknownDim")) + ) + ) + .withAggregators(DEFAULT_METRICS_SPEC) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.MINUTE, + INTERVAL_TO_INDEX + ) + ) + .build(), new ParallelIndexIOConfig( getInputSource(), JSON_FORMAT, diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskKillTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskKillTest.java index cbf711469734..2ea9385a0145 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskKillTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskKillTest.java @@ -36,7 +36,6 @@ import org.apache.druid.java.util.common.Intervals; import 
org.apache.druid.java.util.common.Pair; import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.LongSumAggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec; @@ -147,20 +146,19 @@ private ParallelIndexSupervisorTask newTask( final int numTotalSubTasks = inputSource.estimateNumSplits(new NoopInputFormat(), null); // set up ingestion spec final ParallelIndexIngestionSpec ingestionSpec = new ParallelIndexIngestionSpec( - new DataSchema( - "dataSource", - DEFAULT_TIMESTAMP_SPEC, - DEFAULT_DIMENSIONS_SPEC, - new AggregatorFactory[]{ - new LongSumAggregatorFactory("val", "val") - }, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.MINUTE, - interval == null ? null : Collections.singletonList(interval) - ), - null - ), + DataSchema.builder() + .withDataSource("dataSource") + .withTimestamp(DEFAULT_TIMESTAMP_SPEC) + .withDimensions(DEFAULT_DIMENSIONS_SPEC) + .withAggregators(new LongSumAggregatorFactory("val", "val")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.MINUTE, + interval == null ? null : Collections.singletonList(interval) + ) + ) + .build(), ioConfig, TuningConfigBuilder.forParallelIndexTask().withMaxNumConcurrentSubTasks(numTotalSubTasks).build() ); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskResourceTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskResourceTest.java index c9858b80847b..4587ef6ce7e3 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskResourceTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskResourceTest.java @@ -45,7 +45,6 @@ import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.LongSumAggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec; @@ -400,20 +399,19 @@ private TestSupervisorTask newTask( ) { final ParallelIndexIngestionSpec ingestionSpec = new ParallelIndexIngestionSpec( - new DataSchema( - "dataSource", - DEFAULT_TIMESTAMP_SPEC, - DEFAULT_DIMENSIONS_SPEC, - new AggregatorFactory[]{ - new LongSumAggregatorFactory("val", "val") - }, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.MINUTE, - interval == null ? null : Collections.singletonList(interval) - ), - null - ), + DataSchema.builder() + .withDataSource("dataSource") + .withTimestamp(DEFAULT_TIMESTAMP_SPEC) + .withDimensions(DEFAULT_DIMENSIONS_SPEC) + .withAggregators(new LongSumAggregatorFactory("val", "val")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.MINUTE, + interval == null ? 
null : Collections.singletonList(interval) + ) + ) + .build(), ioConfig, TuningConfigBuilder.forParallelIndexTask().withMaxNumConcurrentSubTasks(NUM_SUB_TASKS).build() ); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskSerdeTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskSerdeTest.java index e11fb2ef001c..fe5188fec6c5 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskSerdeTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskSerdeTest.java @@ -34,7 +34,6 @@ import org.apache.druid.indexing.common.task.TuningConfigBuilder; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.LongSumAggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec; @@ -220,16 +219,19 @@ ParallelIndexIngestionSpecBuilder partitionsSpec(PartitionsSpec partitionsSpec) ParallelIndexIngestionSpec build() { - DataSchema dataSchema = new DataSchema( - "dataSource", - TIMESTAMP_SPEC, - DIMENSIONS_SPEC, - new AggregatorFactory[]{ - new LongSumAggregatorFactory("val", "val") - }, - new UniformGranularitySpec(Granularities.DAY, Granularities.MINUTE, inputIntervals), - null - ); + DataSchema dataSchema = DataSchema.builder() + .withDataSource("dataSource") + .withTimestamp(TIMESTAMP_SPEC) + .withDimensions(DIMENSIONS_SPEC) + .withAggregators(new LongSumAggregatorFactory("val", "val")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.MINUTE, + inputIntervals + ) + ) + .build(); ParallelIndexTuningConfig tuningConfig = TuningConfigBuilder .forParallelIndexTask() diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskTest.java index 57dbafa173f9..b908d274e6c3 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTaskTest.java @@ -263,14 +263,11 @@ public void testFailToConstructWhenBothAppendToExistingAndForceGuaranteedRollupA .withLogParseExceptions(false) .build(); final ParallelIndexIngestionSpec indexIngestionSpec = new ParallelIndexIngestionSpec( - new DataSchema( - "datasource", - new TimestampSpec(null, null, null), - DimensionsSpec.EMPTY, - null, - null, - null - ), + DataSchema.builder() + .withDataSource("datasource") + .withTimestamp(new TimestampSpec(null, null, null)) + .withDimensions(DimensionsSpec.EMPTY) + .build(), ioConfig, tuningConfig ); @@ -325,25 +322,24 @@ public void testFailToConstructWhenBothInputSourceAndParserAreSet() expectedException.expect(IAE.class); expectedException.expectMessage("Cannot use parser and inputSource together. 
Try using inputFormat instead of parser."); new ParallelIndexIngestionSpec( - new DataSchema( - "datasource", - mapper.convertValue( - new StringInputRowParser( - new JSONParseSpec( - new TimestampSpec(null, null, null), - DimensionsSpec.EMPTY, - null, - null, - null - ) - ), - Map.class - ), - null, - null, - null, - mapper - ), + DataSchema.builder() + .withDataSource("datasource") + .withParserMap( + mapper.convertValue( + new StringInputRowParser( + new JSONParseSpec( + new TimestampSpec(null, null, null), + DimensionsSpec.EMPTY, + null, + null, + null + ) + ), + Map.class + ) + ) + .withObjectMapper(mapper) + .build(), ioConfig, tuningConfig ); @@ -559,14 +555,11 @@ public void testCompactionTaskDoesntCleanup() throws Exception .build(); final ParallelIndexIngestionSpec indexIngestionSpec = new ParallelIndexIngestionSpec( - new DataSchema( - "datasource", - new TimestampSpec(null, null, null), - DimensionsSpec.EMPTY, - null, - null, - null - ), + DataSchema.builder() + .withDataSource("datasource") + .withTimestamp(new TimestampSpec(null, null, null)) + .withDimensions(DimensionsSpec.EMPTY) + .build(), ioConfig, tuningConfig ); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexTestingFactory.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexTestingFactory.java index 2455ce692b95..f93ea8d0a8b8 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexTestingFactory.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexTestingFactory.java @@ -31,11 +31,9 @@ import org.apache.druid.indexing.common.task.TaskResource; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec; import org.apache.druid.segment.indexing.granularity.GranularitySpec; -import org.apache.druid.segment.transform.TransformSpec; import org.apache.druid.timeline.partition.BuildingHashBasedNumberedShardSpec; import org.apache.druid.timeline.partition.HashPartitionFunction; import org.joda.time.Interval; @@ -97,16 +95,13 @@ static DataSchema createDataSchema(List granularitySpecInputIntervals) DimensionsSpec.getDefaultSchemas(ImmutableList.of(SCHEMA_DIMENSION)) ); - return new DataSchema( - DATASOURCE, - timestampSpec, - dimensionsSpec, - new AggregatorFactory[]{}, - granularitySpec, - TransformSpec.NONE, - null, - NESTED_OBJECT_MAPPER - ); + return DataSchema.builder() + .withDataSource(DATASOURCE) + .withTimestamp(timestampSpec) + .withDimensions(dimensionsSpec) + .withGranularity(granularitySpec) + .withObjectMapper(NESTED_OBJECT_MAPPER) + .build(); } static ParallelIndexIngestionSpec createIngestionSpec( diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionTaskKillTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionTaskKillTest.java index 96494b8ac794..814e3f646424 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionTaskKillTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/RangePartitionTaskKillTest.java @@ -39,7 +39,6 @@ import 
org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.Intervals; import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.LongSumAggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.GranularitySpec; @@ -329,14 +328,13 @@ protected ParallelIndexSupervisorTask newTask( null ); ingestionSpec = new ParallelIndexIngestionSpec( - new DataSchema( - DATASOURCE, - timestampSpec, - dimensionsSpec, - new AggregatorFactory[]{new LongSumAggregatorFactory("val", "val")}, - granularitySpec, - null - ), + DataSchema.builder() + .withDataSource(DATASOURCE) + .withTimestamp(timestampSpec) + .withDimensions(dimensionsSpec) + .withAggregators(new LongSumAggregatorFactory("val", "val")) + .withGranularity(granularitySpec) + .build(), ioConfig, tuningConfig ); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/SinglePhaseParallelIndexingTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/SinglePhaseParallelIndexingTest.java index ac8371acaa0d..b51224908644 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/SinglePhaseParallelIndexingTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/SinglePhaseParallelIndexingTest.java @@ -43,7 +43,6 @@ import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.granularity.Granularity; import org.apache.druid.java.util.common.parsers.JSONPathSpec; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.query.aggregation.LongSumAggregatorFactory; import org.apache.druid.segment.DataSegmentsWithSchemas; @@ -391,20 +390,19 @@ public void testRunInParallelIngestNullColumn() null, null, new ParallelIndexIngestionSpec( - new DataSchema( - "dataSource", - DEFAULT_TIMESTAMP_SPEC, - DEFAULT_DIMENSIONS_SPEC.withDimensions(dimensionSchemas), - new AggregatorFactory[]{ - new LongSumAggregatorFactory("val", "val") - }, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.MINUTE, - Collections.singletonList(Intervals.of("2017-12/P1M")) - ), - null - ), + DataSchema.builder() + .withDataSource("dataSource") + .withTimestamp(DEFAULT_TIMESTAMP_SPEC) + .withDimensions(DEFAULT_DIMENSIONS_SPEC.withDimensions(dimensionSchemas)) + .withAggregators(new LongSumAggregatorFactory("val", "val")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.MINUTE, + Collections.singletonList(Intervals.of("2017-12/P1M")) + ) + ) + .build(), new ParallelIndexIOConfig( new SettableSplittableLocalInputSource(inputDir, VALID_INPUT_SOURCE_FILTER, true), DEFAULT_INPUT_FORMAT, @@ -444,20 +442,19 @@ public void testRunInParallelIngestNullColumn_storeEmptyColumnsOff_shouldNotStor null, null, new ParallelIndexIngestionSpec( - new DataSchema( - "dataSource", - DEFAULT_TIMESTAMP_SPEC, - DEFAULT_DIMENSIONS_SPEC.withDimensions(dimensionSchemas), - new AggregatorFactory[]{ - new LongSumAggregatorFactory("val", "val") - }, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.MINUTE, - Collections.singletonList(Intervals.of("2017-12/P1M")) - ), - null - ), + DataSchema.builder() + .withDataSource("dataSource") + 
.withTimestamp(DEFAULT_TIMESTAMP_SPEC) + .withDimensions(DEFAULT_DIMENSIONS_SPEC.withDimensions(dimensionSchemas)) + .withAggregators(new LongSumAggregatorFactory("val", "val")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.MINUTE, + Collections.singletonList(Intervals.of("2017-12/P1M")) + ) + ) + .build(), new ParallelIndexIOConfig( new SettableSplittableLocalInputSource(inputDir, VALID_INPUT_SOURCE_FILTER, true), DEFAULT_INPUT_FORMAT, @@ -785,21 +782,24 @@ public void testIngestBothExplicitAndImplicitDims() throws IOException null, null, new ParallelIndexIngestionSpec( - new DataSchema( - "dataSource", - DEFAULT_TIMESTAMP_SPEC, - DimensionsSpec.builder() - .setDefaultSchemaDimensions(ImmutableList.of("ts", "explicitDim")) - .setIncludeAllDimensions(true) - .build(), - new AggregatorFactory[]{new CountAggregatorFactory("cnt")}, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.MINUTE, - Collections.singletonList(interval) - ), - null - ), + DataSchema.builder() + .withDataSource("dataSource") + .withTimestamp(DEFAULT_TIMESTAMP_SPEC) + .withDimensions( + DimensionsSpec.builder() + .setDefaultSchemaDimensions(ImmutableList.of("ts", "explicitDim")) + .setIncludeAllDimensions(true) + .build() + ) + .withAggregators(new CountAggregatorFactory("cnt")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.MINUTE, + Collections.singletonList(interval) + ) + ) + .build(), new ParallelIndexIOConfig( new SettableSplittableLocalInputSource(inputDir, "*.json", true), new JsonInputFormat( @@ -868,21 +868,24 @@ public void testIngestBothExplicitAndImplicitDimsSchemaDiscovery() throws IOExce null, null, new ParallelIndexIngestionSpec( - new DataSchema( - "dataSource", - DEFAULT_TIMESTAMP_SPEC, - DimensionsSpec.builder() - .setDefaultSchemaDimensions(ImmutableList.of("ts", "explicitDim")) - .useSchemaDiscovery(true) - .build(), - new AggregatorFactory[]{new CountAggregatorFactory("cnt")}, - new UniformGranularitySpec( - Granularities.DAY, - Granularities.MINUTE, - Collections.singletonList(interval) - ), - null - ), + DataSchema.builder() + .withDataSource("dataSource") + .withTimestamp(DEFAULT_TIMESTAMP_SPEC) + .withDimensions( + DimensionsSpec.builder() + .setDefaultSchemaDimensions(ImmutableList.of("ts", "explicitDim")) + .useSchemaDiscovery(true) + .build() + ) + .withAggregators(new CountAggregatorFactory("cnt")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + Granularities.MINUTE, + Collections.singletonList(interval) + ) + ) + .build(), new ParallelIndexIOConfig( new SettableSplittableLocalInputSource(inputDir, "*.json", true), new JsonInputFormat( @@ -948,20 +951,19 @@ private ParallelIndexSupervisorTask newTask( final ParallelIndexIngestionSpec ingestionSpec; if (useInputFormatApi) { ingestionSpec = new ParallelIndexIngestionSpec( - new DataSchema( - "dataSource", - DEFAULT_TIMESTAMP_SPEC, - DEFAULT_DIMENSIONS_SPEC, - new AggregatorFactory[]{ - new LongSumAggregatorFactory("val", "val") - }, - new UniformGranularitySpec( - segmentGranularity, - Granularities.MINUTE, - interval == null ? null : Collections.singletonList(interval) - ), - null - ), + DataSchema.builder() + .withDataSource("dataSource") + .withTimestamp(DEFAULT_TIMESTAMP_SPEC) + .withDimensions(DEFAULT_DIMENSIONS_SPEC) + .withAggregators(new LongSumAggregatorFactory("val", "val")) + .withGranularity( + new UniformGranularitySpec( + segmentGranularity, + Granularities.MINUTE, + interval == null ? 
null : Collections.singletonList(interval) + ) + ) + .build(), new ParallelIndexIOConfig( new SettableSplittableLocalInputSource(inputDir, inputSourceFilter, splittableInputSource), DEFAULT_INPUT_FORMAT, @@ -972,18 +974,19 @@ private ParallelIndexSupervisorTask newTask( ); } else { ingestionSpec = new ParallelIndexIngestionSpec( - new DataSchema( - "dataSource", - DEFAULT_TIMESTAMP_SPEC, - DEFAULT_DIMENSIONS_SPEC, - DEFAULT_METRICS_SPEC, - new UniformGranularitySpec( - segmentGranularity, - Granularities.MINUTE, - interval == null ? null : Collections.singletonList(interval) - ), - null - ), + DataSchema.builder() + .withDataSource("dataSource") + .withTimestamp(DEFAULT_TIMESTAMP_SPEC) + .withDimensions(DEFAULT_DIMENSIONS_SPEC) + .withAggregators(DEFAULT_METRICS_SPEC) + .withGranularity( + new UniformGranularitySpec( + segmentGranularity, + Granularities.MINUTE, + interval == null ? null : Collections.singletonList(interval) + ) + ) + .build(), new ParallelIndexIOConfig( new LocalInputSource(inputDir, inputSourceFilter), createInputFormatFromParseSpec(DEFAULT_PARSE_SPEC), diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/SinglePhaseSubTaskSpecTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/SinglePhaseSubTaskSpecTest.java index 4acc3d3f5912..577dce1255d0 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/SinglePhaseSubTaskSpecTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/SinglePhaseSubTaskSpecTest.java @@ -26,7 +26,6 @@ import org.apache.druid.data.input.impl.LocalInputSource; import org.apache.druid.data.input.impl.TimestampSpec; import org.apache.druid.indexing.common.TestUtils; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.server.security.Action; import org.apache.druid.server.security.Resource; @@ -48,14 +47,11 @@ public class SinglePhaseSubTaskSpecTest "groupId", "supervisorTaskId", new ParallelIndexIngestionSpec( - new DataSchema( - "dataSource", - new TimestampSpec(null, null, null), - new DimensionsSpec(null), - new AggregatorFactory[0], - null, - null - ), + DataSchema.builder() + .withDataSource("dataSource") + .withTimestamp(new TimestampSpec(null, null, null)) + .withDimensions(DimensionsSpec.builder().build()) + .build(), new ParallelIndexIOConfig( new LocalInputSource(new File("baseDir"), "filter"), new JsonInputFormat(null, null, null, null, null), diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/TombstoneHelperTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/TombstoneHelperTest.java index aea98e9e1036..fcb92543914a 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/TombstoneHelperTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/batch/parallel/TombstoneHelperTest.java @@ -67,8 +67,7 @@ public void noTombstonesWhenNoDataInInputIntervalAndNoExistingSegments() throws GranularitySpec granularitySpec = new UniformGranularitySpec(Granularities.DAY, null, false, Collections.singletonList(interval) ); - DataSchema dataSchema = - new DataSchema("test", null, null, null, granularitySpec, null); + DataSchema dataSchema = DataSchema.builder().withDataSource("test").withGranularity(granularitySpec).build(); // no segments will be 
pushed when all rows are thrown away, assume that: List pushedSegments = Collections.emptyList(); @@ -93,8 +92,7 @@ public void tombstonesCreatedWhenNoDataInInputIntervalAndExistingSegments() thro GranularitySpec granularitySpec = new UniformGranularitySpec(Granularities.DAY, null, false, Collections.singletonList(interval) ); - DataSchema dataSchema = - new DataSchema("test", null, null, null, granularitySpec, null); + DataSchema dataSchema = DataSchema.builder().withDataSource("test").withGranularity(granularitySpec).build(); // no segments will be pushed when all rows are thrown away, assume that: List pushedSegments = Collections.emptyList(); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/input/InputRowSchemasTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/input/InputRowSchemasTest.java index 44850ad0558f..35aeef9715d3 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/input/InputRowSchemasTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/input/InputRowSchemasTest.java @@ -122,17 +122,19 @@ public void testFromDataSchema() new DoubleDimensionSchema("d5") ) ); - DataSchema schema = new DataSchema( - "dataSourceName", - new TimestampSpec(null, null, null), - dimensionsSpec, - new AggregatorFactory[]{ - new CountAggregatorFactory("count"), - new LongSumAggregatorFactory("met", "met") - }, - new UniformGranularitySpec(Granularities.MINUTE, Granularities.NONE, null), - null - ); + DataSchema schema = + DataSchema.builder() + .withDataSource("dataSourceName") + .withTimestamp(new TimestampSpec(null, null, null)) + .withDimensions(dimensionsSpec) + .withAggregators( + new CountAggregatorFactory("count"), + new LongSumAggregatorFactory("met", "met") + ) + .withGranularity( + new UniformGranularitySpec(Granularities.MINUTE, Granularities.NONE, null) + ) + .build(); InputRowSchema inputRowSchema = InputRowSchemas.fromDataSchema(schema); Assert.assertEquals(timestampSpec, inputRowSchema.getTimestampSpec()); @@ -154,14 +156,12 @@ public void testFromDataSchemaWithNoAggregator() new DoubleDimensionSchema("d5") ) ); - DataSchema schema = new DataSchema( - "dataSourceName", - new TimestampSpec(null, null, null), - dimensionsSpec, - new AggregatorFactory[]{}, - new UniformGranularitySpec(Granularities.MINUTE, Granularities.NONE, null), - null - ); + DataSchema schema = DataSchema.builder() + .withDataSource("dataSourceName") + .withTimestamp(new TimestampSpec(null, null, null)) + .withDimensions(dimensionsSpec) + .withGranularity(new UniformGranularitySpec(Granularities.MINUTE, Granularities.NONE, null)) + .build(); InputRowSchema inputRowSchema = InputRowSchemas.fromDataSchema(schema); Assert.assertEquals(timestampSpec, inputRowSchema.getTimestampSpec()); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/TaskLifecycleTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/TaskLifecycleTest.java index ddbed6be7c74..522134c4556f 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/TaskLifecycleTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/TaskLifecycleTest.java @@ -109,7 +109,6 @@ import org.apache.druid.query.ForwardingQueryProcessingPool; import org.apache.druid.query.QueryRunnerFactoryConglomerate; import org.apache.druid.query.SegmentDescriptor; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.DoubleSumAggregatorFactory; import 
org.apache.druid.segment.IndexIO; import org.apache.druid.segment.IndexMergerV9Factory; @@ -672,18 +671,19 @@ public void testIndexTask() null, null, new IndexIngestionSpec( - new DataSchema( - "foo", - new TimestampSpec(null, null, null), - DimensionsSpec.EMPTY, - new AggregatorFactory[]{new DoubleSumAggregatorFactory("met", "met")}, - new UniformGranularitySpec( - Granularities.DAY, - null, - ImmutableList.of(Intervals.of("2010-01-01/P2D")) - ), - null - ), + DataSchema.builder() + .withDataSource("foo") + .withTimestamp(new TimestampSpec(null, null, null)) + .withDimensions(DimensionsSpec.EMPTY) + .withAggregators(new DoubleSumAggregatorFactory("met", "met")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + null, + ImmutableList.of(Intervals.of("2010-01-01/P2D")) + ) + ) + .build(), new IndexIOConfig(new MockInputSource(), new NoopInputFormat(), false, false), TuningConfigBuilder.forIndexTask() .withMaxRowsPerSegment(10000) @@ -735,18 +735,18 @@ public void testIndexTaskFailure() null, null, new IndexIngestionSpec( - new DataSchema( - "foo", - null, - new AggregatorFactory[]{new DoubleSumAggregatorFactory("met", "met")}, - new UniformGranularitySpec( - Granularities.DAY, - null, - ImmutableList.of(Intervals.of("2010-01-01/P1D")) - ), - null, - mapper - ), + DataSchema.builder() + .withDataSource("foo") + .withAggregators(new DoubleSumAggregatorFactory("met", "met")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + null, + ImmutableList.of(Intervals.of("2010-01-01/P1D")) + ) + ) + .withObjectMapper(mapper) + .build(), new IndexIOConfig(new MockExceptionInputSource(), new NoopInputFormat(), false, false), TuningConfigBuilder.forIndexTask() .withMaxRowsPerSegment(10000) @@ -1165,18 +1165,19 @@ public void testResumeTasks() throws Exception null, null, new IndexIngestionSpec( - new DataSchema( - "foo", - new TimestampSpec(null, null, null), - DimensionsSpec.EMPTY, - new AggregatorFactory[]{new DoubleSumAggregatorFactory("met", "met")}, - new UniformGranularitySpec( - Granularities.DAY, - null, - ImmutableList.of(Intervals.of("2010-01-01/P2D")) - ), - null - ), + DataSchema.builder() + .withDataSource("foo") + .withTimestamp(new TimestampSpec(null, null, null)) + .withDimensions(DimensionsSpec.EMPTY) + .withAggregators(new DoubleSumAggregatorFactory("met", "met")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + null, + ImmutableList.of(Intervals.of("2010-01-01/P2D")) + ) + ) + .build(), new IndexIOConfig(new MockInputSource(), new NoopInputFormat(), false, false), TuningConfigBuilder.forIndexTask() .withMaxRowsPerSegment(10000) @@ -1253,18 +1254,19 @@ public void testUnifiedAppenderatorsManagerCleanup() throws Exception null, null, new IndexIngestionSpec( - new DataSchema( - "foo", - new TimestampSpec(null, null, null), - DimensionsSpec.EMPTY, - new AggregatorFactory[]{new DoubleSumAggregatorFactory("met", "met")}, - new UniformGranularitySpec( - Granularities.DAY, - null, - ImmutableList.of(Intervals.of("2010-01-01/P2D")) - ), - null - ), + DataSchema.builder() + .withDataSource("foo") + .withTimestamp(new TimestampSpec(null, null, null)) + .withDimensions(DimensionsSpec.EMPTY) + .withAggregators(new DoubleSumAggregatorFactory("met", "met")) + .withGranularity( + new UniformGranularitySpec( + Granularities.DAY, + null, + ImmutableList.of(Intervals.of("2010-01-01/P2D")) + ) + ) + .build(), new IndexIOConfig(new MockInputSource(), new NoopInputFormat(), false, false), TuningConfigBuilder.forIndexTask() 
.withMaxRowsPerSegment(10000) diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/TaskQueueTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/TaskQueueTest.java index c7b7b13ef7ea..8f1393f2c675 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/TaskQueueTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/TaskQueueTest.java @@ -549,14 +549,15 @@ public void testGetActiveTaskRedactsPassword() throws JsonProcessingException new NoopTaskContextEnricher() ); - final DataSchema dataSchema = new DataSchema( - "DS", - new TimestampSpec(null, null, null), - new DimensionsSpec(null), - null, - new UniformGranularitySpec(Granularities.YEAR, Granularities.DAY, null), - null - ); + final DataSchema dataSchema = + DataSchema.builder() + .withDataSource("DS") + .withTimestamp(new TimestampSpec(null, null, null)) + .withDimensions(DimensionsSpec.builder().build()) + .withGranularity( + new UniformGranularitySpec(Granularities.YEAR, Granularities.DAY, null) + ) + .build(); final ParallelIndexIOConfig ioConfig = new ParallelIndexIOConfig( new HttpInputSource(Collections.singletonList(URI.create("http://host.org")), "user", diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/CsvInputSourceSamplerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/CsvInputSourceSamplerTest.java index e788545507cd..1730d4b638fb 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/CsvInputSourceSamplerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/CsvInputSourceSamplerTest.java @@ -47,14 +47,11 @@ public void testCSVColumnAllNull() { final TimestampSpec timestampSpec = new TimestampSpec(null, null, DateTimes.of("1970")); final DimensionsSpec dimensionsSpec = new DimensionsSpec(null); - final DataSchema dataSchema = new DataSchema( - "sampler", - timestampSpec, - dimensionsSpec, - null, - null, - null - ); + final DataSchema dataSchema = DataSchema.builder() + .withDataSource("sampler") + .withTimestamp(timestampSpec) + .withDimensions(dimensionsSpec) + .build(); final List strCsvRows = ImmutableList.of( "FirstName,LastName,Number,Gender", diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/InputSourceSamplerDiscoveryTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/InputSourceSamplerDiscoveryTest.java index c486c15f0f23..0220aacd8922 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/InputSourceSamplerDiscoveryTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/InputSourceSamplerDiscoveryTest.java @@ -66,14 +66,11 @@ public void testDiscoveredTypesNonStrictBooleans() final SamplerResponse response = inputSourceSampler.sample( inputSource, new JsonInputFormat(null, null, null, null, null), - new DataSchema( - "test", - new TimestampSpec("t", null, null), - DimensionsSpec.builder().useSchemaDiscovery(true).build(), - null, - null, - null - ), + DataSchema.builder() + .withDataSource("test") + .withTimestamp(new TimestampSpec("t", null, null)) + .withDimensions(DimensionsSpec.builder().useSchemaDiscovery(true).build()) + .build(), null ); @@ -131,14 +128,11 @@ public void testDiscoveredTypesStrictBooleans() final SamplerResponse response = inputSourceSampler.sample( inputSource, new JsonInputFormat(null, null, null, null, null), - new 
DataSchema( - "test", - new TimestampSpec("t", null, null), - DimensionsSpec.builder().useSchemaDiscovery(true).build(), - null, - null, - null - ), + DataSchema.builder() + .withDataSource("test") + .withTimestamp(new TimestampSpec("t", null, null)) + .withDimensions(DimensionsSpec.builder().useSchemaDiscovery(true).build()) + .build(), null ); @@ -189,14 +183,12 @@ public void testDiscoveredTypesStrictBooleans() public void testTypesClassicDiscovery() { final InputSource inputSource = new InlineInputSource(Strings.join(STR_JSON_ROWS, '\n')); - final DataSchema dataSchema = new DataSchema( - "test", - new TimestampSpec("t", null, null), - DimensionsSpec.builder().build(), - null, - null, - null - ); + final DataSchema dataSchema = + DataSchema.builder() + .withDataSource("test") + .withTimestamp(new TimestampSpec("t", null, null)) + .withDimensions(DimensionsSpec.builder().build()) + .build(); final SamplerResponse response = inputSourceSampler.sample( inputSource, new JsonInputFormat(null, null, null, null, null), @@ -248,23 +240,20 @@ public void testTypesClassicDiscovery() public void testTypesNoDiscoveryExplicitSchema() { final InputSource inputSource = new InlineInputSource(Strings.join(STR_JSON_ROWS, '\n')); - final DataSchema dataSchema = new DataSchema( - "test", - new TimestampSpec("t", null, null), - DimensionsSpec.builder().setDimensions( - ImmutableList.of(new StringDimensionSchema("string"), - new LongDimensionSchema("long"), - new DoubleDimensionSchema("double"), - new StringDimensionSchema("bool"), - new AutoTypeColumnSchema("variant", null), - new AutoTypeColumnSchema("array", null), - new AutoTypeColumnSchema("nested", null) - ) - ).build(), - null, - null, - null - ); + final DataSchema dataSchema = + DataSchema.builder() + .withDataSource("test") + .withTimestamp(new TimestampSpec("t", null, null)) + .withDimensions( + new StringDimensionSchema("string"), + new LongDimensionSchema("long"), + new DoubleDimensionSchema("double"), + new StringDimensionSchema("bool"), + new AutoTypeColumnSchema("variant", null), + new AutoTypeColumnSchema("array", null), + new AutoTypeColumnSchema("nested", null) + ) + .build(); final SamplerResponse response = inputSourceSampler.sample( inputSource, new JsonInputFormat(null, null, null, null, null), diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/InputSourceSamplerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/InputSourceSamplerTest.java index 087b12cef40e..80d88e0be17c 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/InputSourceSamplerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/InputSourceSamplerTest.java @@ -1497,24 +1497,24 @@ private DataSchema createDataSchema( ) throws IOException { if (useInputFormatApi) { - return new DataSchema( - "sampler", - timestampSpec, - dimensionsSpec, - aggregators, - granularitySpec, - transformSpec - ); + return DataSchema.builder() + .withDataSource("sampler") + .withTimestamp(timestampSpec) + .withDimensions(dimensionsSpec) + .withAggregators(aggregators) + .withGranularity(granularitySpec) + .withTransform(transformSpec) + .build(); } else { final Map parserMap = getParserMap(createInputRowParser(timestampSpec, dimensionsSpec)); - return new DataSchema( - "sampler", - parserMap, - aggregators, - granularitySpec, - transformSpec, - OBJECT_MAPPER - ); + return DataSchema.builder() + .withDataSource("sampler") + .withParserMap(parserMap) + 
.withAggregators(aggregators) + .withGranularity(granularitySpec) + .withTransform(transformSpec) + .withObjectMapper(OBJECT_MAPPER) + .build(); } } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/SeekableStreamIndexTaskRunnerAuthTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/SeekableStreamIndexTaskRunnerAuthTest.java index 0f280059e0a5..7f44d44a00db 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/SeekableStreamIndexTaskRunnerAuthTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/SeekableStreamIndexTaskRunnerAuthTest.java @@ -33,10 +33,8 @@ import org.apache.druid.indexing.seekablestream.common.StreamPartition; import org.apache.druid.java.util.common.DateTimes; import org.apache.druid.java.util.common.granularity.AllGranularity; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec; -import org.apache.druid.segment.transform.TransformSpec; import org.apache.druid.server.security.Access; import org.apache.druid.server.security.Action; import org.apache.druid.server.security.AuthConfig; @@ -107,16 +105,13 @@ public Authorizer getAuthorizer(String name) } }; - DataSchema dataSchema = new DataSchema( - "datasource", - new TimestampSpec(null, null, null), - new DimensionsSpec(Collections.emptyList()), - new AggregatorFactory[]{}, - new ArbitraryGranularitySpec(new AllGranularity(), Collections.emptyList()), - TransformSpec.NONE, - null, - null - ); + DataSchema dataSchema = + DataSchema.builder() + .withDataSource("datasource") + .withTimestamp(new TimestampSpec(null, null, null)) + .withDimensions(new DimensionsSpec(Collections.emptyList())) + .withGranularity(new ArbitraryGranularitySpec(new AllGranularity(), Collections.emptyList())) + .build(); SeekableStreamIndexTaskTuningConfig tuningConfig = mock(SeekableStreamIndexTaskTuningConfig.class); SeekableStreamIndexTaskIOConfig ioConfig = new TestSeekableStreamIndexTaskIOConfig(); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/SeekableStreamIndexTaskTestBase.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/SeekableStreamIndexTaskTestBase.java index 258ebff7b504..7346fe9fb6e1 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/SeekableStreamIndexTaskTestBase.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/SeekableStreamIndexTaskTestBase.java @@ -95,7 +95,6 @@ import org.apache.druid.query.QueryRunnerFactoryConglomerate; import org.apache.druid.query.Result; import org.apache.druid.query.SegmentDescriptor; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.query.aggregation.DoubleSumAggregatorFactory; import org.apache.druid.query.aggregation.LongSumAggregatorFactory; @@ -164,25 +163,23 @@ public abstract class SeekableStreamIndexTaskTestBase extends EasyMockSupport protected static final ObjectMapper OBJECT_MAPPER; protected static final DataSchema OLD_DATA_SCHEMA; - protected static final DataSchema NEW_DATA_SCHEMA = new DataSchema( - "test_ds", - new TimestampSpec("timestamp", "iso", null), - new DimensionsSpec( - Arrays.asList( - new StringDimensionSchema("dim1"), - new StringDimensionSchema("dim1t"), - new 
StringDimensionSchema("dim2"), - new LongDimensionSchema("dimLong"), - new FloatDimensionSchema("dimFloat") - ) - ), - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("met1sum", "met1"), - new CountAggregatorFactory("rows") - }, - new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null), - null - ); + protected static final DataSchema NEW_DATA_SCHEMA = + DataSchema.builder() + .withDataSource("test_ds") + .withTimestamp(new TimestampSpec("timestamp", "iso", null)) + .withDimensions( + new StringDimensionSchema("dim1"), + new StringDimensionSchema("dim1t"), + new StringDimensionSchema("dim2"), + new LongDimensionSchema("dimLong"), + new FloatDimensionSchema("dimFloat") + ) + .withAggregators( + new DoubleSumAggregatorFactory("met1sum", "met1"), + new CountAggregatorFactory("rows") + ) + .withGranularity(new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null)) + .build(); protected static final InputFormat INPUT_FORMAT = new JsonInputFormat( new JSONPathSpec(true, ImmutableList.of()), ImmutableMap.of(), @@ -211,37 +208,38 @@ public abstract class SeekableStreamIndexTaskTestBase extends EasyMockSupport static { OBJECT_MAPPER = new TestUtils().getTestObjectMapper(); OBJECT_MAPPER.registerSubtypes(new NamedType(JSONParseSpec.class, "json")); - OLD_DATA_SCHEMA = new DataSchema( - "test_ds", - OBJECT_MAPPER.convertValue( - new StringInputRowParser( - new JSONParseSpec( - new TimestampSpec("timestamp", "iso", null), - new DimensionsSpec( - Arrays.asList( - new StringDimensionSchema("dim1"), - new StringDimensionSchema("dim1t"), - new StringDimensionSchema("dim2"), - new LongDimensionSchema("dimLong"), - new FloatDimensionSchema("dimFloat") - ) - ), - new JSONPathSpec(true, ImmutableList.of()), - ImmutableMap.of(), - false - ), - StandardCharsets.UTF_8.name() - ), - Map.class - ), - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("met1sum", "met1"), - new CountAggregatorFactory("rows") - }, - new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null), - null, - OBJECT_MAPPER - ); + OLD_DATA_SCHEMA = DataSchema.builder() + .withDataSource("test_ds") + .withParserMap( + OBJECT_MAPPER.convertValue( + new StringInputRowParser( + new JSONParseSpec( + new TimestampSpec("timestamp", "iso", null), + new DimensionsSpec( + Arrays.asList( + new StringDimensionSchema("dim1"), + new StringDimensionSchema("dim1t"), + new StringDimensionSchema("dim2"), + new LongDimensionSchema("dimLong"), + new FloatDimensionSchema("dimFloat") + ) + ), + new JSONPathSpec(true, ImmutableList.of()), + ImmutableMap.of(), + false + ), + StandardCharsets.UTF_8.name() + ), + Map.class + ) + ) + .withAggregators( + new DoubleSumAggregatorFactory("met1sum", "met1"), + new CountAggregatorFactory("rows") + ) + .withGranularity(new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null)) + .withObjectMapper(OBJECT_MAPPER) + .build(); } public SeekableStreamIndexTaskTestBase( diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/SeekableStreamSamplerSpecTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/SeekableStreamSamplerSpecTest.java index 87cd196c268f..6510e2cfbdc5 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/SeekableStreamSamplerSpecTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/SeekableStreamSamplerSpecTest.java @@ -48,7 +48,6 @@ import org.apache.druid.java.util.common.StringUtils; import 
org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.parsers.JSONPathSpec; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.query.aggregation.DoubleSumAggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; @@ -106,36 +105,37 @@ private static List> gene @Test(timeout = 10_000L) public void testSampleWithInputRowParser() throws Exception { - final DataSchema dataSchema = new DataSchema( - "test_ds", - OBJECT_MAPPER.convertValue( - new StringInputRowParser( - new JSONParseSpec( - new TimestampSpec("timestamp", "iso", null), - new DimensionsSpec( - Arrays.asList( - new StringDimensionSchema("dim1"), - new StringDimensionSchema("dim1t"), - new StringDimensionSchema("dim2"), - new LongDimensionSchema("dimLong"), - new FloatDimensionSchema("dimFloat") - ) - ), - new JSONPathSpec(true, ImmutableList.of()), - ImmutableMap.of(), - false - ) - ), - Map.class - ), - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("met1sum", "met1"), - new CountAggregatorFactory("rows") - }, - new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null), - null, - OBJECT_MAPPER - ); + DataSchema dataSchema = DataSchema.builder() + .withDataSource("test_ds") + .withParserMap( + OBJECT_MAPPER.convertValue( + new StringInputRowParser( + new JSONParseSpec( + new TimestampSpec("timestamp", "iso", null), + new DimensionsSpec( + Arrays.asList( + new StringDimensionSchema("dim1"), + new StringDimensionSchema("dim1t"), + new StringDimensionSchema("dim2"), + new LongDimensionSchema("dimLong"), + new FloatDimensionSchema("dimFloat") + ) + ), + new JSONPathSpec(true, ImmutableList.of()), + ImmutableMap.of(), + false + ) + ), + Map.class + ) + ) + .withAggregators( + new DoubleSumAggregatorFactory("met1sum", "met1"), + new CountAggregatorFactory("rows") + ) + .withGranularity(new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null)) + .withObjectMapper(OBJECT_MAPPER) + .build(); final SeekableStreamSupervisorIOConfig supervisorIOConfig = new TestableSeekableStreamSupervisorIOConfig( STREAM, diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/SeekableStreamSupervisorSpecTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/SeekableStreamSupervisorSpecTest.java index 4deee6ce9b8d..baff5fc765b2 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/SeekableStreamSupervisorSpecTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/SeekableStreamSupervisorSpecTest.java @@ -25,7 +25,6 @@ import com.google.common.collect.ImmutableMap; import org.apache.druid.data.input.impl.ByteEntity; import org.apache.druid.data.input.impl.DimensionSchema; -import org.apache.druid.data.input.impl.DimensionsSpec; import org.apache.druid.data.input.impl.JsonInputFormat; import org.apache.druid.data.input.impl.StringDimensionSchema; import org.apache.druid.data.input.impl.TimestampSpec; @@ -59,7 +58,6 @@ import org.apache.druid.java.util.metrics.DruidMonitorSchedulerConfig; import org.apache.druid.java.util.metrics.StubServiceEmitter; import org.apache.druid.query.DruidMetrics; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.segment.TestHelper; import org.apache.druid.segment.incremental.RowIngestionMetersFactory; @@ -1261,18 
+1259,19 @@ private static DataSchema getDataSchema() dimensions.add(StringDimensionSchema.create("dim1")); dimensions.add(StringDimensionSchema.create("dim2")); - return new DataSchema( - DATASOURCE, - new TimestampSpec("timestamp", "iso", null), - new DimensionsSpec(dimensions), - new AggregatorFactory[]{new CountAggregatorFactory("rows")}, - new UniformGranularitySpec( - Granularities.HOUR, - Granularities.NONE, - ImmutableList.of() - ), - null - ); + return DataSchema.builder() + .withDataSource(DATASOURCE) + .withTimestamp(new TimestampSpec("timestamp", "iso", null)) + .withDimensions(dimensions) + .withAggregators(new CountAggregatorFactory("rows")) + .withGranularity( + new UniformGranularitySpec( + Granularities.HOUR, + Granularities.NONE, + ImmutableList.of() + ) + ) + .build(); } private SeekableStreamSupervisorIOConfig getIOConfig(int taskCount, boolean scaleOut) diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorStateTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorStateTest.java index 1f42ba7ce996..40bbe84b623a 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorStateTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorStateTest.java @@ -31,7 +31,6 @@ import com.google.common.util.concurrent.ListenableFuture; import org.apache.druid.data.input.impl.ByteEntity; import org.apache.druid.data.input.impl.DimensionSchema; -import org.apache.druid.data.input.impl.DimensionsSpec; import org.apache.druid.data.input.impl.JsonInputFormat; import org.apache.druid.data.input.impl.StringDimensionSchema; import org.apache.druid.data.input.impl.TimestampSpec; @@ -83,7 +82,6 @@ import org.apache.druid.java.util.metrics.StubServiceEmitter; import org.apache.druid.metadata.PendingSegmentRecord; import org.apache.druid.query.DruidMetrics; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.segment.TestHelper; import org.apache.druid.segment.incremental.RowIngestionMetersFactory; @@ -2562,18 +2560,19 @@ private static DataSchema getDataSchema() dimensions.add(StringDimensionSchema.create("dim1")); dimensions.add(StringDimensionSchema.create("dim2")); - return new DataSchema( - DATASOURCE, - new TimestampSpec("timestamp", "iso", null), - new DimensionsSpec(dimensions), - new AggregatorFactory[]{new CountAggregatorFactory("rows")}, - new UniformGranularitySpec( - Granularities.HOUR, - Granularities.NONE, - ImmutableList.of() - ), - null - ); + return DataSchema.builder() + .withDataSource(DATASOURCE) + .withTimestamp(new TimestampSpec("timestamp", "iso", null)) + .withDimensions(dimensions) + .withAggregators(new CountAggregatorFactory("rows")) + .withGranularity( + new UniformGranularitySpec( + Granularities.HOUR, + Granularities.NONE, + ImmutableList.of() + ) + ) + .build(); } private static SeekableStreamSupervisorIOConfig getIOConfig() diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/worker/TaskAnnouncementTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/worker/TaskAnnouncementTest.java index 88249509ef97..61396fc7ae61 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/worker/TaskAnnouncementTest.java +++ 
b/indexing-service/src/test/java/org/apache/druid/indexing/worker/TaskAnnouncementTest.java @@ -28,7 +28,6 @@ import org.apache.druid.indexing.common.task.Task; import org.apache.druid.indexing.common.task.TaskResource; import org.apache.druid.jackson.DefaultObjectMapper; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.indexing.DataSchema; import org.junit.Assert; import org.junit.Test; @@ -51,7 +50,7 @@ public void testBackwardsCompatibleSerde() throws Exception "theid", new TaskResource("rofl", 2), new IndexTask.IndexIngestionSpec( - new DataSchema("foo", null, new AggregatorFactory[0], null, null, new DefaultObjectMapper()), + DataSchema.builder().withDataSource("foo").withObjectMapper(new DefaultObjectMapper()).build(), ioConfig, null ), diff --git a/server/src/main/java/org/apache/druid/segment/indexing/DataSchema.java b/server/src/main/java/org/apache/druid/segment/indexing/DataSchema.java index bda884018812..22ee4ec41025 100644 --- a/server/src/main/java/org/apache/druid/segment/indexing/DataSchema.java +++ b/server/src/main/java/org/apache/druid/segment/indexing/DataSchema.java @@ -25,7 +25,6 @@ import com.fasterxml.jackson.annotation.JsonInclude.Include; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.base.Strings; import com.google.common.collect.Multiset; @@ -64,6 +63,17 @@ public class DataSchema { private static final Logger log = new Logger(DataSchema.class); + + public static Builder builder() + { + return new Builder(); + } + + public static Builder builder(DataSchema schema) + { + return new Builder(schema); + } + private final String dataSource; private final AggregatorFactory[] aggregators; private final GranularitySpec granularitySpec; @@ -126,33 +136,6 @@ public DataSchema( } } - @VisibleForTesting - public DataSchema( - String dataSource, - TimestampSpec timestampSpec, - DimensionsSpec dimensionsSpec, - AggregatorFactory[] aggregators, - GranularitySpec granularitySpec, - TransformSpec transformSpec - ) - { - this(dataSource, timestampSpec, dimensionsSpec, aggregators, granularitySpec, transformSpec, null, null); - } - - // old constructor for backward compatibility - @Deprecated - public DataSchema( - String dataSource, - Map parserMap, - AggregatorFactory[] aggregators, - GranularitySpec granularitySpec, - TransformSpec transformSpec, - ObjectMapper objectMapper - ) - { - this(dataSource, null, null, aggregators, granularitySpec, transformSpec, parserMap, objectMapper); - } - private static void validateDatasourceName(String dataSource) { IdUtils.validateId("dataSource", dataSource); @@ -403,44 +386,17 @@ public InputRowParser getParser() public DataSchema withGranularitySpec(GranularitySpec granularitySpec) { - return new DataSchema( - dataSource, - timestampSpec, - dimensionsSpec, - aggregators, - granularitySpec, - transformSpec, - parserMap, - objectMapper - ); + return builder(this).withGranularity(granularitySpec).build(); } public DataSchema withTransformSpec(TransformSpec transformSpec) { - return new DataSchema( - dataSource, - timestampSpec, - dimensionsSpec, - aggregators, - granularitySpec, - transformSpec, - parserMap, - objectMapper - ); + return builder(this).withTransform(transformSpec).build(); } public DataSchema withDimensionsSpec(DimensionsSpec dimensionsSpec) { - return new DataSchema( - dataSource, - timestampSpec, - 
dimensionsSpec, - aggregators, - granularitySpec, - transformSpec, - parserMap, - objectMapper - ); + return builder(this).withDimensions(dimensionsSpec).build(); } @Override @@ -457,4 +413,110 @@ public String toString() ", inputRowParser=" + inputRowParser + '}'; } + + public static class Builder + { + private String dataSource; + private AggregatorFactory[] aggregators; + private GranularitySpec granularitySpec; + private TransformSpec transformSpec; + private Map parserMap; + private ObjectMapper objectMapper; + + // The below fields can be initialized lazily from parser for backward compatibility. + private TimestampSpec timestampSpec; + private DimensionsSpec dimensionsSpec; + + public Builder() + { + + } + + public Builder(DataSchema schema) + { + this.dataSource = schema.dataSource; + this.aggregators = schema.aggregators; + this.granularitySpec = schema.granularitySpec; + this.transformSpec = schema.transformSpec; + this.parserMap = schema.parserMap; + this.objectMapper = schema.objectMapper; + this.timestampSpec = schema.timestampSpec; + this.dimensionsSpec = schema.dimensionsSpec; + } + + public Builder withDataSource(String dataSource) + { + this.dataSource = dataSource; + return this; + } + + public Builder withTimestamp(TimestampSpec timestampSpec) + { + this.timestampSpec = timestampSpec; + return this; + } + + public Builder withDimensions(DimensionsSpec dimensionsSpec) + { + this.dimensionsSpec = dimensionsSpec; + return this; + } + + public Builder withDimensions(List dimensions) + { + this.dimensionsSpec = DimensionsSpec.builder().setDimensions(dimensions).build(); + return this; + } + + public Builder withDimensions(DimensionSchema... dimensions) + { + return withDimensions(Arrays.asList(dimensions)); + } + + public Builder withAggregators(AggregatorFactory... 
aggregators) + { + this.aggregators = aggregators; + return this; + } + + public Builder withGranularity(GranularitySpec granularitySpec) + { + this.granularitySpec = granularitySpec; + return this; + } + + public Builder withTransform(TransformSpec transformSpec) + { + this.transformSpec = transformSpec; + return this; + } + + @Deprecated + public Builder withObjectMapper(ObjectMapper objectMapper) + { + this.objectMapper = objectMapper; + return this; + } + + @Deprecated + public Builder withParserMap(Map parserMap) + { + this.parserMap = parserMap; + return this; + } + + public DataSchema build() + { + return new DataSchema( + dataSource, + timestampSpec, + dimensionsSpec, + aggregators, + granularitySpec, + transformSpec, + parserMap, + objectMapper + ); + } + } } diff --git a/server/src/test/java/org/apache/druid/segment/indexing/DataSchemaTest.java b/server/src/test/java/org/apache/druid/segment/indexing/DataSchemaTest.java index 90297dd4af9d..94bd77e810df 100644 --- a/server/src/test/java/org/apache/druid/segment/indexing/DataSchemaTest.java +++ b/server/src/test/java/org/apache/druid/segment/indexing/DataSchemaTest.java @@ -71,6 +71,11 @@ public class DataSchemaTest extends InitializedNullHandlingTest { + private static ArbitraryGranularitySpec ARBITRARY_GRANULARITY = new ArbitraryGranularitySpec( + Granularities.DAY, + ImmutableList.of(Intervals.of("2014/2015")) + ); + @Rule public ExpectedException expectedException = ExpectedException.none(); @@ -92,17 +97,16 @@ public void testDefaultExclusions() ), JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT ); - DataSchema schema = new DataSchema( - IdUtilsTest.VALID_ID_CHARS, - parser, - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("metric1", "col1"), - new DoubleSumAggregatorFactory("metric2", "col2"), - }, - new ArbitraryGranularitySpec(Granularities.DAY, ImmutableList.of(Intervals.of("2014/2015"))), - null, - jsonMapper - ); + DataSchema schema = DataSchema.builder() + .withDataSource(IdUtilsTest.VALID_ID_CHARS) + .withParserMap(parser) + .withAggregators( + new DoubleSumAggregatorFactory("metric1", "col1"), + new DoubleSumAggregatorFactory("metric2", "col2") + ) + .withGranularity(ARBITRARY_GRANULARITY) + .withObjectMapper(jsonMapper) + .build(); Assert.assertEquals( ImmutableSet.of("__time", "time", "col1", "col2", "metric1", "metric2"), @@ -130,18 +134,16 @@ public void testExplicitInclude() null ), JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT ); - - DataSchema schema = new DataSchema( - IdUtilsTest.VALID_ID_CHARS, - parser, - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("metric1", "col1"), - new DoubleSumAggregatorFactory("metric2", "col2"), - }, - new ArbitraryGranularitySpec(Granularities.DAY, ImmutableList.of(Intervals.of("2014/2015"))), - null, - jsonMapper - ); + DataSchema schema = DataSchema.builder() + .withDataSource(IdUtilsTest.VALID_ID_CHARS) + .withParserMap(parser) + .withAggregators( + new DoubleSumAggregatorFactory("metric1", "col1"), + new DoubleSumAggregatorFactory("metric2", "col2") + ) + .withGranularity(ARBITRARY_GRANULARITY) + .withObjectMapper(jsonMapper) + .build(); Assert.assertEquals( ImmutableSet.of("__time", "dimC", "col1", "metric1", "metric2"), @@ -167,22 +169,28 @@ public void testTransformSpec() ), JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT ); - DataSchema schema = new DataSchema( - IdUtilsTest.VALID_ID_CHARS, - parserMap, - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("metric1", "col1"), - new DoubleSumAggregatorFactory("metric2", "col2"), - }, - new 
ArbitraryGranularitySpec(Granularities.DAY, ImmutableList.of(Intervals.of("2014/2015"))), - new TransformSpec( - new SelectorDimFilter("dimA", "foo", null), - ImmutableList.of( - new ExpressionTransform("expr", "concat(dimA,dimA)", TestExprMacroTable.INSTANCE) - ) - ), - jsonMapper - ); + DataSchema schema = DataSchema.builder() + .withDataSource(IdUtilsTest.VALID_ID_CHARS) + .withParserMap(parserMap) + .withAggregators( + new DoubleSumAggregatorFactory("metric1", "col1"), + new DoubleSumAggregatorFactory("metric2", "col2") + ) + .withGranularity(ARBITRARY_GRANULARITY) + .withTransform( + new TransformSpec( + new SelectorDimFilter("dimA", "foo", null), + ImmutableList.of( + new ExpressionTransform( + "expr", + "concat(dimA,dimA)", + TestExprMacroTable.INSTANCE + ) + ) + ) + ) + .withObjectMapper(jsonMapper) + .build(); // Test hack that produces a StringInputRowParser. final StringInputRowParser parser = (StringInputRowParser) schema.getParser(); @@ -233,17 +241,16 @@ public void testOverlapMetricNameAndDim() ), JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT ); - DataSchema schema = new DataSchema( - IdUtilsTest.VALID_ID_CHARS, - parser, - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("metric1", "col1"), - new DoubleSumAggregatorFactory("metric2", "col2"), - }, - new ArbitraryGranularitySpec(Granularities.DAY, ImmutableList.of(Intervals.of("2014/2015"))), - null, - jsonMapper - ); + DataSchema schema = DataSchema.builder() + .withDataSource(IdUtilsTest.VALID_ID_CHARS) + .withParserMap(parser) + .withAggregators( + new DoubleSumAggregatorFactory("metric1", "col1"), + new DoubleSumAggregatorFactory("metric2", "col2") + ) + .withGranularity(ARBITRARY_GRANULARITY) + .withObjectMapper(jsonMapper) + .build(); expectedException.expect(DruidException.class); expectedException.expectMessage( @@ -256,25 +263,24 @@ public void testOverlapMetricNameAndDim() @Test public void testOverlapTimeAndDimPositionZero() { - DataSchema schema = new DataSchema( - IdUtilsTest.VALID_ID_CHARS, - new TimestampSpec("time", "auto", null), - DimensionsSpec.builder() - .setDimensions( - ImmutableList.of( - new LongDimensionSchema("__time"), - new StringDimensionSchema("dimA"), - new StringDimensionSchema("dimB") - ) - ) - .setDimensionExclusions(ImmutableList.of("dimC")) - .build(), - null, - new ArbitraryGranularitySpec(Granularities.DAY, ImmutableList.of(Intervals.of("2014/2015"))), - null, - null, - jsonMapper - ); + DataSchema schema = DataSchema.builder() + .withDataSource(IdUtilsTest.VALID_ID_CHARS) + .withTimestamp(new TimestampSpec("time", "auto", null)) + .withDimensions( + DimensionsSpec.builder() + .setDimensions( + ImmutableList.of( + new LongDimensionSchema("__time"), + new StringDimensionSchema("dimA"), + new StringDimensionSchema("dimB") + ) + ) + .setDimensionExclusions(ImmutableList.of("dimC")) + .build() + ) + .withGranularity(ARBITRARY_GRANULARITY) + .withObjectMapper(jsonMapper) + .build(); Assert.assertEquals( ImmutableList.of("__time", "dimA", "dimB"), @@ -290,25 +296,24 @@ public void testOverlapTimeAndDimPositionZeroWrongType() expectedException.expect(DruidException.class); expectedException.expectMessage("Encountered dimension[__time] with incorrect type[STRING]. 
Type must be 'long'."); - DataSchema schema = new DataSchema( - IdUtilsTest.VALID_ID_CHARS, - new TimestampSpec("time", "auto", null), - DimensionsSpec.builder() - .setDimensions( - ImmutableList.of( - new StringDimensionSchema("__time"), - new StringDimensionSchema("dimA"), - new StringDimensionSchema("dimB") - ) - ) - .setDimensionExclusions(ImmutableList.of("dimC")) - .build(), - null, - new ArbitraryGranularitySpec(Granularities.DAY, ImmutableList.of(Intervals.of("2014/2015"))), - null, - null, - jsonMapper - ); + DataSchema.builder() + .withDataSource(IdUtilsTest.VALID_ID_CHARS) + .withTimestamp(new TimestampSpec("time", "auto", null)) + .withDimensions( + DimensionsSpec.builder() + .setDimensions( + ImmutableList.of( + new StringDimensionSchema("__time"), + new StringDimensionSchema("dimA"), + new StringDimensionSchema("dimB") + ) + ) + .setDimensionExclusions(ImmutableList.of("dimC")) + .build() + ) + .withGranularity(ARBITRARY_GRANULARITY) + .withObjectMapper(jsonMapper) + .build(); } @Test @@ -321,50 +326,49 @@ public void testOverlapTimeAndDimPositionOne() + DimensionsSpec.WARNING_NON_TIME_SORT_ORDER ); - DataSchema schema = new DataSchema( - IdUtilsTest.VALID_ID_CHARS, - new TimestampSpec("time", "auto", null), - DimensionsSpec.builder() - .setDimensions( - ImmutableList.of( - new StringDimensionSchema("dimA"), - new LongDimensionSchema("__time"), - new StringDimensionSchema("dimB") - ) - ) - .setDimensionExclusions(ImmutableList.of("dimC")) - .build(), - null, - new ArbitraryGranularitySpec(Granularities.DAY, ImmutableList.of(Intervals.of("2014/2015"))), - null, - null, - jsonMapper - ); + DataSchema.builder() + .withDataSource(IdUtilsTest.VALID_ID_CHARS) + .withTimestamp(new TimestampSpec("time", "auto", null)) + .withDimensions( + DimensionsSpec.builder() + .setDimensions( + ImmutableList.of( + new StringDimensionSchema("dimA"), + new LongDimensionSchema("__time"), + new StringDimensionSchema("dimB") + ) + ) + .setDimensionExclusions(ImmutableList.of("dimC")) + .build() + ) + .withGranularity(ARBITRARY_GRANULARITY) + .withObjectMapper(jsonMapper) + .build(); } @Test public void testOverlapTimeAndDimPositionOne_withExplicitSortOrder() { - DataSchema schema = new DataSchema( - IdUtilsTest.VALID_ID_CHARS, - new TimestampSpec("time", "auto", null), - DimensionsSpec.builder() - .setDimensions( - ImmutableList.of( - new StringDimensionSchema("dimA"), - new LongDimensionSchema("__time"), - new StringDimensionSchema("dimB") - ) - ) - .setDimensionExclusions(ImmutableList.of("dimC")) - .setForceSegmentSortByTime(false) - .build(), - null, - new ArbitraryGranularitySpec(Granularities.DAY, ImmutableList.of(Intervals.of("2014/2015"))), - null, - null, - jsonMapper - ); + DataSchema schema = + DataSchema.builder() + .withDataSource(IdUtilsTest.VALID_ID_CHARS) + .withTimestamp(new TimestampSpec("time", "auto", null)) + .withDimensions( + DimensionsSpec.builder() + .setDimensions( + ImmutableList.of( + new StringDimensionSchema("dimA"), + new LongDimensionSchema("__time"), + new StringDimensionSchema("dimB") + ) + ) + .setDimensionExclusions(ImmutableList.of("dimC")) + .setForceSegmentSortByTime(false) + .build() + ) + .withGranularity(ARBITRARY_GRANULARITY) + .withObjectMapper(jsonMapper) + .build(); Assert.assertEquals( ImmutableList.of("dimA", "__time", "dimB"), @@ -402,14 +406,13 @@ public void testOverlapTimeAndDimLegacy() ), JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT ); - DataSchema schema = new DataSchema( - IdUtilsTest.VALID_ID_CHARS, - parser, - null, - new 
ArbitraryGranularitySpec(Granularities.DAY, ImmutableList.of(Intervals.of("2014/2015"))), - null, - jsonMapper - ); + DataSchema schema = DataSchema.builder() + .withDataSource(IdUtilsTest.VALID_ID_CHARS) + .withParserMap(parser) + .withGranularity(ARBITRARY_GRANULARITY) + .withObjectMapper(jsonMapper) + .build(); + expectedException.expect(DruidException.class); expectedException.expectMessage("Encountered dimension[__time] with incorrect type[STRING]. Type must be 'long'."); @@ -442,20 +445,19 @@ public void testDuplicateAggregators() + "[metric3] seen in metricsSpec list (2 occurrences)" ); - DataSchema schema = new DataSchema( - IdUtilsTest.VALID_ID_CHARS, - parser, - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("metric1", "col1"), - new DoubleSumAggregatorFactory("metric2", "col2"), - new DoubleSumAggregatorFactory("metric1", "col3"), - new DoubleSumAggregatorFactory("metric3", "col4"), - new DoubleSumAggregatorFactory("metric3", "col5"), - }, - new ArbitraryGranularitySpec(Granularities.DAY, ImmutableList.of(Intervals.of("2014/2015"))), - null, - jsonMapper - ); + DataSchema schema = DataSchema.builder() + .withDataSource(IdUtilsTest.VALID_ID_CHARS) + .withParserMap(parser) + .withAggregators( + new DoubleSumAggregatorFactory("metric1", "col1"), + new DoubleSumAggregatorFactory("metric2", "col2"), + new DoubleSumAggregatorFactory("metric1", "col3"), + new DoubleSumAggregatorFactory("metric3", "col4"), + new DoubleSumAggregatorFactory("metric3", "col5") + ) + .withGranularity(ARBITRARY_GRANULARITY) + .withObjectMapper(jsonMapper) + .build(); } @Test @@ -510,24 +512,20 @@ public void testEmptyDatasource() ), JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT ); - DruidExceptionMatcher - .invalidInput() - .expectMessageIs("Invalid value for field [dataSource]: must not be null") - .assertThrowsAndMatches( - () -> new DataSchema( - "", - parser, - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("metric1", "col1"), - new DoubleSumAggregatorFactory("metric2", "col2"), - }, - new ArbitraryGranularitySpec( - Granularities.DAY, - ImmutableList.of(Intervals.of("2014/2015")) - ), - null, - jsonMapper - )); + DruidExceptionMatcher.ThrowingSupplier thrower = + () -> DataSchema.builder() + .withDataSource("") + .withParserMap(parser) + .withAggregators( + new DoubleSumAggregatorFactory("metric1", "col1"), + new DoubleSumAggregatorFactory("metric2", "col2") + ) + .withGranularity(ARBITRARY_GRANULARITY) + .withObjectMapper(jsonMapper) + .build(); + DruidExceptionMatcher.invalidInput() + .expectMessageIs("Invalid value for field [dataSource]: must not be null") + .assertThrowsAndMatches(thrower); } @@ -547,14 +545,11 @@ public void testInvalidWhitespaceDatasource() dataSource ); DruidExceptionMatcher.invalidInput().expectMessageIs(msg).assertThrowsAndMatches( - () -> new DataSchema( - dataSource, - Collections.emptyMap(), - null, - null, - null, - jsonMapper - ) + () -> DataSchema.builder() + .withDataSource(dataSource) + .withParserMap(Collections.emptyMap()) + .withObjectMapper(jsonMapper) + .build() ); } } @@ -686,17 +681,16 @@ public void testSerdeWithUpdatedDataSchemaAddedField() throws IOException ), JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT ); - DataSchema originalSchema = new DataSchema( - IdUtilsTest.VALID_ID_CHARS, - parser, - new AggregatorFactory[]{ - new DoubleSumAggregatorFactory("metric1", "col1"), - new DoubleSumAggregatorFactory("metric2", "col2"), - }, - new ArbitraryGranularitySpec(Granularities.DAY, ImmutableList.of(Intervals.of("2014/2015"))), - null, - 
jsonMapper - ); + DataSchema originalSchema = DataSchema.builder() + .withDataSource(IdUtilsTest.VALID_ID_CHARS) + .withParserMap(parser) + .withAggregators( + new DoubleSumAggregatorFactory("metric1", "col1"), + new DoubleSumAggregatorFactory("metric2", "col2") + ) + .withGranularity(ARBITRARY_GRANULARITY) + .withObjectMapper(jsonMapper) + .build(); String serialized = jsonMapper.writeValueAsString(originalSchema); TestModifiedDataSchema deserialized = jsonMapper.readValue(serialized, TestModifiedDataSchema.class); @@ -734,7 +728,7 @@ public void testSerdeWithUpdatedDataSchemaRemovedField() throws IOException new DoubleSumAggregatorFactory("metric1", "col1"), new DoubleSumAggregatorFactory("metric2", "col2"), }, - new ArbitraryGranularitySpec(Granularities.DAY, ImmutableList.of(Intervals.of("2014/2015"))), + ARBITRARY_GRANULARITY, null, parser, jsonMapper, @@ -765,10 +759,16 @@ public void testWithDimensionSpec() Map parserMap = Mockito.mock(Map.class); Mockito.when(newDimSpec.withDimensionExclusions(ArgumentMatchers.any(Set.class))).thenReturn(newDimSpec); - DataSchema oldSchema = new DataSchema("dataSource", tsSpec, oldDimSpec, - new AggregatorFactory[]{aggFactory}, gSpec, - transSpec, parserMap, jsonMapper - ); + DataSchema oldSchema = DataSchema.builder() + .withDataSource("dataSource") + .withTimestamp(tsSpec) + .withDimensions(oldDimSpec) + .withAggregators(aggFactory) + .withGranularity(gSpec) + .withTransform(transSpec) + .withParserMap(parserMap) + .withObjectMapper(jsonMapper) + .build(); DataSchema newSchema = oldSchema.withDimensionsSpec(newDimSpec); Assert.assertSame(oldSchema.getDataSource(), newSchema.getDataSource()); Assert.assertSame(oldSchema.getTimestampSpec(), newSchema.getTimestampSpec()); @@ -795,7 +795,7 @@ public void testCombinedDataSchemaSetsMultiValuedColumnsInfo() .setDimensionExclusions(ImmutableList.of("dimC")) .build(), null, - new ArbitraryGranularitySpec(Granularities.DAY, ImmutableList.of(Intervals.of("2014/2015"))), + ARBITRARY_GRANULARITY, null, multiValuedDimensions ); diff --git a/server/src/test/java/org/apache/druid/segment/realtime/appenderator/BatchAppenderatorTester.java b/server/src/test/java/org/apache/druid/segment/realtime/appenderator/BatchAppenderatorTester.java index 22034aa33aa9..5f43236e075b 100644 --- a/server/src/test/java/org/apache/druid/segment/realtime/appenderator/BatchAppenderatorTester.java +++ b/server/src/test/java/org/apache/druid/segment/realtime/appenderator/BatchAppenderatorTester.java @@ -30,7 +30,6 @@ import org.apache.druid.java.util.emitter.EmittingLogger; import org.apache.druid.java.util.emitter.core.NoopEmitter; import org.apache.druid.java.util.emitter.service.ServiceEmitter; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.query.aggregation.LongSumAggregatorFactory; import org.apache.druid.segment.IndexIO; @@ -151,19 +150,18 @@ public BatchAppenderatorTester( Map.class ); - schema = new DataSchema( - DATASOURCE, - null, - null, - new AggregatorFactory[]{ - new CountAggregatorFactory("count"), - new LongSumAggregatorFactory("met", "met") - }, - new UniformGranularitySpec(Granularities.MINUTE, Granularities.NONE, null), - null, - parserMap, - objectMapper - ); + schema = DataSchema.builder() + .withDataSource(DATASOURCE) + .withAggregators( + new CountAggregatorFactory("count"), + new LongSumAggregatorFactory("met", "met") + ) + .withGranularity( + new UniformGranularitySpec(Granularities.MINUTE, 
Granularities.NONE, null) + ) + .withParserMap(parserMap) + .withObjectMapper(objectMapper) + .build(); tuningConfig = new TestAppenderatorConfig( TuningConfig.DEFAULT_APPENDABLE_INDEX, diff --git a/server/src/test/java/org/apache/druid/segment/realtime/appenderator/StreamAppenderatorTester.java b/server/src/test/java/org/apache/druid/segment/realtime/appenderator/StreamAppenderatorTester.java index cd990e76f892..29d758aaed02 100644 --- a/server/src/test/java/org/apache/druid/segment/realtime/appenderator/StreamAppenderatorTester.java +++ b/server/src/test/java/org/apache/druid/segment/realtime/appenderator/StreamAppenderatorTester.java @@ -44,7 +44,6 @@ import org.apache.druid.query.DefaultQueryRunnerFactoryConglomerate; import org.apache.druid.query.ForwardingQueryProcessingPool; import org.apache.druid.query.QueryRunnerTestHelper; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.query.aggregation.LongSumAggregatorFactory; import org.apache.druid.query.expression.TestExprMacroTable; @@ -135,17 +134,16 @@ public StreamAppenderatorTester( ), Map.class ); - schema = new DataSchema( - DATASOURCE, - parserMap, - new AggregatorFactory[]{ - new CountAggregatorFactory("count"), - new LongSumAggregatorFactory("met", "met") - }, - new UniformGranularitySpec(Granularities.MINUTE, Granularities.NONE, null), - null, - objectMapper - ); + schema = DataSchema.builder() + .withDataSource(DATASOURCE) + .withParserMap(parserMap) + .withAggregators( + new CountAggregatorFactory("count"), + new LongSumAggregatorFactory("met", "met") + ) + .withGranularity(new UniformGranularitySpec(Granularities.MINUTE, Granularities.NONE, null)) + .withObjectMapper(objectMapper) + .build(); tuningConfig = new TestAppenderatorConfig( TuningConfig.DEFAULT_APPENDABLE_INDEX, maxRowsInMemory, diff --git a/server/src/test/java/org/apache/druid/segment/realtime/appenderator/UnifiedIndexerAppenderatorsManagerTest.java b/server/src/test/java/org/apache/druid/segment/realtime/appenderator/UnifiedIndexerAppenderatorsManagerTest.java index 23ac93db0096..21f627baa085 100644 --- a/server/src/test/java/org/apache/druid/segment/realtime/appenderator/UnifiedIndexerAppenderatorsManagerTest.java +++ b/server/src/test/java/org/apache/druid/segment/realtime/appenderator/UnifiedIndexerAppenderatorsManagerTest.java @@ -98,14 +98,11 @@ public void setup() EasyMock.replay(appenderatorConfig); appenderator = manager.createBatchAppenderatorForTask( "taskId", - new DataSchema( - "myDataSource", - new TimestampSpec("__time", "millis", null), - null, - null, - new UniformGranularitySpec(Granularities.HOUR, Granularities.HOUR, false, Collections.emptyList()), - null - ), + DataSchema.builder() + .withDataSource("myDataSource") + .withTimestamp(new TimestampSpec("__time", "millis", null)) + .withGranularity(new UniformGranularitySpec(Granularities.HOUR, Granularities.HOUR, false, Collections.emptyList())) + .build(), appenderatorConfig, new SegmentGenerationMetrics(), new NoopDataSegmentPusher(), diff --git a/server/src/test/java/org/apache/druid/segment/realtime/sink/SinkTest.java b/server/src/test/java/org/apache/druid/segment/realtime/sink/SinkTest.java index 9d85ec6c8e6b..750ea06c6534 100644 --- a/server/src/test/java/org/apache/druid/segment/realtime/sink/SinkTest.java +++ b/server/src/test/java/org/apache/druid/segment/realtime/sink/SinkTest.java @@ -34,7 +34,6 @@ import org.apache.druid.java.util.common.Intervals; import 
org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.guava.Sequences; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.segment.RowAdapters; import org.apache.druid.segment.RowBasedSegment; @@ -76,14 +75,14 @@ public class SinkTest extends InitializedNullHandlingTest @Test public void testSwap() throws Exception { - final DataSchema schema = new DataSchema( - "test", - new TimestampSpec(null, null, null), - DimensionsSpec.EMPTY, - new AggregatorFactory[]{new CountAggregatorFactory("rows")}, - new UniformGranularitySpec(Granularities.HOUR, Granularities.MINUTE, null), - null - ); + final DataSchema schema = + DataSchema.builder() + .withDataSource("test") + .withTimestamp(new TimestampSpec(null, null, null)) + .withDimensions(DimensionsSpec.EMPTY) + .withAggregators(new CountAggregatorFactory("rows")) + .withGranularity(new UniformGranularitySpec(Granularities.HOUR, Granularities.MINUTE, null)) + .build(); final Interval interval = Intervals.of("2013-01-01/2013-01-02"); final String version = DateTimes.nowUtc().toString(); @@ -256,18 +255,17 @@ public void testAcquireSegmentReferences_twoWithOneSwappedToNull() @Test public void testGetSinkSignature() throws IndexSizeExceededException { - final DataSchema schema = new DataSchema( - "test", - new TimestampSpec(null, null, null), - new DimensionsSpec( - Arrays.asList( - new StringDimensionSchema("dim1"), - new LongDimensionSchema("dimLong") - )), - new AggregatorFactory[]{new CountAggregatorFactory("rows")}, - new UniformGranularitySpec(Granularities.HOUR, Granularities.MINUTE, null), - null - ); + final DataSchema schema = + DataSchema.builder() + .withDataSource("test") + .withTimestamp(new TimestampSpec(null, null, null)) + .withDimensions( + new StringDimensionSchema("dim1"), + new LongDimensionSchema("dimLong") + ) + .withAggregators(new CountAggregatorFactory("rows")) + .withGranularity(new UniformGranularitySpec(Granularities.HOUR, Granularities.MINUTE, null)) + .build(); final Interval interval = Intervals.of("2013-01-01/2013-01-02"); final String version = DateTimes.nowUtc().toString(); diff --git a/services/src/test/java/org/apache/druid/cli/validate/DruidJsonValidatorTest.java b/services/src/test/java/org/apache/druid/cli/validate/DruidJsonValidatorTest.java index b617b7e6b877..c0634ed403c0 100644 --- a/services/src/test/java/org/apache/druid/cli/validate/DruidJsonValidatorTest.java +++ b/services/src/test/java/org/apache/druid/cli/validate/DruidJsonValidatorTest.java @@ -30,7 +30,6 @@ import org.apache.druid.indexing.common.task.TaskResource; import org.apache.druid.jackson.DefaultObjectMapper; import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.IndexSpec; import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec; @@ -133,14 +132,11 @@ public void testTaskValidator() throws Exception null, new TaskResource("rofl", 2), new IndexTask.IndexIngestionSpec( - new DataSchema( - "foo", - null, - new AggregatorFactory[0], - new UniformGranularitySpec(Granularities.HOUR, Granularities.NONE, null), - null, - jsonMapper - ), + DataSchema.builder() + .withDataSource("foo") + .withGranularity(new UniformGranularitySpec(Granularities.HOUR, Granularities.NONE, null)) + .withObjectMapper(jsonMapper) + .build(), new 
IndexTask.IndexIOConfig( new LocalInputSource(new File("lol"), "rofl"), new JsonInputFormat(null, null, null, null, null),

From 579b23de487f1feb00e2051caa9254f0655fd236 Mon Sep 17 00:00:00 2001
From: Clint Wylie
Date: Sun, 15 Sep 2024 16:45:51 -0700
Subject: [PATCH 23/47] abstract `IncrementalIndex` cursor stuff to prepare for
 using different "views" of the data based on the cursor build spec (#17064)

* abstract `IncrementalIndex` cursor stuff to prepare to allow for the
  possibility of using different "views" of the data based on the cursor
  build spec

changes:
* introduce `IncrementalIndexRowSelector` interface to capture how
  `IncrementalIndexCursor` and `IncrementalIndexColumnSelectorFactory` read data
* `IncrementalIndex` implements `IncrementalIndexRowSelector`
* move `FactsHolder` interface to separate file
* other minor refactorings
---
 .../segment/incremental/FactsHolder.java      |  82 ++++
 .../segment/incremental/IncrementalIndex.java | 453 +++++++++---------
 ...IncrementalIndexColumnSelectorFactory.java |  22 +-
 .../IncrementalIndexCursorFactory.java        |   6 +-
 .../IncrementalIndexCursorHolder.java         |  68 ++-
 .../incremental/IncrementalIndexRow.java      |   2 +
 .../IncrementalIndexRowSelector.java          | 104 ++++
 .../incremental/OnheapIncrementalIndex.java   |  86 ++--
 .../test/TestFrameProcessorUtils.java         |  17 +-
 .../segment/AutoTypeColumnIndexerTest.java    |  44 +-
 .../NestedDataColumnIndexerV4Test.java        |  21 +-
 .../IncrementalIndexIngestionTest.java        |  32 +-
 .../IncrementalIndexMultiValueSpecTest.java   |  17 +-
 .../virtual/ExpressionSelectorsTest.java      |  15 +-
 14 files changed, 572 insertions(+), 397 deletions(-)
 create mode 100644 processing/src/main/java/org/apache/druid/segment/incremental/FactsHolder.java
 create mode 100644 processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexRowSelector.java

diff --git a/processing/src/main/java/org/apache/druid/segment/incremental/FactsHolder.java b/processing/src/main/java/org/apache/druid/segment/incremental/FactsHolder.java
new file mode 100644
index 000000000000..f7eede101509
--- /dev/null
+++ b/processing/src/main/java/org/apache/druid/segment/incremental/FactsHolder.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ + +package org.apache.druid.segment.incremental; + +import java.util.Comparator; +import java.util.Iterator; + +/** + * {@link IncrementalIndexRow} storage interface, a mutable data structure for building up a set of rows to eventually + * persist into an immutable segment. + * + * @see IncrementalIndex for the data processor which constructs {@link IncrementalIndexRow} to store here + */ +public interface FactsHolder +{ + /** + * @return the previous rowIndex associated with the specified key, or + * {@link IncrementalIndexRow#EMPTY_ROW_INDEX} if there was no mapping for the key. + */ + int getPriorIndex(IncrementalIndexRow key); + + /** + * Get minimum {@link IncrementalIndexRow#getTimestamp()} present in the facts holder + */ + long getMinTimeMillis(); + + /** + * Get maximum {@link IncrementalIndexRow#getTimestamp()} present in the facts holder + */ + long getMaxTimeMillis(); + + /** + * Get all {@link IncrementalIndexRow} stored in the facts holder. Depending on the implementation, these rows may + * or may not be ordered in the same order they will be persisted in. Use {@link #persistIterable()} if this is + * required. + */ + Iterator iterator(boolean descending); + + /** + * Get all {@link IncrementalIndexRow} with {@link IncrementalIndexRow#getTimestamp()} between the start and end + * timestamps specified + */ + Iterable timeRangeIterable(boolean descending, long timeStart, long timeEnd); + + /** + * Get all {@link IncrementalIndexRow} 'keys', which are the distinct groups if this is an aggregating facts holder, + * or simply every row present if not + */ + Iterable keySet(); + + /** + * Get all {@link IncrementalIndexRow} to persist, ordered with {@link Comparator} + */ + Iterable persistIterable(); + + /** + * @return the previous rowIndex associated with the specified key, or + * {@link IncrementalIndexRow#EMPTY_ROW_INDEX} if there was no mapping for the key. + */ + int putIfAbsent(IncrementalIndexRow key, int rowIndex); + + /** + * Clear all rows present in the facts holder + */ + void clear(); +} diff --git a/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndex.java b/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndex.java index fc2a02c47b7b..8adc47f65336 100644 --- a/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndex.java +++ b/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndex.java @@ -105,7 +105,7 @@ * {@link IncrementalIndexCursorFactory} are thread-safe, and may be called concurrently with each other, and with * the "add" methods. This concurrency model supports real-time queries of the data in the index. */ -public abstract class IncrementalIndex implements Iterable, Closeable, ColumnInspector +public abstract class IncrementalIndex implements IncrementalIndexRowSelector, ColumnInspector, Iterable, Closeable { /** * Column selector used at ingestion time for inputs to aggregators.
@@ -255,8 +255,9 @@ public ColumnCapabilities getColumnCapabilities(String columnName) private final boolean useSchemaDiscovery; - private final InputRowHolder inputRowHolder = new InputRowHolder(); + protected final InputRowHolder inputRowHolder = new InputRowHolder(); + @Nullable private volatile DateTime maxIngestedEventTime; /** @@ -366,8 +367,6 @@ protected IncrementalIndex( ); } - public abstract FactsHolder getFacts(); - public abstract boolean canAppendRow(); public abstract String getOutOfRowsReason(); @@ -384,100 +383,11 @@ protected abstract AddToFactsResult addToFacts( boolean skipMaxRowsInMemoryCheck ) throws IndexSizeExceededException; - public abstract int getLastRowIndex(); - - protected abstract float getMetricFloatValue(int rowOffset, int aggOffset); - - protected abstract long getMetricLongValue(int rowOffset, int aggOffset); - - protected abstract Object getMetricObjectValue(int rowOffset, int aggOffset); - - protected abstract double getMetricDoubleValue(int rowOffset, int aggOffset); - - protected abstract boolean isNull(int rowOffset, int aggOffset); - - static class IncrementalIndexRowResult - { - private final IncrementalIndexRow incrementalIndexRow; - private final List parseExceptionMessages; - - IncrementalIndexRowResult(IncrementalIndexRow incrementalIndexRow, List parseExceptionMessages) - { - this.incrementalIndexRow = incrementalIndexRow; - this.parseExceptionMessages = parseExceptionMessages; - } - - IncrementalIndexRow getIncrementalIndexRow() - { - return incrementalIndexRow; - } - - List getParseExceptionMessages() - { - return parseExceptionMessages; - } - } - - static class AddToFactsResult - { - private final int rowCount; - private final long bytesInMemory; - private final List parseExceptionMessages; - - public AddToFactsResult( - int rowCount, - long bytesInMemory, - List parseExceptionMessages - ) - { - this.rowCount = rowCount; - this.bytesInMemory = bytesInMemory; - this.parseExceptionMessages = parseExceptionMessages; - } - - int getRowCount() - { - return rowCount; - } - - public long getBytesInMemory() - { - return bytesInMemory; - } - - public List getParseExceptionMessages() - { - return parseExceptionMessages; - } - } - - public static class InputRowHolder - { - @Nullable - private InputRow row; - private long rowId = -1; - - public void set(final InputRow row) - { - this.row = row; - this.rowId++; - } - public void unset() - { - this.row = null; - } - - public InputRow getRow() - { - return Preconditions.checkNotNull(row, "row"); - } - - public long getRowId() - { - return rowId; - } - } + public abstract Iterable iterableWithPostAggregations( + @Nullable List postAggs, + boolean descending + ); public boolean isRollup() { @@ -746,23 +656,6 @@ public static ParseException getCombinedParseException( ); } - private static String getSimplifiedEventStringFromRow(InputRow inputRow) - { - if (inputRow instanceof MapBasedInputRow) { - return ((MapBasedInputRow) inputRow).getEvent().toString(); - } - - if (inputRow instanceof ListBasedInputRow) { - return ((ListBasedInputRow) inputRow).asMap().toString(); - } - - if (inputRow instanceof TransformedInputRow) { - InputRow innerRow = ((TransformedInputRow) inputRow).getBaseRow(); - return getSimplifiedEventStringFromRow(innerRow); - } - - return inputRow.toString(); - } private synchronized void updateMaxIngestedTime(DateTime eventTime) { @@ -771,6 +664,7 @@ private synchronized void updateMaxIngestedTime(DateTime eventTime) } } + @Override public boolean isEmpty() { return numEntries.get() == 0; @@ 
-861,6 +755,7 @@ public List getDimensions() /** * Returns the descriptor for a particular dimension. */ + @Override @Nullable public DimensionDesc getDimension(String dimension) { @@ -869,22 +764,39 @@ public DimensionDesc getDimension(String dimension) } } - public ColumnValueSelector makeMetricColumnValueSelector(String metric, IncrementalIndexRowHolder currEntry) + @Override + @Nullable + public MetricDesc getMetric(String metric) { - MetricDesc metricDesc = metricDescs.get(metric); + return metricDescs.get(metric); + } + + @Override + public List getOrdering() + { + return metadata.getOrdering(); + } + + public static ColumnValueSelector makeMetricColumnValueSelector( + IncrementalIndexRowSelector rowSelector, + IncrementalIndexRowHolder currEntry, + String metric + ) + { + final MetricDesc metricDesc = rowSelector.getMetric(metric); if (metricDesc == null) { return NilColumnValueSelector.instance(); } int metricIndex = metricDesc.getIndex(); switch (metricDesc.getCapabilities().getType()) { case COMPLEX: - return new ObjectMetricColumnSelector(metricDesc, currEntry, metricIndex); + return new ObjectMetricColumnSelector(rowSelector, currEntry, metricDesc); case LONG: - return new LongMetricColumnSelector(currEntry, metricIndex); + return new LongMetricColumnSelector(rowSelector, currEntry, metricIndex); case FLOAT: - return new FloatMetricColumnSelector(currEntry, metricIndex); + return new FloatMetricColumnSelector(rowSelector, currEntry, metricIndex); case DOUBLE: - return new DoubleMetricColumnSelector(currEntry, metricIndex); + return new DoubleMetricColumnSelector(rowSelector, currEntry, metricIndex); case STRING: throw new IllegalStateException("String is not a metric column type"); default: @@ -910,13 +822,6 @@ public DateTime getMaxTime() return isEmpty() ? null : DateTimes.utc(getMaxTimeMillis()); } - @Nullable - public Integer getDimensionIndex(String dimension) - { - DimensionDesc dimSpec = getDimension(dimension); - return dimSpec == null ? null : dimSpec.getIndex(); - } - /** * Returns names of time and dimension columns, in persist sort order. Includes {@link ColumnHolder#TIME_COLUMN_NAME}. 
*/ @@ -1003,6 +908,49 @@ public Metadata getMetadata() return metadata; } + @Override + public Iterator iterator() + { + return iterableWithPostAggregations(null, false).iterator(); + } + + public DateTime getMaxIngestedEventTime() + { + return maxIngestedEventTime; + } + + protected ColumnSelectorFactory makeColumnSelectorFactory( + @Nullable final AggregatorFactory agg, + final InputRowHolder in + ) + { + return makeColumnSelectorFactory(virtualColumns, in, agg); + } + + protected final Comparator dimsComparator() + { + return new IncrementalIndexRowComparator(timePosition, dimensionDescsList); + } + + + private static String getSimplifiedEventStringFromRow(InputRow inputRow) + { + if (inputRow instanceof MapBasedInputRow) { + return ((MapBasedInputRow) inputRow).getEvent().toString(); + } + + if (inputRow instanceof ListBasedInputRow) { + return ((ListBasedInputRow) inputRow).asMap().toString(); + } + + if (inputRow instanceof TransformedInputRow) { + InputRow innerRow = ((TransformedInputRow) inputRow).getBaseRow(); + return getSimplifiedEventStringFromRow(innerRow); + } + + return inputRow.toString(); + } + private static AggregatorFactory[] getCombiningAggregators(AggregatorFactory[] aggregators) { AggregatorFactory[] combiningAggregators = new AggregatorFactory[aggregators.length]; @@ -1012,30 +960,24 @@ private static AggregatorFactory[] getCombiningAggregators(AggregatorFactory[] a return combiningAggregators; } - @Override - public Iterator iterator() - { - return iterableWithPostAggregations(null, false).iterator(); - } - - public abstract Iterable iterableWithPostAggregations( - @Nullable List postAggs, - boolean descending - ); - - public DateTime getMaxIngestedEventTime() + private static boolean allNull(Object[] dims, int startPosition) { - return maxIngestedEventTime; + for (int i = startPosition; i < dims.length; i++) { + if (dims[i] != null) { + return false; + } + } + return true; } public static final class DimensionDesc { private final int index; private final String name; - private final DimensionHandler handler; - private final DimensionIndexer indexer; + private final DimensionHandler handler; + private final DimensionIndexer indexer; - public DimensionDesc(int index, String name, DimensionHandler handler, boolean useMaxMemoryEstimates) + public DimensionDesc(int index, String name, DimensionHandler handler, boolean useMaxMemoryEstimates) { this.index = index; this.name = name; @@ -1058,12 +1000,12 @@ public ColumnCapabilities getCapabilities() return indexer.getColumnCapabilities(); } - public DimensionHandler getHandler() + public DimensionHandler getHandler() { return handler; } - public DimensionIndexer getIndexer() + public DimensionIndexer getIndexer() { return indexer; } @@ -1124,19 +1066,90 @@ public ColumnCapabilities getCapabilities() } } - protected ColumnSelectorFactory makeColumnSelectorFactory( - @Nullable final AggregatorFactory agg, - final InputRowHolder in - ) + public static class AddToFactsResult { - return makeColumnSelectorFactory(virtualColumns, in, agg); + private final int rowCount; + private final long bytesInMemory; + private final List parseExceptionMessages; + + public AddToFactsResult( + int rowCount, + long bytesInMemory, + List parseExceptionMessages + ) + { + this.rowCount = rowCount; + this.bytesInMemory = bytesInMemory; + this.parseExceptionMessages = parseExceptionMessages; + } + + int getRowCount() + { + return rowCount; + } + + public long getBytesInMemory() + { + return bytesInMemory; + } + + public List 
getParseExceptionMessages() + { + return parseExceptionMessages; + } } - protected final Comparator dimsComparator() + public static class InputRowHolder { - return new IncrementalIndexRowComparator(timePosition, dimensionDescsList); + @Nullable + private InputRow row; + private long rowId = -1; + + public void set(final InputRow row) + { + this.row = row; + this.rowId++; + } + + public void unset() + { + this.row = null; + } + + public InputRow getRow() + { + return Preconditions.checkNotNull(row, "row"); + } + + public long getRowId() + { + return rowId; + } } + static class IncrementalIndexRowResult + { + private final IncrementalIndexRow incrementalIndexRow; + private final List parseExceptionMessages; + + IncrementalIndexRowResult(IncrementalIndexRow incrementalIndexRow, List parseExceptionMessages) + { + this.incrementalIndexRow = incrementalIndexRow; + this.parseExceptionMessages = parseExceptionMessages; + } + + IncrementalIndexRow getIncrementalIndexRow() + { + return incrementalIndexRow; + } + + List getParseExceptionMessages() + { + return parseExceptionMessages; + } + } + + @VisibleForTesting static final class IncrementalIndexRowComparator implements Comparator { @@ -1207,57 +1220,19 @@ public int compare(IncrementalIndexRow lhs, IncrementalIndexRow rhs) } } - private static boolean allNull(Object[] dims, int startPosition) - { - for (int i = startPosition; i < dims.length; i++) { - if (dims[i] != null) { - return false; - } - } - return true; - } - - public interface FactsHolder - { - /** - * @return the previous rowIndex associated with the specified key, or - * {@link IncrementalIndexRow#EMPTY_ROW_INDEX} if there was no mapping for the key. - */ - int getPriorIndex(IncrementalIndexRow key); - - long getMinTimeMillis(); - - long getMaxTimeMillis(); - - Iterator iterator(boolean descending); - - Iterable timeRangeIterable(boolean descending, long timeStart, long timeEnd); - - Iterable keySet(); - - /** - * Get all {@link IncrementalIndexRow} to persist, ordered with {@link Comparator} - * - * @return - */ - Iterable persistIterable(); - - /** - * @return the previous rowIndex associated with the specified key, or - * {@link IncrementalIndexRow#EMPTY_ROW_INDEX} if there was no mapping for the key. 
- */ - int putIfAbsent(IncrementalIndexRow key, int rowIndex); - - void clear(); - } - - private final class LongMetricColumnSelector implements LongColumnSelector + private static final class LongMetricColumnSelector implements LongColumnSelector { + private final IncrementalIndexRowSelector rowSelector; private final IncrementalIndexRowHolder currEntry; private final int metricIndex; - public LongMetricColumnSelector(IncrementalIndexRowHolder currEntry, int metricIndex) + public LongMetricColumnSelector( + IncrementalIndexRowSelector rowSelector, + IncrementalIndexRowHolder currEntry, + int metricIndex + ) { + this.rowSelector = rowSelector; this.currEntry = currEntry; this.metricIndex = metricIndex; } @@ -1265,119 +1240,131 @@ public LongMetricColumnSelector(IncrementalIndexRowHolder currEntry, int metricI @Override public long getLong() { - assert NullHandling.replaceWithDefault() || !isNull(); - return getMetricLongValue(currEntry.get().getRowIndex(), metricIndex); + return rowSelector.getMetricLongValue(currEntry.get().getRowIndex(), metricIndex); } @Override - public void inspectRuntimeShape(RuntimeShapeInspector inspector) + public boolean isNull() { - inspector.visit("index", IncrementalIndex.this); + return rowSelector.isNull(currEntry.get().getRowIndex(), metricIndex); } @Override - public boolean isNull() + public void inspectRuntimeShape(RuntimeShapeInspector inspector) { - return IncrementalIndex.this.isNull(currEntry.get().getRowIndex(), metricIndex); + inspector.visit("index", rowSelector); } } - private final class ObjectMetricColumnSelector extends ObjectColumnSelector + private static final class FloatMetricColumnSelector implements FloatColumnSelector { + private final IncrementalIndexRowSelector rowSelector; private final IncrementalIndexRowHolder currEntry; private final int metricIndex; - private Class classOfObject; - public ObjectMetricColumnSelector( - MetricDesc metricDesc, + public FloatMetricColumnSelector( + IncrementalIndexRowSelector rowSelector, IncrementalIndexRowHolder currEntry, int metricIndex ) { this.currEntry = currEntry; + this.rowSelector = rowSelector; this.metricIndex = metricIndex; - classOfObject = ComplexMetrics.getSerdeForType(metricDesc.getType()).getObjectStrategy().getClazz(); } - @Nullable @Override - public Object getObject() + public float getFloat() { - return getMetricObjectValue(currEntry.get().getRowIndex(), metricIndex); + return rowSelector.getMetricFloatValue(currEntry.get().getRowIndex(), metricIndex); } @Override - public Class classOfObject() + public void inspectRuntimeShape(RuntimeShapeInspector inspector) { - return classOfObject; + inspector.visit("index", rowSelector); } @Override - public void inspectRuntimeShape(RuntimeShapeInspector inspector) + public boolean isNull() { - inspector.visit("index", IncrementalIndex.this); + return rowSelector.isNull(currEntry.get().getRowIndex(), metricIndex); } } - private final class FloatMetricColumnSelector implements FloatColumnSelector + private static final class DoubleMetricColumnSelector implements DoubleColumnSelector { + private final IncrementalIndexRowSelector rowSelector; private final IncrementalIndexRowHolder currEntry; private final int metricIndex; - public FloatMetricColumnSelector(IncrementalIndexRowHolder currEntry, int metricIndex) + public DoubleMetricColumnSelector( + IncrementalIndexRowSelector rowSelector, + IncrementalIndexRowHolder currEntry, + int metricIndex + ) { this.currEntry = currEntry; + this.rowSelector = rowSelector; this.metricIndex = metricIndex; 
} @Override - public float getFloat() + public double getDouble() { assert NullHandling.replaceWithDefault() || !isNull(); - return getMetricFloatValue(currEntry.get().getRowIndex(), metricIndex); + return rowSelector.getMetricDoubleValue(currEntry.get().getRowIndex(), metricIndex); } @Override - public void inspectRuntimeShape(RuntimeShapeInspector inspector) + public boolean isNull() { - inspector.visit("index", IncrementalIndex.this); + return rowSelector.isNull(currEntry.get().getRowIndex(), metricIndex); } @Override - public boolean isNull() + public void inspectRuntimeShape(RuntimeShapeInspector inspector) { - return IncrementalIndex.this.isNull(currEntry.get().getRowIndex(), metricIndex); + inspector.visit("index", rowSelector); } } - private final class DoubleMetricColumnSelector implements DoubleColumnSelector + private static final class ObjectMetricColumnSelector extends ObjectColumnSelector { + private final IncrementalIndexRowSelector rowSelector; private final IncrementalIndexRowHolder currEntry; private final int metricIndex; + private final Class classOfObject; - public DoubleMetricColumnSelector(IncrementalIndexRowHolder currEntry, int metricIndex) + public ObjectMetricColumnSelector( + IncrementalIndexRowSelector rowSelector, + IncrementalIndexRowHolder currEntry, + MetricDesc metricDesc + ) { this.currEntry = currEntry; - this.metricIndex = metricIndex; + this.rowSelector = rowSelector; + this.metricIndex = metricDesc.getIndex(); + this.classOfObject = ComplexMetrics.getSerdeForType(metricDesc.getType()).getObjectStrategy().getClazz(); } + @Nullable @Override - public double getDouble() + public Object getObject() { - assert NullHandling.replaceWithDefault() || !isNull(); - return getMetricDoubleValue(currEntry.get().getRowIndex(), metricIndex); + return rowSelector.getMetricObjectValue(currEntry.get().getRowIndex(), metricIndex); } @Override - public boolean isNull() + public Class classOfObject() { - return IncrementalIndex.this.isNull(currEntry.get().getRowIndex(), metricIndex); + return classOfObject; } @Override public void inspectRuntimeShape(RuntimeShapeInspector inspector) { - inspector.visit("index", IncrementalIndex.this); + inspector.visit("index", rowSelector); } } } diff --git a/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexColumnSelectorFactory.java b/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexColumnSelectorFactory.java index 86e8c6690c2d..9d60edef0449 100644 --- a/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexColumnSelectorFactory.java +++ b/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexColumnSelectorFactory.java @@ -43,29 +43,29 @@ class IncrementalIndexColumnSelectorFactory implements ColumnSelectorFactory, RowIdSupplier { private final ColumnInspector snapshotColumnInspector; - private final IncrementalIndex index; private final VirtualColumns virtualColumns; private final Order timeOrder; private final IncrementalIndexRowHolder rowHolder; + private final IncrementalIndexRowSelector rowSelector; IncrementalIndexColumnSelectorFactory( - IncrementalIndex index, + IncrementalIndexRowSelector rowSelector, VirtualColumns virtualColumns, Order timeOrder, IncrementalIndexRowHolder rowHolder ) { - this.index = index; this.virtualColumns = virtualColumns; this.timeOrder = timeOrder; this.rowHolder = rowHolder; + this.rowSelector = rowSelector; this.snapshotColumnInspector = new ColumnInspector() { @Nullable @Override public 
ColumnCapabilities getColumnCapabilities(String column) { - return IncrementalIndexCursorFactory.snapshotColumnCapabilities(index, column); + return IncrementalIndexCursorFactory.snapshotColumnCapabilities(rowSelector, column); } }; } @@ -87,13 +87,13 @@ private DimensionSelector makeDimensionSelectorUndecorated(DimensionSpec dimensi if (dimension.equals(ColumnHolder.TIME_COLUMN_NAME) && timeOrder != Order.NONE) { return new SingleScanTimeDimensionSelector( - makeColumnValueSelector(dimension), + makeColumnValueSelector(ColumnHolder.TIME_COLUMN_NAME), extractionFn, timeOrder ); } - final IncrementalIndex.DimensionDesc dimensionDesc = index.getDimension(dimensionSpec.getDimension()); + final IncrementalIndex.DimensionDesc dimensionDesc = rowSelector.getDimension(dimensionSpec.getDimension()); if (dimensionDesc == null) { // not a dimension, column may be a metric ColumnCapabilities capabilities = getColumnCapabilities(dimension); @@ -122,19 +122,17 @@ public ColumnValueSelector makeColumnValueSelector(String columnName) if (virtualColumns.exists(columnName)) { return virtualColumns.makeColumnValueSelector(columnName, this); } - - if (columnName.equals(ColumnHolder.TIME_COLUMN_NAME)) { + if (ColumnHolder.TIME_COLUMN_NAME.equals(columnName)) { return rowHolder; } - final Integer dimIndex = index.getDimensionIndex(columnName); - if (dimIndex != null) { - final IncrementalIndex.DimensionDesc dimensionDesc = index.getDimension(columnName); + final IncrementalIndex.DimensionDesc dimensionDesc = rowSelector.getDimension(columnName); + if (dimensionDesc != null) { final DimensionIndexer indexer = dimensionDesc.getIndexer(); return indexer.makeColumnValueSelector(rowHolder, dimensionDesc); } - return index.makeMetricColumnValueSelector(columnName, rowHolder); + return IncrementalIndex.makeMetricColumnValueSelector(rowSelector, rowHolder, columnName); } @Override diff --git a/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexCursorFactory.java b/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexCursorFactory.java index e034f820dfbf..b73a7b682a30 100644 --- a/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexCursorFactory.java +++ b/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexCursorFactory.java @@ -99,9 +99,9 @@ public ColumnCapabilities getColumnCapabilities(String column) return snapshotColumnCapabilities(index, column); } - static ColumnCapabilities snapshotColumnCapabilities(IncrementalIndex index, String column) + static ColumnCapabilities snapshotColumnCapabilities(IncrementalIndexRowSelector selector, String column) { - IncrementalIndex.DimensionDesc desc = index.getDimension(column); + IncrementalIndex.DimensionDesc desc = selector.getDimension(column); // nested column indexer is a liar, and behaves like any type if it only processes unnested literals of a single // type, so force it to use nested column type if (desc != null && desc.getIndexer() instanceof NestedDataColumnIndexerV4) { @@ -122,7 +122,7 @@ static ColumnCapabilities snapshotColumnCapabilities(IncrementalIndex index, Str // multi-valuedness at cursor creation time, instead of the latest state, and getSnapshotColumnCapabilities could // be removed. 
return ColumnCapabilitiesImpl.snapshot( - index.getColumnCapabilities(column), + selector.getColumnCapabilities(column), COERCE_LOGIC ); } diff --git a/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexCursorHolder.java b/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexCursorHolder.java index 02c09398d8e5..72ec9116d1f7 100644 --- a/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexCursorHolder.java +++ b/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexCursorHolder.java @@ -23,52 +23,46 @@ import org.apache.druid.query.BaseQuery; import org.apache.druid.query.Order; import org.apache.druid.query.OrderBy; -import org.apache.druid.query.filter.Filter; import org.apache.druid.query.filter.ValueMatcher; import org.apache.druid.segment.ColumnSelectorFactory; import org.apache.druid.segment.Cursor; import org.apache.druid.segment.CursorBuildSpec; import org.apache.druid.segment.CursorHolder; import org.apache.druid.segment.Cursors; -import org.apache.druid.segment.VirtualColumns; import org.apache.druid.segment.filter.ValueMatchers; -import org.joda.time.Interval; -import javax.annotation.Nullable; -import java.util.Collections; import java.util.Iterator; import java.util.List; public class IncrementalIndexCursorHolder implements CursorHolder { - private final IncrementalIndex index; + private final IncrementalIndexRowSelector rowSelector; private final CursorBuildSpec spec; private final List ordering; public IncrementalIndexCursorHolder( - IncrementalIndex index, + IncrementalIndexRowSelector rowSelector, CursorBuildSpec spec ) { - this.index = index; + this.rowSelector = rowSelector; this.spec = spec; - if (index.timePosition == 0) { + List ordering = rowSelector.getOrdering(); + if (Cursors.getTimeOrdering(ordering) != Order.NONE) { if (Cursors.preferDescendingTimeOrdering(spec)) { this.ordering = Cursors.descendingTimeOrder(); } else { this.ordering = Cursors.ascendingTimeOrder(); } } else { - // In principle, we could report a sort order here for certain types of fact holders; for example the - // RollupFactsHolder would be sorted by dimensions. However, this is left for future work. 
- this.ordering = Collections.emptyList(); + this.ordering = ordering; } } @Override public Cursor asCursor() { - if (index.isEmpty()) { + if (rowSelector.isEmpty()) { return null; } @@ -76,13 +70,10 @@ public Cursor asCursor() spec.getQueryMetrics().vectorized(false); } - return new IncrementalIndexCursor( - index, - spec.getVirtualColumns(), - Cursors.getTimeOrdering(ordering), - spec.getFilter(), - spec.getInterval() + rowSelector, + spec, + Cursors.getTimeOrdering(ordering) ); } @@ -94,11 +85,11 @@ public List getOrdering() static class IncrementalIndexCursor implements Cursor { - private IncrementalIndexRowHolder currEntry; + private final IncrementalIndexRowSelector rowSelector; + private final IncrementalIndexRowHolder currEntry; private final ColumnSelectorFactory columnSelectorFactory; private final ValueMatcher filterMatcher; private final int maxRowIndex; - private final IncrementalIndex.FactsHolder facts; private Iterator baseIter; private Iterable cursorIterable; private boolean emptyRange; @@ -106,30 +97,31 @@ static class IncrementalIndexCursor implements Cursor private boolean done; IncrementalIndexCursor( - IncrementalIndex index, - VirtualColumns virtualColumns, - Order timeOrder, - @Nullable Filter filter, - Interval actualInterval + IncrementalIndexRowSelector index, + CursorBuildSpec buildSpec, + Order timeOrder ) { currEntry = new IncrementalIndexRowHolder(); - columnSelectorFactory = new IncrementalIndexColumnSelectorFactory( - index, - virtualColumns, - timeOrder, - currEntry - ); // Set maxRowIndex before creating the filterMatcher. See https://github.com/apache/druid/pull/6340 maxRowIndex = index.getLastRowIndex(); - filterMatcher = filter == null ? ValueMatchers.allTrue() : filter.makeMatcher(columnSelectorFactory); numAdvanced = -1; - facts = index.getFacts(); - cursorIterable = facts.timeRangeIterable( + + rowSelector = index; + cursorIterable = rowSelector.getFacts().timeRangeIterable( timeOrder == Order.DESCENDING, - actualInterval.getStartMillis(), - actualInterval.getEndMillis() + buildSpec.getInterval().getStartMillis(), + buildSpec.getInterval().getEndMillis() ); + columnSelectorFactory = new IncrementalIndexColumnSelectorFactory( + rowSelector, + buildSpec.getVirtualColumns(), + timeOrder, + currEntry + ); + filterMatcher = buildSpec.getFilter() == null + ? 
ValueMatchers.allTrue() + : buildSpec.getFilter().makeMatcher(columnSelectorFactory); emptyRange = !cursorIterable.iterator().hasNext(); reset(); @@ -152,7 +144,7 @@ public void advance() while (baseIter.hasNext()) { BaseQuery.checkInterrupted(); - IncrementalIndexRow entry = baseIter.next(); + final IncrementalIndexRow entry = baseIter.next(); if (beyondMaxRowIndex(entry.getRowIndex())) { continue; } diff --git a/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexRow.java b/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexRow.java index 89e94961f6b2..2e817b993ce0 100644 --- a/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexRow.java +++ b/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexRow.java @@ -144,6 +144,8 @@ public Object apply(@Nullable Object input) { if (input == null || (input.getClass().isArray() && Array.getLength(input) == 0)) { return Collections.singletonList("null"); + } else if (input instanceof int[]) { + return Arrays.toString((int[]) input); } return Collections.singletonList(input); } diff --git a/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexRowSelector.java b/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexRowSelector.java new file mode 100644 index 000000000000..bafa127e881e --- /dev/null +++ b/processing/src/main/java/org/apache/druid/segment/incremental/IncrementalIndexRowSelector.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.segment.incremental; + +import org.apache.druid.query.OrderBy; +import org.apache.druid.segment.ColumnInspector; + +import javax.annotation.Nullable; +import java.util.List; + +/** + * Interface that abstracts selecting data from a {@link FactsHolder} + */ +public interface IncrementalIndexRowSelector extends ColumnInspector +{ + /** + * get {@link IncrementalIndex.DimensionDesc} for the specified column, if available, which provides access to things + * like {@link org.apache.druid.segment.DimensionIndexer} and {@link org.apache.druid.segment.DimensionHandler} as + * well as column capabilities and position within the row + */ + @Nullable + IncrementalIndex.DimensionDesc getDimension(String columnName); + + /** + * Get {@link IncrementalIndex.MetricDesc} which provides column capabilities and position in the aggregators section + * of the row + */ + @Nullable + IncrementalIndex.MetricDesc getMetric(String s); + + /** + * Ordering for the data in the facts table + */ + List getOrdering(); + + /** + * Are there any {@link IncrementalIndexRow} stored in the {@link FactsHolder}? 
+ */ + boolean isEmpty(); + + /** + * Get the {@link FactsHolder} containing all of the {@link IncrementalIndexRow} backing this selector + */ + FactsHolder getFacts(); + + /** + * Highest value {@link IncrementalIndexRow#getRowIndex()} available in this selector. Note that these values do not + * reflect the position of the row in the {@link FactsHolder}, but rather just the order in which they were processed + */ + int getLastRowIndex(); + + /** + * @param rowOffset row to get float aggregator value + * @param aggOffset position of the aggregator in the aggregators array of the data schema + * @return float value of the aggregator for this row + */ + float getMetricFloatValue(int rowOffset, int aggOffset); + + /** + * @param rowOffset row to get long aggregator value + * @param aggOffset position of the aggregator in the aggregators array of the data schema + * @return long value of the aggregator for this row + */ + long getMetricLongValue(int rowOffset, int aggOffset); + + /** + * @param rowOffset row to get double aggregator value + * @param aggOffset position of the aggregator in the aggregators array of the data schema + * @return double value of the aggregator for this row + */ + double getMetricDoubleValue(int rowOffset, int aggOffset); + + /** + * @param rowOffset row to get object aggregator value + * @param aggOffset position of the aggregator in the aggregators array of the data schema + * @return object value of the aggregator for this row + */ + @Nullable + Object getMetricObjectValue(int rowOffset, int aggOffset); + + /** + * @param rowOffset row to check for an aggregator value + * @param aggOffset position of the aggregator in the aggregators array of the data schema + * @return is the value null for this row? + */ + boolean isNull(int rowOffset, int aggOffset); +} diff --git a/processing/src/main/java/org/apache/druid/segment/incremental/OnheapIncrementalIndex.java b/processing/src/main/java/org/apache/druid/segment/incremental/OnheapIncrementalIndex.java index b5e580f44f2f..8c554e016fc4 100644 --- a/processing/src/main/java/org/apache/druid/segment/incremental/OnheapIncrementalIndex.java +++ b/processing/src/main/java/org/apache/druid/segment/incremental/OnheapIncrementalIndex.java @@ -155,7 +155,7 @@ public class OnheapIncrementalIndex extends IncrementalIndex } else { this.facts = new PlainNonTimeOrderedFactsHolder(dimsComparator()); } - maxBytesPerRowForAggregators = + this.maxBytesPerRowForAggregators = useMaxMemoryEstimates ? getMaxBytesPerRowForAggregators(incrementalIndexSchema) : 0; this.useMaxMemoryEstimates = useMaxMemoryEstimates; } @@ -252,14 +252,15 @@ protected AddToFactsResult addToFacts( ) throws IndexSizeExceededException { final List parseExceptionMessages = new ArrayList<>(); + final AtomicLong totalSizeInBytes = getBytesInMemory(); + final int priorIndex = facts.getPriorIndex(key); Aggregator[] aggs; final AggregatorFactory[] metrics = getMetrics(); final AtomicInteger numEntries = getNumEntries(); - final AtomicLong totalSizeInBytes = getBytesInMemory(); if (IncrementalIndexRow.EMPTY_ROW_INDEX != priorIndex) { - aggs = concurrentGet(priorIndex); + aggs = aggregators.get(priorIndex); long aggSizeDelta = doAggregate(metrics, aggs, inputRowHolder, parseExceptionMessages); totalSizeInBytes.addAndGet(useMaxMemoryEstimates ?
0 : aggSizeDelta); } else { @@ -272,7 +273,7 @@ protected AddToFactsResult addToFacts( aggSizeForRow += doAggregate(metrics, aggs, inputRowHolder, parseExceptionMessages); final int rowIndex = indexIncrement.getAndIncrement(); - concurrentSet(rowIndex, aggs); + aggregators.put(rowIndex, aggs); // Last ditch sanity checks if ((numEntries.get() >= maxRowCount || totalSizeInBytes.get() >= maxBytesInMemory) @@ -363,6 +364,18 @@ private long doAggregate( InputRowHolder inputRowHolder, List parseExceptionsHolder ) + { + return doAggregate(metrics, aggs, inputRowHolder, parseExceptionsHolder, useMaxMemoryEstimates, preserveExistingMetrics); + } + + private static long doAggregate( + AggregatorFactory[] metrics, + Aggregator[] aggs, + InputRowHolder inputRowHolder, + List parseExceptionsHolder, + boolean useMaxMemoryEstimates, + boolean preserveExistingMetrics + ) { long totalIncrementalBytes = 0L; for (int i = 0; i < metrics.length; i++) { @@ -418,17 +431,6 @@ private void closeAggregators() } } - protected Aggregator[] concurrentGet(int offset) - { - // All get operations should be fine - return aggregators.get(offset); - } - - protected void concurrentSet(int offset, Aggregator[] value) - { - aggregators.put(offset, value); - } - @Override public boolean canAppendRow() { @@ -459,42 +461,53 @@ public String getOutOfRowsReason() return outOfRowsReason; } - protected Aggregator[] getAggsForRow(int rowOffset) - { - return concurrentGet(rowOffset); - } - @Override public float getMetricFloatValue(int rowOffset, int aggOffset) { - return ((Number) getMetricHelper(getMetricAggs(), concurrentGet(rowOffset), aggOffset, Aggregator::getFloat)).floatValue(); + return ((Number) getMetricHelper( + getMetricAggs(), + aggregators.get(rowOffset), + aggOffset, + Aggregator::getFloat + )).floatValue(); } @Override public long getMetricLongValue(int rowOffset, int aggOffset) { - return ((Number) getMetricHelper(getMetricAggs(), concurrentGet(rowOffset), aggOffset, Aggregator::getLong)).longValue(); + return ((Number) getMetricHelper( + getMetricAggs(), + aggregators.get(rowOffset), + aggOffset, + Aggregator::getLong + )).longValue(); } @Override - public Object getMetricObjectValue(int rowOffset, int aggOffset) + public double getMetricDoubleValue(int rowOffset, int aggOffset) { - return getMetricHelper(getMetricAggs(), concurrentGet(rowOffset), aggOffset, Aggregator::get); + return ((Number) getMetricHelper( + getMetricAggs(), + aggregators.get(rowOffset), + aggOffset, + Aggregator::getDouble + )).doubleValue(); } @Override - protected double getMetricDoubleValue(int rowOffset, int aggOffset) + public Object getMetricObjectValue(int rowOffset, int aggOffset) { - return ((Number) getMetricHelper(getMetricAggs(), concurrentGet(rowOffset), aggOffset, Aggregator::getDouble)).doubleValue(); + return getMetricHelper(getMetricAggs(), aggregators.get(rowOffset), aggOffset, Aggregator::get); } @Override public boolean isNull(int rowOffset, int aggOffset) { + final Aggregator[] aggs = aggregators.get(rowOffset); if (preserveExistingMetrics) { - return concurrentGet(rowOffset)[aggOffset].isNull() && concurrentGet(rowOffset)[aggOffset + getMetricAggs().length].isNull(); + return aggs[aggOffset].isNull() && aggs[aggOffset + getMetricAggs().length].isNull(); } else { - return concurrentGet(rowOffset)[aggOffset].isNull(); + return aggs[aggOffset].isNull(); } } @@ -535,7 +548,7 @@ public Iterable iterableWithPostAggregations( theVals.put(dimensionName, rowVals); } - Aggregator[] aggs = getAggsForRow(rowOffset); + Aggregator[] 
aggs = aggregators.get(rowOffset); int aggLength = preserveExistingMetrics ? aggs.length / 2 : aggs.length; for (int i = 0; i < aggLength; ++i) { theVals.put(metrics[i].getName(), getMetricHelper(metrics, aggs, i, Aggregator::get)); @@ -560,11 +573,16 @@ public Iterable iterableWithPostAggregations( * for aggregating from input into output field and the aggregator for combining already aggregated field, as needed */ @Nullable - private Object getMetricHelper(AggregatorFactory[] metrics, Aggregator[] aggs, int aggOffset, Function getMetricTypeFunction) + private Object getMetricHelper( + AggregatorFactory[] metrics, + Aggregator[] aggs, + int aggOffset, + Function getMetricTypeFunction + ) { if (preserveExistingMetrics) { - // Since the preserveExistingMetrics flag is set, we will have to check and possibly retrieve the aggregated values - // from two aggregators, the aggregator for aggregating from input into output field and the aggregator + // Since the preserveExistingMetrics flag is set, we will have to check and possibly retrieve the aggregated + // values from two aggregators, the aggregator for aggregating from input into output field and the aggregator // for combining already aggregated field if (aggs[aggOffset].isNull()) { // If the aggregator for aggregating from input into output field is null, then we get the value from the @@ -583,8 +601,8 @@ private Object getMetricHelper(AggregatorFactory[] metrics, Aggregator[] agg return aggregatorFactory.combine(aggregatedFromSource, aggregatedFromCombined); } } else { - // If preserveExistingMetrics flag is not set then we simply get metrics from the list of Aggregator, aggs, using the - // given aggOffset + // If preserveExistingMetrics flag is not set then we simply get metrics from the list of Aggregator, aggs, + // using the given aggOffset return getMetricTypeFunction.apply(aggs[aggOffset]); } } diff --git a/processing/src/test/java/org/apache/druid/frame/processor/test/TestFrameProcessorUtils.java b/processing/src/test/java/org/apache/druid/frame/processor/test/TestFrameProcessorUtils.java index ae748198aea3..baac335f0c42 100644 --- a/processing/src/test/java/org/apache/druid/frame/processor/test/TestFrameProcessorUtils.java +++ b/processing/src/test/java/org/apache/druid/frame/processor/test/TestFrameProcessorUtils.java @@ -26,10 +26,7 @@ import org.apache.druid.frame.Frame; import org.apache.druid.frame.FrameType; import org.apache.druid.frame.testutil.FrameSequenceBuilder; -import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.CursorFactory; -import org.apache.druid.segment.VirtualColumns; import org.apache.druid.segment.incremental.IncrementalIndex; import org.apache.druid.segment.incremental.IncrementalIndexCursorFactory; import org.apache.druid.segment.incremental.IncrementalIndexSchema; @@ -48,15 +45,11 @@ public static CursorFactory toCursorFactory(List inputRows) { final IncrementalIndex index = new OnheapIncrementalIndex.Builder() .setIndexSchema( - new IncrementalIndexSchema( - 0, - new TimestampSpec("__time", "millis", null), - Granularities.NONE, - VirtualColumns.EMPTY, - DimensionsSpec.builder().useSchemaDiscovery(true).build(), - new AggregatorFactory[0], - false - ) + IncrementalIndexSchema.builder() + .withTimestampSpec(new TimestampSpec("__time", "millis", null)) + .withDimensionsSpec(DimensionsSpec.builder().useSchemaDiscovery(true).build()) + .withRollup(false) + .build() ) .setMaxRowCount(1000) .build(); 
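The test migrations in this patch, in the hunk above and in the indexer tests that follow, all trade the positional IncrementalIndexSchema constructor for its builder, so each call site names only the fields it cares about and inherits defaults for the rest. As a rough illustration of that idiom, here is a minimal sketch under invented names (ToySchema is a hypothetical class, not Druid's actual IncrementalIndexSchema.Builder):

// Toy builder with invented names: each with* method overrides one default, so call
// sites no longer pass placeholder arguments positionally.
final class ToySchema
{
  private final long minTimestamp;
  private final boolean rollup;

  private ToySchema(long minTimestamp, boolean rollup)
  {
    this.minTimestamp = minTimestamp;
    this.rollup = rollup;
  }

  static Builder builder()
  {
    return new Builder();
  }

  static final class Builder
  {
    // Defaults stand in for the filler arguments previously passed positionally.
    private long minTimestamp = 0;
    private boolean rollup = true;

    Builder withMinTimestamp(long minTimestamp)
    {
      this.minTimestamp = minTimestamp;
      return this;
    }

    Builder withRollup(boolean rollup)
    {
      this.rollup = rollup;
      return this;
    }

    ToySchema build()
    {
      return new ToySchema(minTimestamp, rollup);
    }
  }
}

A call site then reads like the migrated tests: ToySchema.builder().withMinTimestamp(0).withRollup(false).build().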
diff --git a/processing/src/test/java/org/apache/druid/segment/AutoTypeColumnIndexerTest.java b/processing/src/test/java/org/apache/druid/segment/AutoTypeColumnIndexerTest.java index 80b6d23d4b85..2c570981f656 100644 --- a/processing/src/test/java/org/apache/druid/segment/AutoTypeColumnIndexerTest.java +++ b/processing/src/test/java/org/apache/druid/segment/AutoTypeColumnIndexerTest.java @@ -26,8 +26,6 @@ import org.apache.druid.data.input.impl.DimensionsSpec; import org.apache.druid.data.input.impl.TimestampSpec; import org.apache.druid.guice.BuiltInTypesModule; -import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.dimension.DefaultDimensionSpec; import org.apache.druid.query.dimension.DimensionSpec; import org.apache.druid.segment.column.ColumnType; @@ -494,18 +492,17 @@ public void testNestedColumnIndexerSchemaDiscoveryTypeCoercion() throws IndexSiz long minTimestamp = System.currentTimeMillis(); IncrementalIndex index = new OnheapIncrementalIndex.Builder() .setIndexSchema( - new IncrementalIndexSchema( - minTimestamp, - new TimestampSpec(TIME_COL, "millis", null), - Granularities.NONE, - VirtualColumns.EMPTY, - DimensionsSpec.builder() - .setDimensions(ImmutableList.of(new AutoTypeColumnSchema(NESTED_COL, ColumnType.STRING))) - .useSchemaDiscovery(true) - .build(), - new AggregatorFactory[0], - false - ) + IncrementalIndexSchema.builder() + .withMinTimestamp(minTimestamp) + .withTimestampSpec(new TimestampSpec(TIME_COL, "millis", null)) + .withDimensionsSpec( + DimensionsSpec.builder() + .setDimensions(ImmutableList.of(new AutoTypeColumnSchema(NESTED_COL, ColumnType.STRING))) + .useSchemaDiscovery(true) + .build() + ) + .withRollup(false) + .build() ) .setMaxRowCount(1000) .build(); @@ -699,15 +696,16 @@ private static IncrementalIndex makeIncrementalIndex(long minTimestamp) { IncrementalIndex index = new OnheapIncrementalIndex.Builder() .setIndexSchema( - new IncrementalIndexSchema( - minTimestamp, - new TimestampSpec(TIME_COL, "millis", null), - Granularities.NONE, - VirtualColumns.EMPTY, - DimensionsSpec.builder().useSchemaDiscovery(true).build(), - new AggregatorFactory[0], - false - ) + IncrementalIndexSchema.builder() + .withMinTimestamp(minTimestamp) + .withTimestampSpec(new TimestampSpec(TIME_COL, "millis", null)) + .withDimensionsSpec( + DimensionsSpec.builder() + .useSchemaDiscovery(true) + .build() + ) + .withRollup(false) + .build() ) .setMaxRowCount(1000) .build(); diff --git a/processing/src/test/java/org/apache/druid/segment/NestedDataColumnIndexerV4Test.java b/processing/src/test/java/org/apache/druid/segment/NestedDataColumnIndexerV4Test.java index 2e9deab42b49..9fc9fc0f578d 100644 --- a/processing/src/test/java/org/apache/druid/segment/NestedDataColumnIndexerV4Test.java +++ b/processing/src/test/java/org/apache/druid/segment/NestedDataColumnIndexerV4Test.java @@ -26,8 +26,6 @@ import org.apache.druid.data.input.impl.DimensionsSpec; import org.apache.druid.data.input.impl.TimestampSpec; import org.apache.druid.guice.BuiltInTypesModule; -import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.query.dimension.DefaultDimensionSpec; import org.apache.druid.query.dimension.DimensionSpec; import org.apache.druid.segment.column.ColumnType; @@ -478,15 +476,16 @@ private static IncrementalIndex makeIncrementalIndex(long minTimestamp) { IncrementalIndex index = new 
OnheapIncrementalIndex.Builder() .setIndexSchema( - new IncrementalIndexSchema( - minTimestamp, - new TimestampSpec(TIME_COL, "millis", null), - Granularities.NONE, - VirtualColumns.EMPTY, - DimensionsSpec.builder().useSchemaDiscovery(true).build(), - new AggregatorFactory[0], - false - ) + IncrementalIndexSchema.builder() + .withMinTimestamp(minTimestamp) + .withTimestampSpec(new TimestampSpec(TIME_COL, "millis", null)) + .withDimensionsSpec( + DimensionsSpec.builder() + .useSchemaDiscovery(true) + .build() + ) + .withRollup(false) + .build() ) .setMaxRowCount(1000) .build(); diff --git a/processing/src/test/java/org/apache/druid/segment/incremental/IncrementalIndexIngestionTest.java b/processing/src/test/java/org/apache/druid/segment/incremental/IncrementalIndexIngestionTest.java index 77e0470c5486..e1a7319cab18 100644 --- a/processing/src/test/java/org/apache/druid/segment/incremental/IncrementalIndexIngestionTest.java +++ b/processing/src/test/java/org/apache/druid/segment/incremental/IncrementalIndexIngestionTest.java @@ -25,9 +25,12 @@ import org.apache.druid.guice.BuiltInTypesModule; import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.query.aggregation.Aggregator; +import org.apache.druid.query.aggregation.AggregatorAndSize; import org.apache.druid.query.aggregation.LongMaxAggregator; import org.apache.druid.query.aggregation.LongMaxAggregatorFactory; import org.apache.druid.segment.CloserRule; +import org.apache.druid.segment.ColumnSelectorFactory; +import org.apache.druid.segment.ColumnValueSelector; import org.apache.druid.testing.InitializedNullHandlingTest; import org.easymock.EasyMock; import org.junit.Rule; @@ -69,22 +72,39 @@ public void testOnHeapIncrementalIndexClose() throws Exception { // Prepare the mocks & set close() call count expectation to 1 Aggregator mockedAggregator = EasyMock.createMock(LongMaxAggregator.class); + EasyMock.expect(mockedAggregator.aggregateWithSize()).andReturn(0L).anyTimes(); mockedAggregator.close(); EasyMock.expectLastCall().times(1); - final IncrementalIndex genericIndex = indexCreator.createIndex( + + EasyMock.replay(mockedAggregator); + + final IncrementalIndex incrementalIndex = indexCreator.createIndex( new IncrementalIndexSchema.Builder() .withQueryGranularity(Granularities.MINUTE) - .withMetrics(new LongMaxAggregatorFactory("max", "max")) + .withMetrics(new LongMaxAggregatorFactory("max", "max") + { + @Override + protected Aggregator factorize(ColumnSelectorFactory metricFactory, ColumnValueSelector selector) + { + return mockedAggregator; + } + + @Override + public AggregatorAndSize factorizeWithSize(ColumnSelectorFactory metricFactory) + { + return new AggregatorAndSize(mockedAggregator, Long.BYTES); + } + }) .build() ); // This test is specific to the on-heap index - if (!(genericIndex instanceof OnheapIncrementalIndex)) { + if (!(incrementalIndex instanceof OnheapIncrementalIndex)) { return; } - final OnheapIncrementalIndex index = (OnheapIncrementalIndex) genericIndex; + final OnheapIncrementalIndex index = (OnheapIncrementalIndex) incrementalIndex; index.add(new MapBasedInputRow( 0, @@ -92,11 +112,7 @@ public void testOnHeapIncrementalIndexClose() throws Exception ImmutableMap.of("billy", 1, "max", 1) )); - // override the aggregators with the mocks - index.concurrentGet(0)[0] = mockedAggregator; - // close the indexer and validate the expectations - EasyMock.replay(mockedAggregator); index.close(); EasyMock.verify(mockedAggregator); } diff --git 
a/processing/src/test/java/org/apache/druid/segment/incremental/IncrementalIndexMultiValueSpecTest.java b/processing/src/test/java/org/apache/druid/segment/incremental/IncrementalIndexMultiValueSpecTest.java index 80c8207ed605..f5779bf73629 100644 --- a/processing/src/test/java/org/apache/druid/segment/incremental/IncrementalIndexMultiValueSpecTest.java +++ b/processing/src/test/java/org/apache/druid/segment/incremental/IncrementalIndexMultiValueSpecTest.java @@ -28,10 +28,7 @@ import org.apache.druid.data.input.impl.StringDimensionSchema; import org.apache.druid.data.input.impl.TimestampSpec; import org.apache.druid.guice.BuiltInTypesModule; -import org.apache.druid.java.util.common.granularity.Granularities; -import org.apache.druid.query.aggregation.AggregatorFactory; import org.apache.druid.segment.CloserRule; -import org.apache.druid.segment.VirtualColumns; import org.apache.druid.testing.InitializedNullHandlingTest; import org.junit.Assert; import org.junit.Rule; @@ -80,15 +77,11 @@ public void test() throws IndexSizeExceededException new StringDimensionSchema("string3", DimensionSchema.MultiValueHandling.SORTED_SET, true) ) ); - IncrementalIndexSchema schema = new IncrementalIndexSchema( - 0, - new TimestampSpec("ds", "auto", null), - Granularities.ALL, - VirtualColumns.EMPTY, - dimensionsSpec, - new AggregatorFactory[0], - false - ); + IncrementalIndexSchema schema = IncrementalIndexSchema.builder() + .withTimestampSpec(new TimestampSpec("ds", "auto", null)) + .withDimensionsSpec(dimensionsSpec) + .withRollup(false) + .build(); Map map = new HashMap() { @Override diff --git a/processing/src/test/java/org/apache/druid/segment/virtual/ExpressionSelectorsTest.java b/processing/src/test/java/org/apache/druid/segment/virtual/ExpressionSelectorsTest.java index 632a848830d8..a3060f078a2c 100644 --- a/processing/src/test/java/org/apache/druid/segment/virtual/ExpressionSelectorsTest.java +++ b/processing/src/test/java/org/apache/druid/segment/virtual/ExpressionSelectorsTest.java @@ -25,7 +25,6 @@ import org.apache.druid.common.config.NullHandling; import org.apache.druid.common.guava.SettableSupplier; import org.apache.druid.data.input.MapBasedInputRow; -import org.apache.druid.data.input.impl.DimensionsSpec; import org.apache.druid.data.input.impl.TimestampSpec; import org.apache.druid.java.util.common.DateTimes; import org.apache.druid.java.util.common.granularity.Granularities; @@ -50,7 +49,6 @@ import org.apache.druid.segment.QueryableIndex; import org.apache.druid.segment.QueryableIndexCursorFactory; import org.apache.druid.segment.TestObjectColumnSelector; -import org.apache.druid.segment.VirtualColumns; import org.apache.druid.segment.column.ColumnCapabilities; import org.apache.druid.segment.column.ColumnCapabilitiesImpl; import org.apache.druid.segment.column.ColumnType; @@ -620,15 +618,10 @@ public void test_incrementalIndexStringSelector() throws IndexSizeExceededExcept // underlying dimension selector. // This occurred during schemaless ingestion with sparse dimension values and no explicit null rows, so the // conditions are replicated by this test.
See https://github.com/apache/druid/pull/10248 for details - IncrementalIndexSchema schema = new IncrementalIndexSchema( - 0, - new TimestampSpec("time", "millis", DateTimes.nowUtc()), - Granularities.NONE, - VirtualColumns.EMPTY, - DimensionsSpec.EMPTY, - new AggregatorFactory[]{new CountAggregatorFactory("count")}, - true - ); + IncrementalIndexSchema schema = IncrementalIndexSchema.builder() + .withTimestampSpec(new TimestampSpec("time", "millis", DateTimes.nowUtc())) + .withMetrics(new AggregatorFactory[]{new CountAggregatorFactory("count")}) + .build(); IncrementalIndex index = new OnheapIncrementalIndex.Builder().setMaxRowCount(100).setIndexSchema(schema).build(); index.add( From 7ac7d65b07db99639efe5a714c207d94e8063ba0 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Sun, 15 Sep 2024 17:03:18 -0700 Subject: [PATCH 24/47] Speed up FrameFileTest, SuperSorterTest. (#17068) * Speed up FrameFileTest, SuperSorterTest. These are two heavily parameterized tests that, together, account for about 60% of runtime in the test suite. FrameFileTest changes: 1) Cache frame files in a static, rather than building the frame file for each parameterization of the test. 2) Adjust TestArrayCursorFactory to cache the signature, rather than re-creating it on each call to getColumnCapabilities. SuperSorterTest changes: 1) Dramatically reduce the number of tests that run with "maxRowsPerFrame" = 1. These are particularly slow due to writing so many small files. Some still run, since it's useful to test edge cases, but much fewer than before. 2) Reduce the "maxActiveProcessors" axis of the test from [1, 2, 4] to [1, 3]. The aim is to reduce the number of cases while still getting good coverage of the feature. 3) Reduce the "maxChannelsPerProcessor" axis of the test from [2, 3, 8] to [2, 7]. The aim is to reduce the number of cases while still getting good coverage of the feature. 4) Use in-memory input channels rather than file channels. 5) Defer formatting of assertion failure messages until they are needed. 6) Cache the cursor factory and its signature in a static. 7) Cache sorted test rows (used for verification) in a static. * It helps to include the file. * Style. 
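The frame-file caching in item 1 of the FrameFileTest changes follows a common pattern for heavily parameterized JUnit suites: key the expensive fixture by the parameter combination, build it at most once per key, and clear the static map once the class finishes. The sketch below is a minimal illustration of that pattern only; FixtureCache, fixtureFor, and buildFixture are invented names, not the actual FrameFileTest members.

import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

final class FixtureCache
{
  // One entry per distinct parameter combination of the parameterized suite.
  private static final Map<String, byte[]> FIXTURES = new ConcurrentHashMap<>();

  static byte[] fixtureFor(String parameterKey)
  {
    // computeIfAbsent runs the expensive build at most once per key;
    // later parameterizations reuse the cached bytes.
    return FIXTURES.computeIfAbsent(parameterKey, FixtureCache::buildFixture);
  }

  private static byte[] buildFixture(String parameterKey)
  {
    // Stand-in for the real work of writing a frame file to a byte array.
    return parameterKey.getBytes(StandardCharsets.UTF_8);
  }

  // Invoke from an @AfterClass hook so the cache does not outlive the test class.
  static void clear()
  {
    FIXTURES.clear();
  }
}

The same map-plus-@AfterClass shape appears in the diff below as FRAME_FILES and afterClass().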
--- .../frame/key/RowKeyComparisonRunLengths.java | 7 +- .../druid/frame/TestArrayCursorFactory.java | 37 +-- .../druid/frame/file/FrameFileTest.java | 155 ++++++++++--- .../frame/processor/SuperSorterTest.java | 219 +++++++++++------- .../frame/testutil/FrameSequenceBuilder.java | 16 +- .../druid/frame/testutil/FrameTestUtil.java | 27 ++- 6 files changed, 316 insertions(+), 145 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/frame/key/RowKeyComparisonRunLengths.java b/processing/src/main/java/org/apache/druid/frame/key/RowKeyComparisonRunLengths.java index cc05dea993e5..ab6797a7c003 100644 --- a/processing/src/main/java/org/apache/druid/frame/key/RowKeyComparisonRunLengths.java +++ b/processing/src/main/java/org/apache/druid/frame/key/RowKeyComparisonRunLengths.java @@ -88,11 +88,12 @@ public static RowKeyComparisonRunLengths create(final List keyColumns ); } - ColumnType columnType = rowSignature.getColumnType(keyColumn.columnName()) - .orElseThrow(() -> DruidException.defensive("Need column types")); + ColumnType columnType = + rowSignature.getColumnType(keyColumn.columnName()) + .orElseThrow(() -> DruidException.defensive("No type for column[%s]", keyColumn.columnName())); // First key column to be processed - if (runLengthEntryBuilders.size() == 0) { + if (runLengthEntryBuilders.isEmpty()) { final boolean isByteComparable = isByteComparable(columnType); runLengthEntryBuilders.add( new RunLengthEntryBuilder(isByteComparable, keyColumn.order()) diff --git a/processing/src/test/java/org/apache/druid/frame/TestArrayCursorFactory.java b/processing/src/test/java/org/apache/druid/frame/TestArrayCursorFactory.java index 2a6116bfddff..4bde3d31fe0c 100644 --- a/processing/src/test/java/org/apache/druid/frame/TestArrayCursorFactory.java +++ b/processing/src/test/java/org/apache/druid/frame/TestArrayCursorFactory.java @@ -48,9 +48,12 @@ */ public class TestArrayCursorFactory extends QueryableIndexCursorFactory { + private final RowSignature signature; + public TestArrayCursorFactory(QueryableIndex index) { super(index); + this.signature = computeRowSignature(index); } @Override @@ -81,15 +84,31 @@ public void close() }; } - @Override public RowSignature getRowSignature() + { + return signature; + } + + @Nullable + @Override + public ColumnCapabilities getColumnCapabilities(String column) + { + final ColumnCapabilities ourType = getRowSignature().getColumnCapabilities(column); + if (ourType != null) { + return ColumnCapabilitiesImpl.copyOf(super.getColumnCapabilities(column)).setType(ourType.toColumnType()); + } else { + return super.getColumnCapabilities(column); + } + } + + private static RowSignature computeRowSignature(final QueryableIndex index) { final RowSignature.Builder builder = RowSignature.builder(); builder.addTimeColumn(); - for (final String column : super.getRowSignature().getColumnNames()) { - ColumnCapabilities columnCapabilities = super.getColumnCapabilities(column); + for (final String column : new QueryableIndexCursorFactory(index).getRowSignature().getColumnNames()) { + ColumnCapabilities columnCapabilities = index.getColumnCapabilities(column); ColumnType columnType = columnCapabilities == null ? 
null : columnCapabilities.toColumnType(); //change MV strings columns to Array if (columnType != null @@ -103,18 +122,6 @@ public RowSignature getRowSignature() return builder.build(); } - @Nullable - @Override - public ColumnCapabilities getColumnCapabilities(String column) - { - final ColumnCapabilities ourType = getRowSignature().getColumnCapabilities(column); - if (ourType != null) { - return ColumnCapabilitiesImpl.copyOf(super.getColumnCapabilities(column)).setType(ourType.toColumnType()); - } else { - return super.getColumnCapabilities(column); - } - } - private class DecoratedCursor implements Cursor { private final Cursor cursor; diff --git a/processing/src/test/java/org/apache/druid/frame/file/FrameFileTest.java b/processing/src/test/java/org/apache/druid/frame/file/FrameFileTest.java index 9c92eb14e3e5..c916a458564c 100644 --- a/processing/src/test/java/org/apache/druid/frame/file/FrameFileTest.java +++ b/processing/src/test/java/org/apache/druid/frame/file/FrameFileTest.java @@ -39,6 +39,7 @@ import org.apache.druid.segment.incremental.IncrementalIndexCursorFactory; import org.apache.druid.testing.InitializedNullHandlingTest; import org.hamcrest.Matchers; +import org.junit.AfterClass; import org.junit.Assert; import org.junit.Assume; import org.junit.Before; @@ -49,17 +50,28 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; +import java.io.OutputStream; import java.math.RoundingMode; +import java.nio.file.Files; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.Objects; import java.util.function.Function; import java.util.stream.IntStream; @RunWith(Parameterized.class) public class FrameFileTest extends InitializedNullHandlingTest { + /** + * Static cache of generated frame files, to speed up tests. Cleared in {@link #afterClass()}. + */ + private static final Map FRAME_FILES = new HashMap<>(); + // Partition every 99 rows if "partitioned" is true. private static final int PARTITION_SIZE = 99; @@ -122,6 +134,7 @@ int getRowCount() }; abstract CursorFactory getCursorFactory(); + abstract int getRowCount(); } @@ -195,38 +208,21 @@ public void setUp() throws IOException { cursorFactory = adapterType.getCursorFactory(); rowCount = adapterType.getRowCount(); + file = temporaryFolder.newFile(); - if (partitioned) { - // Partition every PARTITION_SIZE rows. - file = FrameTestUtil.writeFrameFileWithPartitions( - FrameSequenceBuilder.fromCursorFactory(cursorFactory).frameType(frameType).maxRowsPerFrame(maxRowsPerFrame).frames().map( - new Function>() - { - private int rows = 0; - - @Override - public IntObjectPair apply(final Frame frame) - { - final int partitionNum = rows / PARTITION_SIZE; - rows += frame.numRows(); - return IntObjectPair.of( - partitionNum >= SKIP_PARTITION ? 
partitionNum + 1 : partitionNum, - frame - ); - } - } - ), - temporaryFolder.newFile() - ); - - } else { - file = FrameTestUtil.writeFrameFile( - FrameSequenceBuilder.fromCursorFactory(cursorFactory).frameType(frameType).maxRowsPerFrame(maxRowsPerFrame).frames(), - temporaryFolder.newFile() - ); + try (final OutputStream out = Files.newOutputStream(file.toPath())) { + final FrameFileKey frameFileKey = new FrameFileKey(adapterType, frameType, maxRowsPerFrame, partitioned); + final byte[] frameFileBytes = FRAME_FILES.computeIfAbsent(frameFileKey, FrameFileTest::computeFrameFile); + out.write(frameFileBytes); } } + @AfterClass + public static void afterClass() + { + FRAME_FILES.clear(); + } + @Test public void test_numFrames() throws IOException { @@ -414,4 +410,107 @@ private static int countRows(final CursorFactory cursorFactory) return FrameTestUtil.readRowsFromCursorFactory(cursorFactory, RowSignature.empty(), false) .accumulate(0, (i, in) -> i + 1); } + + /** + * Returns bytes, in frame file format, corresponding to the given {@link FrameFileKey}. + */ + private static byte[] computeFrameFile(final FrameFileKey frameFileKey) + { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + + try { + if (frameFileKey.partitioned) { + // Partition every PARTITION_SIZE rows. + FrameTestUtil.writeFrameFileWithPartitions( + FrameSequenceBuilder.fromCursorFactory(frameFileKey.adapterType.getCursorFactory()) + .frameType(frameFileKey.frameType) + .maxRowsPerFrame(frameFileKey.maxRowsPerFrame) + .frames() + .map( + new Function>() + { + private int rows = 0; + + @Override + public IntObjectPair apply(final Frame frame) + { + final int partitionNum = rows / PARTITION_SIZE; + rows += frame.numRows(); + return IntObjectPair.of( + partitionNum >= SKIP_PARTITION ? partitionNum + 1 : partitionNum, + frame + ); + } + } + ), + baos + ); + } else { + FrameTestUtil.writeFrameFile( + FrameSequenceBuilder.fromCursorFactory(frameFileKey.adapterType.getCursorFactory()) + .frameType(frameFileKey.frameType) + .maxRowsPerFrame(frameFileKey.maxRowsPerFrame) + .frames(), + baos + ); + } + } + catch (IOException e) { + throw new RuntimeException(e); + } + + return baos.toByteArray(); + } + + /** + * Key for {@link #FRAME_FILES}, and input to {@link #computeFrameFile(FrameFileKey)}. 
+ */ + private static class FrameFileKey + { + final AdapterType adapterType; + final FrameType frameType; + final int maxRowsPerFrame; + final boolean partitioned; + + public FrameFileKey(AdapterType adapterType, FrameType frameType, int maxRowsPerFrame, boolean partitioned) + { + this.adapterType = adapterType; + this.frameType = frameType; + this.maxRowsPerFrame = maxRowsPerFrame; + this.partitioned = partitioned; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + FrameFileKey that = (FrameFileKey) o; + return maxRowsPerFrame == that.maxRowsPerFrame + && partitioned == that.partitioned + && adapterType == that.adapterType + && frameType == that.frameType; + } + + @Override + public int hashCode() + { + return Objects.hash(adapterType, frameType, maxRowsPerFrame, partitioned); + } + + @Override + public String toString() + { + return "FrameFileKey{" + + "adapterType=" + adapterType + + ", frameType=" + frameType + + ", maxRowsPerFrame=" + maxRowsPerFrame + + ", partitioned=" + partitioned + + '}'; + } + } } diff --git a/processing/src/test/java/org/apache/druid/frame/processor/SuperSorterTest.java b/processing/src/test/java/org/apache/druid/frame/processor/SuperSorterTest.java index 2149d6cbf1c7..7a885af49c54 100644 --- a/processing/src/test/java/org/apache/druid/frame/processor/SuperSorterTest.java +++ b/processing/src/test/java/org/apache/druid/frame/processor/SuperSorterTest.java @@ -29,13 +29,7 @@ import org.apache.druid.frame.FrameType; import org.apache.druid.frame.allocation.ArenaMemoryAllocator; import org.apache.druid.frame.channel.BlockingQueueFrameChannel; -import org.apache.druid.frame.channel.ByteTracker; -import org.apache.druid.frame.channel.ReadableFileFrameChannel; import org.apache.druid.frame.channel.ReadableFrameChannel; -import org.apache.druid.frame.channel.WritableFrameChannel; -import org.apache.druid.frame.channel.WritableFrameFileChannel; -import org.apache.druid.frame.file.FrameFile; -import org.apache.druid.frame.file.FrameFileWriter; import org.apache.druid.frame.key.ClusterBy; import org.apache.druid.frame.key.ClusterByPartition; import org.apache.druid.frame.key.ClusterByPartitions; @@ -47,6 +41,7 @@ import org.apache.druid.frame.read.FrameReader; import org.apache.druid.frame.testutil.FrameSequenceBuilder; import org.apache.druid.frame.testutil.FrameTestUtil; +import org.apache.druid.frame.write.FrameWriters; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.concurrent.Execs; @@ -62,8 +57,10 @@ import org.hamcrest.MatcherAssert; import org.hamcrest.Matchers; import org.junit.After; +import org.junit.AfterClass; import org.junit.Assert; import org.junit.Before; +import org.junit.BeforeClass; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; @@ -73,12 +70,12 @@ import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; -import java.nio.channels.Channels; -import java.nio.file.Files; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; @@ -228,6 +225,15 @@ public void testLimitHint() throws Exception @RunWith(Parameterized.class) public static class ParameterizedCasesTest extends InitializedNullHandlingTest { + private static 
CursorFactory CURSOR_FACTORY; + private static RowSignature CURSOR_FACTORY_SIGNATURE_WITH_ROW_NUMBER; + + /** + * Static cache of sorted versions of the {@link #CURSOR_FACTORY} dataset, to speed up tests. + * Cleared in {@link #tearDownClass()}. + */ + private static final Map>> SORTED_TEST_ROWS = new HashMap<>(); + @Rule public TemporaryFolder temporaryFolder = new TemporaryFolder(); @@ -241,7 +247,6 @@ public static class ParameterizedCasesTest extends InitializedNullHandlingTest private final boolean partitionsDeferred; private final long limitHint; - private CursorFactory cursorFactory; private RowSignature signature; private FrameProcessorExecutor exec; private List inputChannels; @@ -285,11 +290,12 @@ public static Iterable constructorFeeder() { final List constructors = new ArrayList<>(); - for (int maxRowsPerFrame : new int[]{Integer.MAX_VALUE, 50, 1}) { + // Add some constructors for testing maxRowsPerFrame > 1. Later on, we'll add some for maxRowsPerFrame = 1. + for (int maxRowsPerFrame : new int[]{Integer.MAX_VALUE, 50}) { for (int maxBytesPerFrame : new int[]{20_000, 2_000_000}) { for (int numChannels : new int[]{1, 3}) { - for (int maxActiveProcessors : new int[]{1, 2, 4}) { - for (int maxChannelsPerProcessor : new int[]{2, 3, 8}) { + for (int maxActiveProcessors : new int[]{1, 3}) { + for (int maxChannelsPerProcessor : new int[]{2, 7}) { for (int numThreads : new int[]{1, 3}) { for (boolean isComposedStorage : new boolean[]{true, false}) { for (boolean partitionsDeferred : new boolean[]{true, false}) { @@ -317,16 +323,51 @@ public static Iterable constructorFeeder() } } + // Add some constructors for testing maxRowsPerFrame = 1. This isn't part of the full matrix since it's quite + // slow, but we still want to exercise it a bit. + for (boolean isComposedStorage : new boolean[]{true, false}) { + for (long limitHint : new long[]{SuperSorter.UNLIMITED, 3, 1_000}) { + constructors.add( + new Object[]{ + 1 /* maxRowsPerFrame */, + 20_000 /* maxBytesPerFrame */, + 3 /* numChannels */, + 2 /* maxActiveProcessors */, + 3 /* maxChannelsPerProcessor */, + 1 /* numThreads */, + isComposedStorage, + false /* partitionsDeferred */, + limitHint + } + ); + } + } + return constructors; } + @BeforeClass + public static void setUpClass() + { + CURSOR_FACTORY = new QueryableIndexCursorFactory(TestIndex.getNoRollupMMappedTestIndex()); + CURSOR_FACTORY_SIGNATURE_WITH_ROW_NUMBER = + FrameSequenceBuilder.signatureWithRowNumber(CURSOR_FACTORY.getRowSignature()); + } + + @AfterClass + public static void tearDownClass() + { + CURSOR_FACTORY = null; + CURSOR_FACTORY_SIGNATURE_WITH_ROW_NUMBER = null; + SORTED_TEST_ROWS.clear(); + } + @Before public void setUp() { exec = new FrameProcessorExecutor( MoreExecutors.listeningDecorator(Execs.multiThreaded(numThreads, getClass().getSimpleName() + "[%d]")) ); - cursorFactory = new QueryableIndexCursorFactory(TestIndex.getNoRollupMMappedTestIndex()); } @After @@ -352,15 +393,15 @@ private void setUpInputChannels(final ClusterBy clusterBy) throws Exception } final FrameSequenceBuilder frameSequenceBuilder = - FrameSequenceBuilder.fromCursorFactory(cursorFactory) + FrameSequenceBuilder.fromCursorFactory(CURSOR_FACTORY) .maxRowsPerFrame(maxRowsPerFrame) .sortBy(clusterBy.getColumns()) .allocator(ArenaMemoryAllocator.create(ByteBuffer.allocate(maxBytesPerFrame))) .frameType(FrameType.ROW_BASED) .populateRowNumber(); - inputChannels = makeFileChannels(frameSequenceBuilder.frames(), temporaryFolder.newFolder(), numChannels); - signature = 
frameSequenceBuilder.signature(); + inputChannels = makeRoundRobinChannels(frameSequenceBuilder.frames(), numChannels); + signature = FrameWriters.sortableSignature(CURSOR_FACTORY_SIGNATURE_WITH_ROW_NUMBER, clusterBy.getColumns()); frameReader = FrameReader.create(signature); } @@ -411,7 +452,7 @@ private void verifySuperSorter( Assert.assertEquals(clusterByPartitions.size(), outputChannels.getAllChannels().size()); Assert.assertEquals(Double.valueOf(1.0), superSorterProgressTracker.snapshot().getProgressDigest()); - final int[] clusterByPartColumns = clusterBy.getColumns().stream().mapToInt( + final int[] clusterByColumns = clusterBy.getColumns().stream().mapToInt( part -> signature.indexOf(part.columnName()) ).toArray(); @@ -427,33 +468,36 @@ private void verifySuperSorter( frameReader ).forEach( row -> { - final Object[] array = new Object[clusterByPartColumns.length]; + final Object[] array = new Object[clusterByColumns.length]; for (int i = 0; i < array.length; i++) { - array[i] = row.get(clusterByPartColumns[i]); + array[i] = row.get(clusterByColumns[i]); } final RowKey key = createKey(clusterBy, array); - Assert.assertTrue( - StringUtils.format( - "Key %s >= partition %,d start %s", - keyReader.read(key), - partitionNumber, - partition.getStart() == null ? null : keyReader.read(partition.getStart()) - ), - partition.getStart() == null || keyComparator.compare(key, partition.getStart()) >= 0 - ); - - Assert.assertTrue( - StringUtils.format( - "Key %s < partition %,d end %s", - keyReader.read(key), - partitionNumber, - partition.getEnd() == null ? null : keyReader.read(partition.getEnd()) - ), - partition.getEnd() == null || keyComparator.compare(key, partition.getEnd()) < 0 - ); + if (!(partition.getStart() == null || keyComparator.compare(key, partition.getStart()) >= 0)) { + // Defer formatting of error message until it's actually needed + Assert.fail( + StringUtils.format( + "Key %s >= partition %,d start %s", + keyReader.read(key), + partitionNumber, + partition.getStart() == null ? null : keyReader.read(partition.getStart()) + ) + ); + } + + if (!(partition.getEnd() == null || keyComparator.compare(key, partition.getEnd()) < 0)) { + Assert.fail( + StringUtils.format( + "Key %s < partition %,d end %s", + keyReader.read(key), + partitionNumber, + partition.getEnd() == null ? null : keyReader.read(partition.getEnd()) + ) + ); + } readRows.add(row); } @@ -464,21 +508,9 @@ private void verifySuperSorter( MatcherAssert.assertThat(readRows.size(), Matchers.greaterThanOrEqualTo(Ints.checkedCast(limitHint))); } - final Sequence> expectedRows = Sequences.sort( - FrameTestUtil.readRowsFromCursorFactory(cursorFactory, signature, true), - Comparator.comparing( - row -> { - final Object[] array = new Object[clusterByPartColumns.length]; - - for (int i = 0; i < array.length; i++) { - array[i] = row.get(clusterByPartColumns[i]); - } - - return createKey(clusterBy, array); - }, - keyComparator - ) - ).limit(limitHint == SuperSorter.UNLIMITED ? Long.MAX_VALUE : readRows.size()); + final Sequence> expectedRows = + Sequences.simple(getOrComputeSortedTestRows(clusterBy)) + .limit(limitHint == SuperSorter.UNLIMITED ? Long.MAX_VALUE : readRows.size()); FrameTestUtil.assertRowsEqual(expectedRows, Sequences.simple(readRows)); } @@ -724,29 +756,63 @@ private RowKey createKey(final ClusterBy clusterBy, final Object... 
objects) final RowSignature keySignature = KeyTestUtils.createKeySignature(clusterBy.getColumns(), signature); return KeyTestUtils.createKey(keySignature, objects); } + + /** + * Retrieve sorted test rows from {@link #SORTED_TEST_ROWS}, or else compute using + * {@link #computeSortedTestRows(ClusterBy)}. + */ + private static List> getOrComputeSortedTestRows(final ClusterBy clusterBy) + { + return SORTED_TEST_ROWS.computeIfAbsent(clusterBy, SuperSorterTest.ParameterizedCasesTest::computeSortedTestRows); + } + + /** + * Sort test rows from {@link TestIndex#getNoRollupMMappedTestIndex()} by the given {@link ClusterBy}. + */ + private static List> computeSortedTestRows(final ClusterBy clusterBy) + { + final QueryableIndexCursorFactory cursorFactory = + new QueryableIndexCursorFactory(TestIndex.getNoRollupMMappedTestIndex()); + final RowSignature signature = + FrameWriters.sortableSignature( + FrameSequenceBuilder.signatureWithRowNumber(cursorFactory.getRowSignature()), + clusterBy.getColumns() + ); + final RowSignature keySignature = KeyTestUtils.createKeySignature(clusterBy.getColumns(), signature); + final int[] clusterByColumns = + clusterBy.getColumns().stream().mapToInt(part -> signature.indexOf(part.columnName())).toArray(); + final Comparator keyComparator = clusterBy.keyComparator(keySignature); + + return Sequences.sort( + FrameTestUtil.readRowsFromCursorFactory(cursorFactory, signature, true), + Comparator.comparing( + row -> { + final Object[] array = new Object[clusterByColumns.length]; + + for (int i = 0; i < array.length; i++) { + array[i] = row.get(clusterByColumns[i]); + } + + return KeyTestUtils.createKey(keySignature, array); + }, + keyComparator + ) + ).toList(); + } } - private static List makeFileChannels( + /** + * Distribute frames round-robin to some number of channels. 
+ */ + private static List makeRoundRobinChannels( final Sequence frames, - final File tmpDir, final int numChannels ) throws IOException { - final List files = new ArrayList<>(); - final List writableChannels = new ArrayList<>(); + final List channels = new ArrayList<>(numChannels); for (int i = 0; i < numChannels; i++) { - final File file = new File(tmpDir, StringUtils.format("channel-%d", i)); - files.add(file); - writableChannels.add( - new WritableFrameFileChannel( - FrameFileWriter.open( - Channels.newChannel(Files.newOutputStream(file.toPath())), - null, - ByteTracker.unboundedTracker() - ) - ) - ); + channels.add(new BlockingQueueFrameChannel(2000) /* enough even for 1 row per frame; dataset has < 2000 rows */); } frames.forEach( @@ -758,7 +824,7 @@ private static List makeFileChannels( public void accept(final Frame frame) { try { - writableChannels.get(i % writableChannels.size()).write(frame); + channels.get(i % channels.size()).writable().write(frame); } catch (IOException e) { throw new RuntimeException(e); @@ -771,20 +837,11 @@ public void accept(final Frame frame) final List retVal = new ArrayList<>(); - for (int i = 0; i < writableChannels.size(); i++) { - WritableFrameChannel writableChannel = writableChannels.get(i); - writableChannel.close(); - retVal.add(new ReadableFileFrameChannel(FrameFile.open(files.get(i), null))); + for (final BlockingQueueFrameChannel channel : channels) { + channel.writable().close(); + retVal.add(channel.readable()); } return retVal; } - - private static long countSequence(final Sequence sequence) - { - return sequence.accumulate( - 0L, - (accumulated, in) -> accumulated + 1 - ); - } } diff --git a/processing/src/test/java/org/apache/druid/frame/testutil/FrameSequenceBuilder.java b/processing/src/test/java/org/apache/druid/frame/testutil/FrameSequenceBuilder.java index d3fbffd7b4af..1cb6298b8b10 100644 --- a/processing/src/test/java/org/apache/druid/frame/testutil/FrameSequenceBuilder.java +++ b/processing/src/test/java/org/apache/druid/frame/testutil/FrameSequenceBuilder.java @@ -67,6 +67,17 @@ public static FrameSequenceBuilder fromCursorFactory(final CursorFactory cursorF return new FrameSequenceBuilder(cursorFactory); } + /** + * Returns what {@link #signature()} would return if {@link #populateRowNumber()} is set. 
+ */ + public static RowSignature signatureWithRowNumber(final RowSignature signature) + { + return RowSignature.builder() + .addAll(signature) + .add(FrameTestUtil.ROW_NUMBER_COLUMN, ColumnType.LONG) + .build(); + } + public FrameSequenceBuilder frameType(final FrameType frameType) { this.frameType = frameType; @@ -108,10 +119,7 @@ public RowSignature signature() final RowSignature baseSignature; if (populateRowNumber) { - baseSignature = RowSignature.builder() - .addAll(cursorFactory.getRowSignature()) - .add(FrameTestUtil.ROW_NUMBER_COLUMN, ColumnType.LONG) - .build(); + baseSignature = signatureWithRowNumber(cursorFactory.getRowSignature()); } else { baseSignature = cursorFactory.getRowSignature(); } diff --git a/processing/src/test/java/org/apache/druid/frame/testutil/FrameTestUtil.java b/processing/src/test/java/org/apache/druid/frame/testutil/FrameTestUtil.java index c75a57a86990..2bb8789740c4 100644 --- a/processing/src/test/java/org/apache/druid/frame/testutil/FrameTestUtil.java +++ b/processing/src/test/java/org/apache/druid/frame/testutil/FrameTestUtil.java @@ -56,9 +56,10 @@ import javax.annotation.Nullable; import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; +import java.io.OutputStream; import java.nio.channels.Channels; +import java.nio.file.Files; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -79,12 +80,14 @@ private FrameTestUtil() public static File writeFrameFile(final Sequence frames, final File file) throws IOException { - try ( - final FileOutputStream fos = new FileOutputStream(file); - final FrameFileWriter writer = FrameFileWriter.open( - Channels.newChannel(fos), null, ByteTracker.unboundedTracker() - ) - ) { + writeFrameFile(frames, Files.newOutputStream(file.toPath())); + return file; + } + + public static void writeFrameFile(final Sequence frames, final OutputStream out) throws IOException + { + try (final FrameFileWriter writer = + FrameFileWriter.open(Channels.newChannel(out), null, ByteTracker.unboundedTracker())) { frames.forEach( frame -> { try { @@ -96,17 +99,15 @@ public static File writeFrameFile(final Sequence frames, final File file) } ); } - - return file; } - public static File writeFrameFileWithPartitions( + public static void writeFrameFileWithPartitions( final Sequence> framesWithPartitions, - final File file + final OutputStream out ) throws IOException { try (final FrameFileWriter writer = FrameFileWriter.open( - Channels.newChannel(new FileOutputStream(file)), + Channels.newChannel(out), null, ByteTracker.unboundedTracker() )) { @@ -121,8 +122,6 @@ public static File writeFrameFileWithPartitions( } ); } - - return file; } public static void assertRowsEqual(final Sequence> expected, final Sequence> actual) From 6b0d17398286c0fd1cdb38020479c8b6d718df6e Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Sun, 15 Sep 2024 23:02:12 -0700 Subject: [PATCH 25/47] Remove some unnecessary JoinableFactoryWrappers. (#17051) * Remove some unnecessary JoinableFactoryWrappers. * Remove unused import. 
--- .../druid/testing/tools/ServerManagerForQueryErrorTest.java | 3 --- .../server/coordination/ServerManagerForQueryErrorTest.java | 3 --- .../org/apache/druid/server/coordination/ServerManager.java | 4 ---- .../apache/druid/server/coordination/ServerManagerTest.java | 2 -- 4 files changed, 12 deletions(-) diff --git a/integration-tests-ex/tools/src/main/java/org/apache/druid/testing/tools/ServerManagerForQueryErrorTest.java b/integration-tests-ex/tools/src/main/java/org/apache/druid/testing/tools/ServerManagerForQueryErrorTest.java index 60a057ece799..d5b82e9c8d58 100644 --- a/integration-tests-ex/tools/src/main/java/org/apache/druid/testing/tools/ServerManagerForQueryErrorTest.java +++ b/integration-tests-ex/tools/src/main/java/org/apache/druid/testing/tools/ServerManagerForQueryErrorTest.java @@ -47,7 +47,6 @@ import org.apache.druid.query.SegmentDescriptor; import org.apache.druid.segment.ReferenceCountingSegment; import org.apache.druid.segment.SegmentReference; -import org.apache.druid.segment.join.JoinableFactoryWrapper; import org.apache.druid.server.SegmentManager; import org.apache.druid.server.coordination.ServerManager; import org.apache.druid.server.initialization.ServerConfig; @@ -98,7 +97,6 @@ public ServerManagerForQueryErrorTest( Cache cache, CacheConfig cacheConfig, SegmentManager segmentManager, - JoinableFactoryWrapper joinableFactoryWrapper, ServerConfig serverConfig ) { @@ -111,7 +109,6 @@ public ServerManagerForQueryErrorTest( cache, cacheConfig, segmentManager, - joinableFactoryWrapper, serverConfig ); } diff --git a/integration-tests/src/main/java/org/apache/druid/server/coordination/ServerManagerForQueryErrorTest.java b/integration-tests/src/main/java/org/apache/druid/server/coordination/ServerManagerForQueryErrorTest.java index 7b434667fa94..8bfceb544708 100644 --- a/integration-tests/src/main/java/org/apache/druid/server/coordination/ServerManagerForQueryErrorTest.java +++ b/integration-tests/src/main/java/org/apache/druid/server/coordination/ServerManagerForQueryErrorTest.java @@ -47,7 +47,6 @@ import org.apache.druid.query.SegmentDescriptor; import org.apache.druid.segment.ReferenceCountingSegment; import org.apache.druid.segment.SegmentReference; -import org.apache.druid.segment.join.JoinableFactoryWrapper; import org.apache.druid.server.SegmentManager; import org.apache.druid.server.initialization.ServerConfig; import org.apache.druid.timeline.VersionedIntervalTimeline; @@ -96,7 +95,6 @@ public ServerManagerForQueryErrorTest( Cache cache, CacheConfig cacheConfig, SegmentManager segmentManager, - JoinableFactoryWrapper joinableFactoryWrapper, ServerConfig serverConfig ) { @@ -109,7 +107,6 @@ public ServerManagerForQueryErrorTest( cache, cacheConfig, segmentManager, - joinableFactoryWrapper, serverConfig ); } diff --git a/server/src/main/java/org/apache/druid/server/coordination/ServerManager.java b/server/src/main/java/org/apache/druid/server/coordination/ServerManager.java index 4bee18091f24..3317375347db 100644 --- a/server/src/main/java/org/apache/druid/server/coordination/ServerManager.java +++ b/server/src/main/java/org/apache/druid/server/coordination/ServerManager.java @@ -59,7 +59,6 @@ import org.apache.druid.segment.ReferenceCountingSegment; import org.apache.druid.segment.SegmentReference; import org.apache.druid.segment.TimeBoundaryInspector; -import org.apache.druid.segment.join.JoinableFactoryWrapper; import org.apache.druid.server.ResourceIdPopulatingQueryRunner; import org.apache.druid.server.SegmentManager; import 
org.apache.druid.server.SetAndVerifyContextQueryRunner; @@ -90,7 +89,6 @@ public class ServerManager implements QuerySegmentWalker private final ObjectMapper objectMapper; private final CacheConfig cacheConfig; private final SegmentManager segmentManager; - private final JoinableFactoryWrapper joinableFactoryWrapper; private final ServerConfig serverConfig; @Inject @@ -103,7 +101,6 @@ public ServerManager( Cache cache, CacheConfig cacheConfig, SegmentManager segmentManager, - JoinableFactoryWrapper joinableFactoryWrapper, ServerConfig serverConfig ) { @@ -117,7 +114,6 @@ public ServerManager( this.cacheConfig = cacheConfig; this.segmentManager = segmentManager; - this.joinableFactoryWrapper = joinableFactoryWrapper; this.serverConfig = serverConfig; } diff --git a/server/src/test/java/org/apache/druid/server/coordination/ServerManagerTest.java b/server/src/test/java/org/apache/druid/server/coordination/ServerManagerTest.java index ec4a008c1ed7..69a7c08f2be9 100644 --- a/server/src/test/java/org/apache/druid/server/coordination/ServerManagerTest.java +++ b/server/src/test/java/org/apache/druid/server/coordination/ServerManagerTest.java @@ -77,7 +77,6 @@ import org.apache.druid.segment.Segment; import org.apache.druid.segment.TestHelper; import org.apache.druid.segment.TestIndex; -import org.apache.druid.segment.join.JoinableFactoryWrapperTest; import org.apache.druid.segment.loading.LeastBytesUsedStorageLocationSelectorStrategy; import org.apache.druid.segment.loading.SegmentLoaderConfig; import org.apache.druid.segment.loading.SegmentLoadingException; @@ -208,7 +207,6 @@ public > QueryRunnerFactory findFact new LocalCacheProvider().get(), new CacheConfig(), segmentManager, - JoinableFactoryWrapperTest.NOOP_JOINABLE_FACTORY_WRAPPER, new ServerConfig() ); From b86649daca69e6a1cdca8b30625602a62c015fe6 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Sun, 15 Sep 2024 23:03:26 -0700 Subject: [PATCH 26/47] Move TerminalStageSpecFactory packages. (#17049) * Move TerminalStageSpecFactory packages. These packages are moved from the "guice" package to the "indexing.destination" package. They make more sense here, since "guice" is reserved for Guice modules, annotations, and providers. * Rearrange imports. 
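As a point of reference for the package convention: a class in a "guice" package
should be pure wiring, along the lines of this hedged sketch of a plain Guice
module (ExampleBindingsModule and the Runnable binding are invented for
illustration, not taken from this patch):

  import com.google.inject.AbstractModule;

  public class ExampleBindingsModule extends AbstractModule
  {
    @Override
    protected void configure()
    {
      // Modules declare bindings; the bound implementations live elsewhere,
      // e.g. in a package like indexing.destination.
      bind(Runnable.class).toInstance(() -> {});
    }
  }

Factory classes with real behavior, like the TerminalStageSpecFactory
implementations moved here, are better placed next to the types they create.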
--- .../main/java/org/apache/druid/msq/guice/MSQSqlModule.java | 2 ++ .../destination}/MSQTerminalStageSpecFactory.java | 3 +-- .../SegmentGenerationTerminalStageSpecFactory.java | 4 +--- .../main/java/org/apache/druid/msq/sql/MSQTaskQueryMaker.java | 2 +- .../main/java/org/apache/druid/msq/sql/MSQTaskSqlEngine.java | 2 +- .../test/java/org/apache/druid/msq/exec/TestMSQSqlModule.java | 2 +- .../src/test/java/org/apache/druid/msq/test/MSQTestBase.java | 2 +- 7 files changed, 8 insertions(+), 9 deletions(-) rename extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/{guice => indexing/destination}/MSQTerminalStageSpecFactory.java (91%) rename extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/{guice => indexing/destination}/SegmentGenerationTerminalStageSpecFactory.java (87%) diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/MSQSqlModule.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/MSQSqlModule.java index 5d837940e19e..b7239980d52d 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/MSQSqlModule.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/MSQSqlModule.java @@ -27,6 +27,8 @@ import org.apache.druid.guice.annotations.LoadScope; import org.apache.druid.initialization.DruidModule; import org.apache.druid.metadata.input.InputSourceModule; +import org.apache.druid.msq.indexing.destination.MSQTerminalStageSpecFactory; +import org.apache.druid.msq.indexing.destination.SegmentGenerationTerminalStageSpecFactory; import org.apache.druid.msq.sql.MSQTaskSqlEngine; import org.apache.druid.sql.SqlStatementFactory; import org.apache.druid.sql.SqlToolbox; diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/MSQTerminalStageSpecFactory.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/destination/MSQTerminalStageSpecFactory.java similarity index 91% rename from extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/MSQTerminalStageSpecFactory.java rename to extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/destination/MSQTerminalStageSpecFactory.java index d5a1bd79455a..d7179f132255 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/MSQTerminalStageSpecFactory.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/destination/MSQTerminalStageSpecFactory.java @@ -17,9 +17,8 @@ * under the License. 
*/ -package org.apache.druid.msq.guice; +package org.apache.druid.msq.indexing.destination; -import org.apache.druid.msq.indexing.destination.TerminalStageSpec; import org.apache.druid.sql.calcite.planner.PlannerContext; import org.apache.druid.sql.calcite.rel.DruidQuery; diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/SegmentGenerationTerminalStageSpecFactory.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/destination/SegmentGenerationTerminalStageSpecFactory.java similarity index 87% rename from extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/SegmentGenerationTerminalStageSpecFactory.java rename to extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/destination/SegmentGenerationTerminalStageSpecFactory.java index f761c0616a6c..09c3c514ec05 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/SegmentGenerationTerminalStageSpecFactory.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/destination/SegmentGenerationTerminalStageSpecFactory.java @@ -17,10 +17,8 @@ * under the License. */ -package org.apache.druid.msq.guice; +package org.apache.druid.msq.indexing.destination; -import org.apache.druid.msq.indexing.destination.SegmentGenerationStageSpec; -import org.apache.druid.msq.indexing.destination.TerminalStageSpec; import org.apache.druid.sql.calcite.planner.PlannerContext; import org.apache.druid.sql.calcite.rel.DruidQuery; diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/sql/MSQTaskQueryMaker.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/sql/MSQTaskQueryMaker.java index 830fb87e1b2e..ae667a7a5585 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/sql/MSQTaskQueryMaker.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/sql/MSQTaskQueryMaker.java @@ -33,7 +33,6 @@ import org.apache.druid.java.util.common.granularity.Granularity; import org.apache.druid.java.util.common.guava.Sequences; import org.apache.druid.msq.exec.MSQTasks; -import org.apache.druid.msq.guice.MSQTerminalStageSpecFactory; import org.apache.druid.msq.indexing.MSQControllerTask; import org.apache.druid.msq.indexing.MSQSpec; import org.apache.druid.msq.indexing.MSQTuningConfig; @@ -42,6 +41,7 @@ import org.apache.druid.msq.indexing.destination.ExportMSQDestination; import org.apache.druid.msq.indexing.destination.MSQDestination; import org.apache.druid.msq.indexing.destination.MSQSelectDestination; +import org.apache.druid.msq.indexing.destination.MSQTerminalStageSpecFactory; import org.apache.druid.msq.indexing.destination.TaskReportMSQDestination; import org.apache.druid.msq.util.MSQTaskQueryMakerUtils; import org.apache.druid.msq.util.MultiStageQueryContext; diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/sql/MSQTaskSqlEngine.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/sql/MSQTaskSqlEngine.java index 9e07a909f4e5..bdebe32a16fb 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/sql/MSQTaskSqlEngine.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/sql/MSQTaskSqlEngine.java @@ -42,7 +42,7 @@ import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.granularity.Granularity; 
-import org.apache.druid.msq.guice.MSQTerminalStageSpecFactory; +import org.apache.druid.msq.indexing.destination.MSQTerminalStageSpecFactory; import org.apache.druid.msq.querykit.QueryKitUtils; import org.apache.druid.msq.util.ArrayIngestMode; import org.apache.druid.msq.util.DimensionSchemaUtils; diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/TestMSQSqlModule.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/TestMSQSqlModule.java index 5bf4bbd44dc4..0b48d2904dd3 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/TestMSQSqlModule.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/TestMSQSqlModule.java @@ -26,7 +26,7 @@ import org.apache.druid.guice.LazySingleton; import org.apache.druid.initialization.ServerInjectorBuilderTest.TestDruidModule; import org.apache.druid.msq.guice.MultiStageQuery; -import org.apache.druid.msq.guice.SegmentGenerationTerminalStageSpecFactory; +import org.apache.druid.msq.indexing.destination.SegmentGenerationTerminalStageSpecFactory; import org.apache.druid.msq.sql.MSQTaskSqlEngine; import org.apache.druid.msq.test.MSQTestBase; import org.apache.druid.msq.test.MSQTestOverlordServiceClient; diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestBase.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestBase.java index be05a0fcc8a9..e5e3ba68e44a 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestBase.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/test/MSQTestBase.java @@ -92,12 +92,12 @@ import org.apache.druid.msq.guice.MSQIndexingModule; import org.apache.druid.msq.guice.MSQSqlModule; import org.apache.druid.msq.guice.MultiStageQuery; -import org.apache.druid.msq.guice.SegmentGenerationTerminalStageSpecFactory; import org.apache.druid.msq.indexing.InputChannelFactory; import org.apache.druid.msq.indexing.MSQControllerTask; import org.apache.druid.msq.indexing.MSQSpec; import org.apache.druid.msq.indexing.MSQTuningConfig; import org.apache.druid.msq.indexing.destination.DataSourceMSQDestination; +import org.apache.druid.msq.indexing.destination.SegmentGenerationTerminalStageSpecFactory; import org.apache.druid.msq.indexing.destination.TaskReportMSQDestination; import org.apache.druid.msq.indexing.error.InsertLockPreemptedFaultTest; import org.apache.druid.msq.indexing.error.MSQErrorReport; From bdd3b76eb61e0cea5287b7567b6677869be0b48e Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Sun, 15 Sep 2024 23:52:32 -0700 Subject: [PATCH 27/47] Additional tests for ChannelStageOutputReader. (#17050) The existing tests are moved into a "WithMaximalBuffering" subclass, and a new "WithMinimalBuffering" subclass is added to test cases where only a single frame is buffered. 
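The single-frame case behaves like a capacity-one buffer: after one write,
further writes fail until the reader drains the slot (the real channel throws
IllegalStateException starting with "Channel has no capacity", as the new tests
below check). A rough stand-in in plain Java, using ArrayBlockingQueue as an
analogy rather than the actual BlockingQueueFrameChannel API:

  import java.util.concurrent.ArrayBlockingQueue;

  public class MinimalBufferSketch
  {
    public static void main(String[] args)
    {
      final ArrayBlockingQueue<String> buffer = new ArrayBlockingQueue<>(1);

      System.out.println(buffer.offer("frame-1")); // true: the single slot was free
      System.out.println(buffer.offer("frame-2")); // false: no capacity until a read happens
      System.out.println(buffer.poll());           // frame-1: the reader drains the slot
      System.out.println(buffer.offer("frame-2")); // true: the write succeeds after the read
    }
  }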
--- .../output/ChannelStageOutputReaderTest.java | 514 +++++++++++++----- 1 file changed, 368 insertions(+), 146 deletions(-) diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/shuffle/output/ChannelStageOutputReaderTest.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/shuffle/output/ChannelStageOutputReaderTest.java index 0095a8fdb3ae..d0410897a07e 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/shuffle/output/ChannelStageOutputReaderTest.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/shuffle/output/ChannelStageOutputReaderTest.java @@ -19,8 +19,10 @@ package org.apache.druid.msq.shuffle.output; +import com.google.common.collect.Iterables; import com.google.common.io.ByteStreams; import com.google.common.math.IntMath; +import com.google.common.util.concurrent.ListenableFuture; import org.apache.druid.common.guava.FutureUtils; import org.apache.druid.frame.Frame; import org.apache.druid.frame.FrameType; @@ -30,10 +32,12 @@ import org.apache.druid.frame.read.FrameReader; import org.apache.druid.frame.testutil.FrameSequenceBuilder; import org.apache.druid.frame.testutil.FrameTestUtil; +import org.apache.druid.segment.QueryableIndexCursorFactory; import org.apache.druid.segment.TestIndex; import org.apache.druid.segment.incremental.IncrementalIndex; import org.apache.druid.segment.incremental.IncrementalIndexCursorFactory; import org.apache.druid.testing.InitializedNullHandlingTest; +import org.apache.druid.utils.CloseableUtils; import org.hamcrest.CoreMatchers; import org.hamcrest.MatcherAssert; import org.hamcrest.Matchers; @@ -50,206 +54,424 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.math.RoundingMode; +import java.nio.file.Files; import java.util.List; public class ChannelStageOutputReaderTest extends InitializedNullHandlingTest { - private static final int MAX_FRAMES = 10; - private static final int EXPECTED_NUM_ROWS = 1209; + /** + * Tests that use {@link BlockingQueueFrameChannel#minimal()}. 
+ */ + public static class WithMinimalBuffering extends InitializedNullHandlingTest + { + private final Frame frame = Iterables.getOnlyElement( + FrameSequenceBuilder + .fromCursorFactory(new QueryableIndexCursorFactory(TestIndex.getNoRollupMMappedTestIndex())) + .frameType(FrameType.ROW_BASED) + .frames() + .toList() + ); - private final BlockingQueueFrameChannel channel = new BlockingQueueFrameChannel(MAX_FRAMES); - private final ChannelStageOutputReader reader = new ChannelStageOutputReader(channel.readable()); + @Rule + public TemporaryFolder temporaryFolder = new TemporaryFolder(); + + private BlockingQueueFrameChannel channel; + private ChannelStageOutputReader channelReader; + private File tmpFile; + private OutputStream tmpOut; + private FrameFile tmpFrameFile; + + // Variables used by doRead() + private long offset; + private ListenableFuture nextRead; + + @Before + public void setUp() throws Exception + { + channel = BlockingQueueFrameChannel.minimal(); + channelReader = new ChannelStageOutputReader(channel.readable()); + tmpFile = temporaryFolder.newFile(); + tmpOut = Files.newOutputStream(tmpFile.toPath()); + } - @Rule - public TemporaryFolder temporaryFolder = new TemporaryFolder(); + @After + public void tearDown() throws Exception + { + CloseableUtils.closeAll(tmpOut, tmpFrameFile); + } - private FrameReader frameReader; - private List frameList; + @Test + public void test_remote_empty() throws Exception + { + // Close without writing anything. + channel.writable().close(); - @Before - public void setUp() - { - final IncrementalIndex index = TestIndex.getIncrementalTestIndex(); - final IncrementalIndexCursorFactory cursorFactory = new IncrementalIndexCursorFactory(index); - frameReader = FrameReader.create(cursorFactory.getRowSignature()); - frameList = FrameSequenceBuilder.fromCursorFactory(cursorFactory) - .frameType(FrameType.ROW_BASED) - .maxRowsPerFrame(IntMath.divide(index.size(), MAX_FRAMES, RoundingMode.CEILING)) - .frames() - .toList(); - } + while (doRead(-1)) { + // Do nothing, just keep reading. + } - @After - public void tearDown() - { - reader.close(); - } + Assert.assertEquals(0, tmpFrameFile.numFrames()); + } - @Test - public void test_readLocally() throws IOException - { - writeAllFramesToChannel(); + @Test + public void test_remote_oneFrame() throws Exception + { + // Close after writing one frame. + channel.writable().write(frame); + channel.writable().close(); - Assert.assertSame(channel.readable(), reader.readLocally()); - reader.close(); // Won't close the channel, because it's already been returned by readLocally + while (doRead(-1)) { + // Do nothing, just keep reading. + } - final int numRows = FrameTestUtil.readRowsFromFrameChannel(channel.readable(), frameReader).toList().size(); - Assert.assertEquals(EXPECTED_NUM_ROWS, numRows); - } + Assert.assertEquals(1, tmpFrameFile.numFrames()); + Assert.assertEquals(frame.numBytes(), tmpFrameFile.frame(0).numBytes()); + } - @Test - public void test_readLocally_closePriorToRead() throws IOException - { - writeAllFramesToChannel(); + @Test + public void test_remote_oneFrame_writeAfterFirstRead() throws Exception + { + Assert.assertTrue(doRead(-1)); - reader.close(); + // Close after writing one frame. + channel.writable().write(frame); + channel.writable().close(); - // Can't read the channel after closing the reader - Assert.assertThrows( - IllegalStateException.class, - reader::readLocally - ); - } + while (doRead(-1)) { + // Do nothing, just keep reading. 
+ } - @Test - public void test_readLocally_thenReadRemotely() throws IOException - { - writeAllFramesToChannel(); + Assert.assertEquals(1, tmpFrameFile.numFrames()); + Assert.assertEquals(frame.numBytes(), tmpFrameFile.frame(0).numBytes()); + } + + @Test + public void test_remote_oneFrame_readOneByteAtATime() throws Exception + { + // Close after writing one frame. + channel.writable().write(frame); + channel.writable().close(); - Assert.assertSame(channel.readable(), reader.readLocally()); + while (doRead(1)) { + // Do nothing, just keep reading. + } - // Can't read remotely after reading locally - Assert.assertThrows( - IllegalStateException.class, - () -> reader.readRemotelyFrom(0) - ); + Assert.assertEquals(1, tmpFrameFile.numFrames()); + Assert.assertEquals(frame.numBytes(), tmpFrameFile.frame(0).numBytes()); + } - // Can still read locally after this error - final int numRows = FrameTestUtil.readRowsFromFrameChannel(channel.readable(), frameReader).toList().size(); - Assert.assertEquals(EXPECTED_NUM_ROWS, numRows); - } + @Test + public void test_remote_threeFrames_readOneByteAtATime() throws Exception + { + // Write one frame. + channel.writable().write(frame); - @Test - public void test_readRemotely_strideBasedOnReturnedChunk() throws IOException - { - // Test that reads entire chunks from readRemotelyFrom. This is a typical usage pattern. + // See that we can't write another frame. + final IllegalStateException e = Assert.assertThrows( + IllegalStateException.class, + () -> channel.writable().write(frame) + ); - writeAllFramesToChannel(); + MatcherAssert.assertThat( + e, + ThrowableMessageMatcher.hasMessage(CoreMatchers.startsWith("Channel has no capacity")) + ); - final File tmpFile = temporaryFolder.newFile(); + // Read the first frame until we start blocking. + while (nextRead == null) { + Assert.assertTrue(doRead(1)); + } - try (final FileOutputStream tmpOut = new FileOutputStream(tmpFile)) { - int numReads = 0; - long offset = 0; + // Write the next frame. + Assert.assertFalse(nextRead.isDone()); + channel.writable().write(frame); - while (true) { - try (final InputStream in = FutureUtils.getUnchecked(reader.readRemotelyFrom(offset), true)) { - numReads++; - final long bytesWritten = ByteStreams.copy(in, tmpOut); - offset += bytesWritten; + // This write would have unblocked nextRead, which will now be done. + Assert.assertTrue(nextRead.isDone()); - if (bytesWritten == 0) { - break; - } - } + // Write a third frame. + channel.writable().write(frame); + + // See that we can't write a fourth frame. + final IllegalStateException e2 = Assert.assertThrows( + IllegalStateException.class, + () -> channel.writable().write(frame) + ); + + MatcherAssert.assertThat( + e2, + ThrowableMessageMatcher.hasMessage(CoreMatchers.startsWith("Channel has no capacity")) + ); + + // And read until we start blocking. + while (nextRead == null) { + Assert.assertTrue(doRead(1)); + } + + // Close. + channel.writable().close(); + + // Read until end of stream. + while (doRead(1)) { + // Just keep looping. 
} - MatcherAssert.assertThat(numReads, Matchers.greaterThan(1)); + Assert.assertEquals(3, tmpFrameFile.numFrames()); + Assert.assertEquals(frame.numBytes(), tmpFrameFile.frame(0).numBytes()); + Assert.assertEquals(frame.numBytes(), tmpFrameFile.frame(1).numBytes()); + Assert.assertEquals(frame.numBytes(), tmpFrameFile.frame(2).numBytes()); } - final FrameFile frameFile = FrameFile.open(tmpFile, null); - final int numRows = - FrameTestUtil.readRowsFromFrameChannel(new ReadableFileFrameChannel(frameFile), frameReader).toList().size(); + /** + * Do the next read operation. + * + * @return false if done reading, true if there's more to read + */ + private boolean doRead(final long limit) throws IOException + { + if (nextRead == null) { + nextRead = channelReader.readRemotelyFrom(offset); + } + + if (nextRead.isDone()) { + try (final InputStream in = FutureUtils.getUncheckedImmediately(nextRead)) { + nextRead = null; + long readSize = 0; + + if (limit == -1) { + // Unlimited + readSize = ByteStreams.copy(in, tmpOut); + } else { + // Limited + while (readSize < limit) { + final int r = in.read(); + if (r != -1) { + readSize++; + tmpOut.write(r); + } else { + break; + } + } + } + + offset += readSize; + + if (readSize == 0) { + channel.readable().close(); + tmpOut.close(); + tmpFrameFile = FrameFile.open(tmpFile, null); + return false; + } + } + } - Assert.assertEquals(EXPECTED_NUM_ROWS, numRows); + return true; + } } - @Test - public void test_readRemotely_strideOneByte() throws IOException + /** + * Tests that use {@link BlockingQueueFrameChannel} that is fully buffered. + */ + public static class WithMaximalBuffering extends InitializedNullHandlingTest { - // Test that reads one byte at a time from readRemotelyFrom. This helps ensure that there are no edge cases - // in the chunk-reading logic. 
+ private static final int MAX_FRAMES = 10; + private static final int EXPECTED_NUM_ROWS = 1209; + + private final BlockingQueueFrameChannel channel = new BlockingQueueFrameChannel(MAX_FRAMES); + private final ChannelStageOutputReader reader = new ChannelStageOutputReader(channel.readable()); + + @Rule + public TemporaryFolder temporaryFolder = new TemporaryFolder(); + + private FrameReader frameReader; + private List frameList; + + @Before + public void setUp() + { + final IncrementalIndex index = TestIndex.getIncrementalTestIndex(); + final IncrementalIndexCursorFactory adapter = new IncrementalIndexCursorFactory(index); + frameReader = FrameReader.create(adapter.getRowSignature()); + frameList = FrameSequenceBuilder.fromCursorFactory(adapter) + .frameType(FrameType.ROW_BASED) + .maxRowsPerFrame(IntMath.divide(index.size(), MAX_FRAMES, RoundingMode.CEILING)) + .frames() + .toList(); + } + + @After + public void tearDown() + { + reader.close(); + } - writeAllFramesToChannel(); + @Test + public void test_readLocally() throws IOException + { + writeAllFramesToChannel(); - final File tmpFile = temporaryFolder.newFile(); + Assert.assertSame(channel.readable(), reader.readLocally()); + reader.close(); // Won't close the channel, because it's already been returned by readLocally - try (final FileOutputStream tmpOut = new FileOutputStream(tmpFile)) { - int numReads = 0; - long offset = 0; + final int numRows = FrameTestUtil.readRowsFromFrameChannel(channel.readable(), frameReader).toList().size(); + Assert.assertEquals(EXPECTED_NUM_ROWS, numRows); + } - while (true) { - try (final InputStream in = FutureUtils.getUnchecked(reader.readRemotelyFrom(offset), true)) { - numReads++; - final int nextByte = in.read(); + @Test + public void test_readLocally_closePriorToRead() throws IOException + { + writeAllFramesToChannel(); - if (nextByte < 0) { - break; - } + reader.close(); + + // Can't read the channel after closing the reader + Assert.assertThrows( + IllegalStateException.class, + reader::readLocally + ); + } + + @Test + public void test_readLocally_thenReadRemotely() throws IOException + { + writeAllFramesToChannel(); - tmpOut.write(nextByte); - offset++; + Assert.assertSame(channel.readable(), reader.readLocally()); + + // Can't read remotely after reading locally + Assert.assertThrows( + IllegalStateException.class, + () -> reader.readRemotelyFrom(0) + ); + + // Can still read locally after this error + final int numRows = FrameTestUtil.readRowsFromFrameChannel(channel.readable(), frameReader).toList().size(); + Assert.assertEquals(EXPECTED_NUM_ROWS, numRows); + } + + @Test + public void test_readRemotely_strideBasedOnReturnedChunk() throws IOException + { + // Test that reads entire chunks from readRemotelyFrom. This is a typical usage pattern. 
+ + writeAllFramesToChannel(); + + final File tmpFile = temporaryFolder.newFile(); + + try (final FileOutputStream tmpOut = new FileOutputStream(tmpFile)) { + int numReads = 0; + long offset = 0; + + while (true) { + try (final InputStream in = FutureUtils.getUnchecked(reader.readRemotelyFrom(offset), true)) { + numReads++; + final long bytesWritten = ByteStreams.copy(in, tmpOut); + offset += bytesWritten; + + if (bytesWritten == 0) { + break; + } + } } + + MatcherAssert.assertThat(numReads, Matchers.greaterThan(1)); } - Assert.assertEquals(numReads, offset + 1); + final FrameFile frameFile = FrameFile.open(tmpFile, null); + final int numRows = + FrameTestUtil.readRowsFromFrameChannel(new ReadableFileFrameChannel(frameFile), frameReader).toList().size(); + + Assert.assertEquals(EXPECTED_NUM_ROWS, numRows); } - final FrameFile frameFile = FrameFile.open(tmpFile, null); - final int numRows = - FrameTestUtil.readRowsFromFrameChannel(new ReadableFileFrameChannel(frameFile), frameReader).toList().size(); + @Test + public void test_readRemotely_strideOneByte() throws IOException + { + // Test that reads one byte at a time from readRemotelyFrom. This helps ensure that there are no edge cases + // in the chunk-reading logic. - Assert.assertEquals(EXPECTED_NUM_ROWS, numRows); - } + writeAllFramesToChannel(); - @Test - public void test_readRemotely_thenLocally() throws IOException - { - writeAllFramesToChannel(); + final File tmpFile = temporaryFolder.newFile(); - // Read remotely - FutureUtils.getUnchecked(reader.readRemotelyFrom(0), true); + try (final OutputStream tmpOut = Files.newOutputStream(tmpFile.toPath())) { + int numReads = 0; + long offset = 0; - // Then read locally - Assert.assertThrows( - IllegalStateException.class, - reader::readLocally - ); - } + while (true) { + try (final InputStream in = FutureUtils.getUnchecked(reader.readRemotelyFrom(offset), true)) { + numReads++; + final int nextByte = in.read(); - @Test - public void test_readRemotely_cannotReverse() throws IOException - { - writeAllFramesToChannel(); + if (nextByte < 0) { + break; + } - // Read remotely from offset = 1. - final InputStream in = FutureUtils.getUnchecked(reader.readRemotelyFrom(1), true); - final int offset = ByteStreams.toByteArray(in).length; - MatcherAssert.assertThat(offset, Matchers.greaterThan(0)); + tmpOut.write(nextByte); + offset++; + } + } - // Then read again from offset = 0; should get an error. 
- final RuntimeException e = Assert.assertThrows( - RuntimeException.class, - () -> FutureUtils.getUnchecked(reader.readRemotelyFrom(0), true) - ); + Assert.assertEquals(numReads, offset + 1); + } - MatcherAssert.assertThat( - e, - ThrowableCauseMatcher.hasCause( - Matchers.allOf( - CoreMatchers.instanceOf(IllegalStateException.class), - ThrowableMessageMatcher.hasMessage(CoreMatchers.startsWith("Offset[0] no longer available")) - ) - ) - ); - } + final FrameFile frameFile = FrameFile.open(tmpFile, null); + final int numRows = + FrameTestUtil.readRowsFromFrameChannel(new ReadableFileFrameChannel(frameFile), frameReader).toList().size(); - private void writeAllFramesToChannel() throws IOException - { - for (Frame frame : frameList) { - channel.writable().write(frame); + Assert.assertEquals(EXPECTED_NUM_ROWS, numRows); + } + + @Test + public void test_readRemotely_thenLocally() throws IOException + { + writeAllFramesToChannel(); + + // Read remotely + FutureUtils.getUnchecked(reader.readRemotelyFrom(0), true); + + // Then read locally + Assert.assertThrows( + IllegalStateException.class, + reader::readLocally + ); + } + + @Test + public void test_readRemotely_cannotReverse() throws IOException + { + writeAllFramesToChannel(); + + // Read remotely from offset = 1. + final InputStream in = FutureUtils.getUnchecked(reader.readRemotelyFrom(1), true); + final int offset = ByteStreams.toByteArray(in).length; + MatcherAssert.assertThat(offset, Matchers.greaterThan(0)); + + // Then read again from offset = 0; should get an error. + final RuntimeException e = Assert.assertThrows( + RuntimeException.class, + () -> FutureUtils.getUnchecked(reader.readRemotelyFrom(0), true) + ); + + MatcherAssert.assertThat( + e, + ThrowableCauseMatcher.hasCause( + Matchers.allOf( + CoreMatchers.instanceOf(IllegalStateException.class), + ThrowableMessageMatcher.hasMessage(CoreMatchers.startsWith("Offset[0] no longer available")) + ) + ) + ); + } + + private void writeAllFramesToChannel() throws IOException + { + for (Frame frame : frameList) { + channel.writable().write(frame); + } + channel.writable().close(); } - channel.writable().close(); } } From ed91daa82b058a12d94775b9c69acfb6432ef912 Mon Sep 17 00:00:00 2001 From: Misha Date: Mon, 16 Sep 2024 12:40:25 +0200 Subject: [PATCH 28/47] Fix low sonatype findings (#17017) Fixed vulnerabilities CVE-2021-26291 : Apache Maven is vulnerable to Man-in-the-Middle (MitM) attacks. Various functions across several files, mentioned below, allow for custom repositories to use the insecure HTTP protocol. An attacker can exploit this as part of a Man-in-the-Middle (MitM) attack, taking over or impersonating a repository using the insecure HTTP protocol. Unsuspecting users may then have the compromised repository defined as a dependency in their Project Object Model (pom) file and download potentially malicious files from it. Was fixed by removing outdated tesla-aether library containing vulnerable maven-settings (v3.1.1) package, pull-deps utility updated to use maven resolver instead. sonatype-2020-0244 : The joni package is vulnerable to Man-in-the-Middle (MitM) attacks. This project downloads dependencies over HTTP due to an insecure repository configuration within the .pom file. Consequently, a MitM could intercept requests to the specified repository and replace the requested dependencies with malicious versions, which can execute arbitrary code from the application that was built with them. 
Was fixed by upgrading joni package to recommended 2.1.34 version --- indexing-service/pom.xml | 13 +- .../sampler/InputSourceSamplerTest.java | 2 +- licenses.yaml | 110 ++++-- pom.xml | 5 + server/pom.xml | 19 +- services/pom.xml | 54 ++- .../apache/druid/cli/PullDependencies.java | 360 ++++++------------ .../druid/cli/PullDependenciesTest.java | 247 ++++++++++-- 8 files changed, 491 insertions(+), 319 deletions(-) diff --git a/indexing-service/pom.xml b/indexing-service/pom.xml index c851c6ef98fd..a9479d7db082 100644 --- a/indexing-service/pom.xml +++ b/indexing-service/pom.xml @@ -189,10 +189,6 @@ datasketches-memory provided - - net.thisptr - jackson-jq - org.codehaus.jackson jackson-core-asl @@ -203,10 +199,6 @@ commons-collections4 provided - - org.eclipse.aether - aether-api - junit @@ -267,6 +259,11 @@ mockito-core test + + org.apache.maven.resolver + maven-resolver-api + 1.3.1 + diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/InputSourceSamplerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/InputSourceSamplerTest.java index 80d88e0be17c..690b74e347e0 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/InputSourceSamplerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/InputSourceSamplerTest.java @@ -23,7 +23,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; -import net.thisptr.jackson.jq.internal.misc.Lists; +import com.google.common.collect.Lists; import org.apache.druid.client.indexing.SamplerResponse; import org.apache.druid.client.indexing.SamplerResponse.SamplerResponseRow; import org.apache.druid.data.input.InputFormat; diff --git a/licenses.yaml b/licenses.yaml index c4e2fa52300b..0ecffdb0a0ad 100644 --- a/licenses.yaml +++ b/licenses.yaml @@ -1837,33 +1837,38 @@ name: Apache Maven license_category: binary module: java-core license_name: Apache License version 2.0 -version: 3.1.1 +version: 3.6.0 +libraries: + - org.apache.maven: maven-repository-metadata + - org.apache.maven: maven-builder-support +notices: + - maven-repository-metadata: | + Maven Repository Metadata Model + Copyright 2001-2018 The Apache Software Foundation + - maven-builder-support: | + Maven Builder Support + Copyright 2001-2018 The Apache Software Foundation +--- + +name: Maven Artifact Resolver Provider +license_category: binary +module: java-core +license_name: Apache License version 2.0 +version: 3.6.0 libraries: - - org.apache.maven: maven-aether-provider + - org.apache.maven: maven-resolver-provider - org.apache.maven: maven-model - org.apache.maven: maven-model-builder - - org.apache.maven: maven-repository-metadata - - org.apache.maven: maven-settings - - org.apache.maven: maven-settings-builder notices: - - maven-aether-provider: | - Maven Aether Provider - Copyright 2001-2013 The Apache Software Foundation + - maven-resolver-provider: | + Maven Artifact Resolver Provider + Copyright 2001-2018 The Apache Software Foundation - maven-model: | Maven Model - Copyright 2001-2013 The Apache Software Foundation + Copyright 2001-2018 The Apache Software Foundation - maven-model-builder: | Maven Model Builder - Copyright 2001-2013 The Apache Software Foundation - - maven-repository-metadata: | - Maven Repository Metadata Model - Copyright 2001-2013 The Apache Software Foundation - - maven-settings: | - Maven Settings - Copyright 2001-2013 The Apache Software 
Foundation - - maven-settings-builder: | - Maven Settings Builder - Copyright 2001-2013 The Apache Software Foundation + Copyright 2001-2018 The Apache Software Foundation --- name: Apache Maven Artifact @@ -1879,6 +1884,67 @@ notices: Copyright 2001-2018 The Apache Software Foundation --- +name: Maven Artifact Resolver Connector Basic +license_category: binary +module: java-core +license_name: Apache License version 2.0 +version: 1.3.1 +libraries: + - org.apache.maven.resolver: maven-resolver-connector-basic + - org.apache.maven.resolver: maven-resolver-spi + - org.apache.maven.resolver: maven-resolver-api + - org.apache.maven.resolver: maven-resolver-util +notices: + - maven-resolver-connector-basic: | + Maven Artifact Resolver Connector Basic + Copyright 2001-2018 The Apache Software Foundation + - maven-resolver-spi: | + Maven Artifact Resolver SPI + Copyright 2001-2018 The Apache Software Foundation + - maven-resolver-api: | + Maven Artifact Resolver API + Copyright 2001-2018 The Apache Software Foundation + - maven-resolver-util: | + Maven Artifact Resolver Utilities + Copyright 2001-2018 The Apache Software Foundation +--- + +name: Maven Artifact Resolver Transport HTTP +license_category: binary +module: java-core +license_name: Apache License version 2.0 +version: 1.3.1 +libraries: + - org.apache.maven.resolver: maven-resolver-transport-http +notices: + - maven-resolver-transport-http: | + Maven Artifact Resolver Transport HTTP + Copyright 2001-2018 The Apache Software Foundation + +--- + +name: Maven Artifact Resolver Implementation +license_category: binary +module: java-core +license_name: Apache License version 2.0 +version: 1.3.1 +libraries: + - org.apache.maven.resolver: maven-resolver-impl +notices: + - maven-resolver-impl: | + Maven Artifact Resolver Implementation + Copyright 2001-2018 The Apache Software Foundation +--- + +name: Plexus Component Annotations +license_category: binary +module: java-core +license_name: Apache License version 2.0 +version: 1.7.1 +libraries: + - org.codehaus.plexus: plexus-component-annotations +--- + name: Apache Maven Wagon API license_category: binary module: java-core @@ -1967,7 +2033,7 @@ name: Plexus Interpolation API license_category: binary module: java-core license_name: Apache License version 2.0 -version: 1.19 +version: 1.25 libraries: - org.codehaus.plexus: plexus-interpolation @@ -3245,7 +3311,7 @@ name: JCodings license_category: binary module: java-core license_name: MIT License -version: 1.0.43 +version: 1.0.50 copyright: JRuby Team license_file_path: licenses/bin/jcodings.MIT libraries: @@ -3257,7 +3323,7 @@ name: Joni license_category: binary module: java-core license_name: MIT License -version: 2.1.27 +version: 2.1.34 copyright: JRuby Team license_file_path: licenses/bin/joni.MIT libraries: diff --git a/pom.xml b/pom.xml index d8e96ed859dd..42f7333ea2c9 100644 --- a/pom.xml +++ b/pom.xml @@ -1363,6 +1363,11 @@ + + org.jruby.joni + joni + 2.1.34 + diff --git a/server/pom.xml b/server/pom.xml index d1954adc319a..6cfeee55f57c 100644 --- a/server/pom.xml +++ b/server/pom.xml @@ -140,8 +140,19 @@ jsr305 - io.tesla.aether - tesla-aether + org.apache.maven.resolver + maven-resolver-connector-basic + 1.3.1 + + + org.apache.maven.resolver + maven-resolver-transport-http + 1.3.1 + + + org.apache.maven + maven-resolver-provider + 3.6.0 com.amazonaws @@ -500,7 +511,9 @@ - io.tesla.aether:tesla-aether + org.apache.maven:maven-resolver-provider + org.apache.maven.resolver:maven-resolver-transport-http + 
org.apache.maven.resolver:maven-resolver-connector-basic org.xerial.snappy:snappy-java diff --git a/services/pom.xml b/services/pom.xml index 2f8b70bd88e2..83c109b5abaf 100644 --- a/services/pom.xml +++ b/services/pom.xml @@ -56,6 +56,12 @@ org.apache.druid druid-indexing-service ${project.parent.version} + + + org.eclipse.aether + aether-api + + org.apache.druid @@ -171,10 +177,6 @@ io.netty netty-common - - org.eclipse.aether - aether-api - javax.servlet javax.servlet-api @@ -187,17 +189,49 @@ com.google.guava guava - - org.eclipse.aether - aether-util - com.google.inject.extensions guice-servlet - io.tesla.aether - tesla-aether + org.apache.maven.resolver + maven-resolver-connector-basic + 1.3.1 + + + org.apache.maven.resolver + maven-resolver-transport-http + 1.3.1 + + + org.apache.maven.resolver + maven-resolver-util + 1.3.1 + + + org.apache.maven.resolver + maven-resolver-impl + 1.3.1 + + + org.apache.maven.resolver + maven-resolver-spi + 1.3.1 + + + org.apache.maven.resolver + maven-resolver-api + 1.3.1 + + + org.apache.maven + maven-artifact + 3.6.0 + + + org.apache.maven + maven-resolver-provider + 3.6.0 javax.xml.bind diff --git a/services/src/main/java/org/apache/druid/cli/PullDependencies.java b/services/src/main/java/org/apache/druid/cli/PullDependencies.java index 6ea8626ba7e7..f0588d2b372d 100644 --- a/services/src/main/java/org/apache/druid/cli/PullDependencies.java +++ b/services/src/main/java/org/apache/druid/cli/PullDependencies.java @@ -22,43 +22,45 @@ import com.github.rvesse.airline.annotations.Command; import com.github.rvesse.airline.annotations.Option; import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Strings; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSetMultimap; import com.google.common.collect.SetMultimap; import com.google.inject.Inject; -import io.netty.util.SuppressForbidden; -import io.tesla.aether.Repository; -import io.tesla.aether.TeslaAether; -import io.tesla.aether.guice.RepositorySystemSessionProvider; -import io.tesla.aether.internal.DefaultTeslaAether; import org.apache.druid.guice.ExtensionsConfig; import org.apache.druid.indexing.common.config.TaskConfig; import org.apache.druid.java.util.common.FileUtils; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.logger.Logger; +import org.apache.maven.artifact.resolver.ArtifactNotFoundException; +import org.apache.maven.repository.internal.MavenRepositorySystemUtils; +import org.eclipse.aether.DefaultRepositorySystemSession; +import org.eclipse.aether.RepositorySystem; import org.eclipse.aether.RepositorySystemSession; import org.eclipse.aether.artifact.Artifact; import org.eclipse.aether.artifact.DefaultArtifact; import org.eclipse.aether.collection.CollectRequest; +import org.eclipse.aether.connector.basic.BasicRepositoryConnectorFactory; import org.eclipse.aether.graph.Dependency; import org.eclipse.aether.graph.DependencyNode; -import org.eclipse.aether.repository.Authentication; +import org.eclipse.aether.impl.DefaultServiceLocator; +import org.eclipse.aether.repository.LocalRepository; import org.eclipse.aether.repository.Proxy; import org.eclipse.aether.repository.RemoteRepository; +import org.eclipse.aether.resolution.ArtifactResult; import org.eclipse.aether.resolution.DependencyRequest; +import org.eclipse.aether.resolution.DependencyResolutionException; +import org.eclipse.aether.resolution.DependencyResult; +import 
org.eclipse.aether.spi.connector.RepositoryConnectorFactory; +import org.eclipse.aether.spi.connector.transport.TransporterFactory; +import org.eclipse.aether.transport.http.HttpTransporterFactory; import org.eclipse.aether.util.artifact.JavaScopes; import org.eclipse.aether.util.filter.DependencyFilterUtils; import org.eclipse.aether.util.repository.AuthenticationBuilder; +import org.eclipse.aether.util.repository.DefaultProxySelector; import java.io.File; import java.io.IOException; -import java.io.OutputStream; -import java.io.PrintStream; -import java.io.UnsupportedEncodingException; -import java.net.URI; -import java.net.URISyntaxException; import java.util.ArrayList; import java.util.List; import java.util.Set; @@ -86,72 +88,6 @@ public class PullDependencies implements Runnable .put("com.fasterxml.jackson.core", "jackson-core") .put("com.fasterxml.jackson.core", "jackson-annotations") .build(); - /* - It is possible that extensions will pull down a lot of jars that are either - duplicates OR conflict with druid jars. In that case, there are two problems that arise - - 1. Large quantity of jars are passed around to things like hadoop when they are not needed (and should not be included) - 2. Classpath priority becomes "mostly correct" and attempted to enforced correctly, but not fully tested - - These jar groups should be included by druid and *not* pulled down in extensions - Note to future developers: This list is hand-crafted and will probably be out of date in the future - A good way to know where to look for errant dependencies is to compare the lib/ directory in the distribution - tarball with the jars included in the extension directories. - - This list is best-effort, and might still pull down more than desired. - - A simple example is that if an extension's dependency uses some-library-123.jar, - druid uses some-library-456.jar, and hadoop uses some-library-666.jar, then we probably want to use some-library-456.jar, - so don't pull down some-library-123.jar, and ask hadoop to load some-library-456.jar. - - In the case where some-library is NOT on this list, both some-library-456.jar and some-library-123.jar will be - on the class path and propagated around the system. Most places TRY to make sure some-library-456.jar has - precedence, but it is easy for this assumption to be violated and for the precedence of some-library-456.jar, - some-library-123.jar and some-library-456.jar to not be properly defined. - - As of this writing there are no special unit tests for classloader issues and library version conflicts. - - Different tasks which are classloader sensitive attempt to maintain a sane order for loading libraries in the - classloader, but it is always possible that something didn't load in the right order. Also we don't want to be - throwing around a ton of jars we don't need to. - - Here is a list of dependencies extensions should probably exclude. 
- - Conflicts can be discovered using the following command on the distribution tarball: - `find lib -iname "*.jar" | cut -d / -f 2 | sed -e 's/-[0-9]\.[0-9]/@/' | cut -f 1 -d @ | sort | uniq | xargs -I {} find extensions -name "*{}*.jar" | sort` - - "org.apache.druid", - "com.metamx.druid", - "asm", - "org.ow2.asm", - "org.jboss.netty", - "com.google.guava", - "com.google.code.findbugs", - "com.google.protobuf", - "com.esotericsoftware.minlog", - "log4j", - "org.slf4j", - "commons-logging", - "org.eclipse.jetty", - "org.mortbay.jetty", - "com.sun.jersey", - "com.sun.jersey.contribs", - "common-beanutils", - "commons-codec", - "commons-lang", - "commons-cli", - "commons-io", - "javax.activation", - "org.apache.httpcomponents", - "org.apache.zookeeper", - "org.codehaus.jackson", - "com.fasterxml.jackson", - "com.fasterxml.jackson.core", - "com.fasterxml.jackson.dataformat", - "com.fasterxml.jackson.datatype", - "org.roaringbitmap", - "net.java.dev.jets3t" - */ private static final Dependencies SECURITY_VULNERABILITY_EXCLUSIONS = Dependencies.builder() @@ -160,8 +96,6 @@ on the class path and propagated around the system. Most places TRY to make sure private final Dependencies hadoopExclusions; - private TeslaAether aether; - @Inject public ExtensionsConfig extensionsConfig; @@ -196,60 +130,53 @@ on the class path and propagated around the system. Most places TRY to make sure title = "A local repository that Maven will use to put downloaded files. Then pull-deps will lay these files out into the extensions directory as needed." ) public String localRepository = StringUtils.format("%s/%s", System.getProperty("user.home"), ".m2/repository"); - - @Option( - name = {"-r", "--remoteRepository"}, - title = "Add a remote repository. Unless --no-default-remote-repositories is provided, these will be used after https://repo1.maven.org/maven2/" - ) - List remoteRepositories = new ArrayList<>(); - @Option( name = "--no-default-remote-repositories", description = "Don't use the default remote repositories, only use the repositories provided directly via --remoteRepository" ) public boolean noDefaultRemoteRepositories = false; - @Option( name = {"-d", "--defaultVersion"}, title = "Version to use for extension artifacts without version information." ) public String defaultVersion = PullDependencies.class.getPackage().getImplementationVersion(); - @Option( name = {"--use-proxy"}, title = "Use http/https proxy to pull dependencies." ) public boolean useProxy = false; - @Option( name = {"--proxy-type"}, title = "The proxy type, should be either http or https" ) public String proxyType = "https"; - @Option( name = {"--proxy-host"}, title = "The proxy host" ) public String proxyHost = ""; - @Option( name = {"--proxy-port"}, title = "The proxy port" ) public int proxyPort = -1; - @Option( name = {"--proxy-username"}, title = "The proxy username" ) public String proxyUsername = ""; - @Option( name = {"--proxy-password"}, title = "The proxy password" ) public String proxyPassword = ""; + @Option( + name = {"-r", "--remoteRepository"}, + title = "Add a remote repository. 
Unless --no-default-remote-repositories is provided, these will be used after https://repo1.maven.org/maven2/" + ) + List remoteRepositories = new ArrayList<>(); + private RepositorySystem repositorySystem; + private RepositorySystemSession repositorySystemSession; @SuppressWarnings("unused") // used by com.github.rvesse.airline public PullDependencies() @@ -261,18 +188,74 @@ public PullDependencies() } // Used for testing only - PullDependencies(TeslaAether aether, ExtensionsConfig extensionsConfig, Dependencies hadoopExclusions) + PullDependencies( + RepositorySystem repositorySystem, + RepositorySystemSession repositorySystemSession, + ExtensionsConfig extensionsConfig, + Dependencies hadoopExclusions + ) { - this.aether = aether; + this.repositorySystem = repositorySystem; + this.repositorySystemSession = repositorySystemSession; this.extensionsConfig = extensionsConfig; this.hadoopExclusions = hadoopExclusions; } + private RepositorySystem getRepositorySystem() + { + DefaultServiceLocator locator = MavenRepositorySystemUtils.newServiceLocator(); + locator.addService(RepositoryConnectorFactory.class, BasicRepositoryConnectorFactory.class); + locator.addService(TransporterFactory.class, HttpTransporterFactory.class); + return locator.getService(RepositorySystem.class); + } + + protected RepositorySystemSession getRepositorySystemSession() + { + DefaultRepositorySystemSession session = MavenRepositorySystemUtils.newSession(); + LocalRepository localRepo = new LocalRepository(localRepository); + session.setLocalRepositoryManager(repositorySystem.newLocalRepositoryManager(session, localRepo)); + + // Set up the proxy configuration if required + if (useProxy) { + Proxy proxy = new Proxy( + proxyType, + proxyHost, + proxyPort, + isBlank(proxyUsername) ? 
null : new AuthenticationBuilder() + .addUsername(proxyUsername) + .addPassword(proxyPassword) + .build() + ); + + final DefaultProxySelector proxySelector = new DefaultProxySelector(); + proxySelector.add(proxy, null); + + session.setProxySelector(proxySelector); + } + + return session; + } + + protected List getRemoteRepositories() + { + List repositories = new ArrayList<>(); + + if (!noDefaultRemoteRepositories) { + repositories.add(new RemoteRepository.Builder("central", "default", DEFAULT_REMOTE_REPOSITORIES.get(0)).build()); + } + + for (String repoUrl : remoteRepositories) { + repositories.add(new RemoteRepository.Builder(null, "default", repoUrl).build()); + } + + return repositories; + } + @Override public void run() { - if (aether == null) { - aether = getAetherClient(); + if (repositorySystem == null) { + repositorySystem = getRepositorySystem(); } final File extensionsDir = new File(extensionsConfig.getDirectory()); @@ -334,7 +317,7 @@ public void run() } } - private Artifact getArtifact(String coordinate) + protected Artifact getArtifact(String coordinate) { DefaultArtifact versionedArtifact; try { @@ -367,6 +350,12 @@ private void downloadExtension(Artifact versionedArtifact, File toLocation, Depe { final CollectRequest collectRequest = new CollectRequest(); collectRequest.setRoot(new Dependency(versionedArtifact, JavaScopes.RUNTIME)); + + List repositories = getRemoteRepositories(); + for (RemoteRepository repo : repositories) { + collectRequest.addRepository(repo); + } + final DependencyRequest dependencyRequest = new DependencyRequest( collectRequest, DependencyFilterUtils.andFilter( @@ -375,13 +364,7 @@ private void downloadExtension(Artifact versionedArtifact, File toLocation, Depe String scope = node.getDependency().getScope(); if (scope != null) { scope = StringUtils.toLowerCase(scope); - if ("provided".equals(scope)) { - return false; - } - if ("test".equals(scope)) { - return false; - } - if ("system".equals(scope)) { + if ("provided".equals(scope) || "test".equals(scope) || "system".equals(scope)) { return false; } } @@ -402,7 +385,17 @@ private void downloadExtension(Artifact versionedArtifact, File toLocation, Depe try { log.info("Start downloading extension [%s]", versionedArtifact); - final List artifacts = aether.resolveArtifacts(dependencyRequest); + if (repositorySystemSession == null) { + repositorySystemSession = getRepositorySystemSession(); + } + + final DependencyResult result = repositorySystem.resolveDependencies( + repositorySystemSession, + dependencyRequest + ); + final List artifacts = result.getArtifactResults().stream() + .map(ArtifactResult::getArtifact) + .collect(Collectors.toList()); for (Artifact artifact : artifacts) { if (exclusions.contain(artifact)) { @@ -413,138 +406,18 @@ private void downloadExtension(Artifact versionedArtifact, File toLocation, Depe } } } - catch (Exception e) { - log.error(e, "Unable to resolve artifacts for [%s].", dependencyRequest); - throw new RuntimeException(e); - } - log.info("Finish downloading extension [%s]", versionedArtifact); - } - - @SuppressForbidden(reason = "System#out") - private DefaultTeslaAether getAetherClient() - { - /* - DefaultTeslaAether logs a bunch of stuff to System.out, which is annoying. We choose to disable that - unless debug logging is turned on. "Disabling" it, however, is kinda bass-ackwards. We copy out a reference - to the current System.out, and set System.out to a noop output stream. Then after DefaultTeslaAether has pulled - The reference we swap things back. 
- - This has implications for other things that are running in parallel to this. Namely, if anything else also grabs - a reference to System.out or tries to log to it while we have things adjusted like this, then they will also log - to nothingness. Fortunately, the code that calls this is single-threaded and shouldn't hopefully be running - alongside anything else that's grabbing System.out. But who knows. - */ - - final List remoteUriList = new ArrayList<>(); - if (!noDefaultRemoteRepositories) { - remoteUriList.addAll(DEFAULT_REMOTE_REPOSITORIES); - } - remoteUriList.addAll(remoteRepositories); - - List remoteRepositories = new ArrayList<>(); - for (String uri : remoteUriList) { - try { - URI u = new URI(uri); - Repository r = new Repository(uri); - - if (u.getUserInfo() != null) { - String[] auth = u.getUserInfo().split(":", 2); - if (auth.length == 2) { - r.setUsername(auth[0]); - r.setPassword(auth[1]); - } else { - log.warn( - "Invalid credentials in repository URI, expecting [:], got [%s] for [%s]", - u.getUserInfo(), - uri - ); - } - } - remoteRepositories.add(r); - } - catch (URISyntaxException e) { - throw new RuntimeException(e); + catch (DependencyResolutionException e) { + if (e.getCause() instanceof ArtifactNotFoundException) { + log.error("Artifact not found in any configured repositories: [%s]", versionedArtifact); + } else { + log.error(e, "Unable to resolve artifacts for [%s].", dependencyRequest); } } - - if (log.isTraceEnabled() || log.isDebugEnabled()) { - return createTeslaAether(remoteRepositories); - } - - PrintStream oldOut = System.out; - try { - System.setOut( - new PrintStream( - new OutputStream() - { - @Override - public void write(int b) - { - - } - - @Override - public void write(byte[] b) - { - - } - - @Override - public void write(byte[] b, int off, int len) - { - - } - }, - false, - StringUtils.UTF8_STRING - ) - ); - return createTeslaAether(remoteRepositories); - } - catch (UnsupportedEncodingException e) { - // should never happen - throw new IllegalStateException(e); - } - finally { - System.setOut(oldOut); - } - } - - private DefaultTeslaAether createTeslaAether(List remoteRepositories) - { - if (!useProxy) { - return new DefaultTeslaAether( - localRepository, - remoteRepositories.toArray(new Repository[0]) - ); - } - - if (!StringUtils.toLowerCase(proxyType).equals(Proxy.TYPE_HTTP) && - !StringUtils.toLowerCase(proxyType).equals(Proxy.TYPE_HTTPS)) { - throw new IllegalArgumentException("invalid proxy type: " + proxyType); + catch (IOException e) { + log.error(e, "I/O error while processing artifact [%s].", versionedArtifact); + throw new RuntimeException(e); } - - RepositorySystemSession repositorySystemSession = - new RepositorySystemSessionProvider(new File(localRepository)).get(); - List rl = remoteRepositories.stream().map(r -> { - RemoteRepository.Builder builder = new RemoteRepository.Builder(r.getId(), "default", r.getUrl()); - if (r.getUsername() != null && r.getPassword() != null) { - Authentication auth = new AuthenticationBuilder().addUsername(r.getUsername()) - .addPassword(r.getPassword()) - .build(); - builder.setAuthentication(auth); - } - - final Authentication proxyAuth; - if (Strings.isNullOrEmpty(proxyUsername)) { - proxyAuth = null; - } else { - proxyAuth = new AuthenticationBuilder().addUsername(proxyUsername).addPassword(proxyPassword).build(); - } - builder.setProxy(new Proxy(proxyType, proxyHost, proxyPort, proxyAuth)); - return builder.build(); - }).collect(Collectors.toList()); - return new DefaultTeslaAether(rl, 
repositorySystemSession); + log.info("Finish downloading extension [%s]", versionedArtifact); } /** @@ -567,6 +440,11 @@ private void createExtensionDirectory(String coordinate, File atLocation) } } + private boolean isBlank(final String toCheck) + { + return toCheck == null || toCheck.isEmpty(); + } + @VisibleForTesting static class Dependencies { @@ -579,15 +457,15 @@ private Dependencies(Builder builder) groupIdToArtifactIds = builder.groupIdToArtifactIdsBuilder.build(); } - boolean contain(Artifact artifact) + static Builder builder() { - Set artifactIds = groupIdToArtifactIds.get(artifact.getGroupId()); - return artifactIds.contains(ANY_ARTIFACT_ID) || artifactIds.contains(artifact.getArtifactId()); + return new Builder(); } - static Builder builder() + boolean contain(Artifact artifact) { - return new Builder(); + Set artifactIds = groupIdToArtifactIds.get(artifact.getGroupId()); + return artifactIds.contains(ANY_ARTIFACT_ID) || artifactIds.contains(artifact.getArtifactId()); } static final class Builder diff --git a/services/src/test/java/org/apache/druid/cli/PullDependenciesTest.java b/services/src/test/java/org/apache/druid/cli/PullDependenciesTest.java index 2d4db23b0e3a..851d0f177202 100644 --- a/services/src/test/java/org/apache/druid/cli/PullDependenciesTest.java +++ b/services/src/test/java/org/apache/druid/cli/PullDependenciesTest.java @@ -21,16 +21,32 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; -import io.tesla.aether.internal.DefaultTeslaAether; import org.apache.druid.guice.ExtensionsConfig; import org.apache.druid.java.util.common.StringUtils; +import org.apache.maven.repository.internal.MavenRepositorySystemUtils; +import org.eclipse.aether.DefaultRepositorySystemSession; +import org.eclipse.aether.RepositorySystem; +import org.eclipse.aether.RepositorySystemSession; import org.eclipse.aether.artifact.Artifact; import org.eclipse.aether.artifact.DefaultArtifact; +import org.eclipse.aether.connector.basic.BasicRepositoryConnectorFactory; import org.eclipse.aether.graph.DefaultDependencyNode; import org.eclipse.aether.graph.Dependency; -import org.eclipse.aether.graph.DependencyFilter; import org.eclipse.aether.graph.DependencyNode; +import org.eclipse.aether.impl.DefaultServiceLocator; +import org.eclipse.aether.repository.Authentication; +import org.eclipse.aether.repository.LocalRepository; +import org.eclipse.aether.repository.Proxy; +import org.eclipse.aether.repository.RemoteRepository; +import org.eclipse.aether.resolution.ArtifactRequest; +import org.eclipse.aether.resolution.ArtifactResult; import org.eclipse.aether.resolution.DependencyRequest; +import org.eclipse.aether.resolution.DependencyResult; +import org.eclipse.aether.spi.connector.RepositoryConnectorFactory; +import org.eclipse.aether.spi.connector.transport.TransporterFactory; +import org.eclipse.aether.transport.http.HttpTransporterFactory; +import org.eclipse.aether.util.artifact.JavaScopes; +import org.eclipse.aether.util.repository.AuthenticationBuilder; import org.hamcrest.CoreMatchers; import org.junit.Assert; import org.junit.Before; @@ -50,6 +66,11 @@ import java.util.Set; import java.util.stream.Collectors; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.any; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.spy; + public class PullDependenciesTest { private static final String EXTENSION_A_COORDINATE = "groupX:extension_A:123"; @@ -71,23 +92,18 @@ public class 
PullDependenciesTest .put(DEPENDENCY_GROUPID, HADOOP_CLIENT_VULNERABLE_ARTIFACTID1) .put(DEPENDENCY_GROUPID, HADOOP_CLIENT_VULNERABLE_ARTIFACTID2) .build(); - + private static File localRepo; // a mock local repository that stores jars + private static Map> extensionToDependency; @Rule public final TemporaryFolder temporaryFolder = new TemporaryFolder(); - - private File localRepo; // a mock local repository that stores jars - private final Artifact extension_A = new DefaultArtifact(EXTENSION_A_COORDINATE); private final Artifact extension_B = new DefaultArtifact(EXTENSION_B_COORDINATE); private final Artifact hadoop_client_2_3_0 = new DefaultArtifact(HADOOP_CLIENT_2_3_0_COORDINATE); private final Artifact hadoop_client_2_4_0 = new DefaultArtifact(HADOOP_CLIENT_2_4_0_COORDINATE); - private PullDependencies pullDependencies; private File rootExtensionsDir; private File rootHadoopDependenciesDir; - private Map> extensionToDependency; - @Before public void setUp() throws Exception { @@ -105,18 +121,22 @@ public void setUp() throws Exception rootExtensionsDir = temporaryFolder.newFolder("extensions"); rootHadoopDependenciesDir = temporaryFolder.newFolder("druid_hadoop_dependencies"); + RepositorySystem realRepositorySystem = RealRepositorySystemUtil.newRepositorySystem(); + RepositorySystem spyMockRepositorySystem = spy(realRepositorySystem); + RepositorySystemSession repositorySystemSession = RealRepositorySystemUtil.newRepositorySystemSession( + spyMockRepositorySystem, + localRepo.getPath() + ); + + doAnswer(invocation -> { + DependencyRequest request = invocation.getArgument(1); + return mockDependencyResult(request.getCollectRequest().getRoot().getArtifact()); + }).when(spyMockRepositorySystem).resolveDependencies(eq(repositorySystemSession), any(DependencyRequest.class)); + + pullDependencies = new PullDependencies( - new DefaultTeslaAether() - { - @Override - public List resolveArtifacts(DependencyRequest request) - { - return getArtifactsForExtension( - request.getCollectRequest().getRoot().getArtifact(), - request.getFilter() - ); - } - }, + spyMockRepositorySystem, + repositorySystemSession, new ExtensionsConfig() { @Override @@ -140,14 +160,15 @@ public String getHadoopDependenciesDir() HADOOP_CLIENT_2_4_0_COORDINATE ); - // Because --clean is specified, pull-deps will first remove existing root extensions and hadoop dependencies pullDependencies.clean = true; } - private List getArtifactsForExtension(Artifact artifact, DependencyFilter filter) + private DependencyResult mockDependencyResult(Artifact artifact) { - final List names = extensionToDependency.get(artifact); - final List artifacts = new ArrayList<>(); + final List names = extensionToDependency.getOrDefault(artifact, Collections.emptyList()); + final List artifacts = new ArrayList<>(); + List children = new ArrayList<>(); + for (String name : names) { final File jarFile = new File(localRepo, name + ".jar"); try { @@ -156,18 +177,23 @@ private List getArtifactsForExtension(Artifact artifact, DependencyFil catch (IOException e) { throw new RuntimeException(e); } - - DependencyNode node = new DefaultDependencyNode( - new Dependency( - new DefaultArtifact(DEPENDENCY_GROUPID, name, null, "jar", "1.0", null, jarFile), - "compile" - ) + Artifact depArtifact = new DefaultArtifact("groupid", name, null, "jar", "1.0", + null, jarFile ); - if (filter.accept(node, Collections.emptyList())) { - artifacts.add(node.getArtifact()); - } + DependencyNode depNode = new DefaultDependencyNode(new Dependency(depArtifact, JavaScopes.COMPILE)); + 
children.add(depNode); + ArtifactResult artifactResult = new ArtifactResult(new ArtifactRequest(depNode)); + artifactResult.setArtifact(depArtifact); + artifacts.add(artifactResult); } - return artifacts; + + DependencyNode rootNode = new DefaultDependencyNode(new Dependency(artifact, JavaScopes.COMPILE)); + rootNode.setChildren(children); + + DependencyResult result = new DependencyResult(new DependencyRequest()); + result.setRoot(rootNode); + result.setArtifactResults(artifacts); + return result; } private List getExpectedJarFiles(Artifact artifact) @@ -299,4 +325,157 @@ public void testPullDependeciesExcludesHadoopSecurityVulnerabilities() Assert.assertThat(dependencies, CoreMatchers.not(CoreMatchers.hasItem(HADOOP_CLIENT_VULNERABLE_JAR1))); Assert.assertThat(dependencies, CoreMatchers.not(CoreMatchers.hasItem(HADOOP_CLIENT_VULNERABLE_JAR2))); } + + @Test + public void testPullDependenciesCleanFlag() throws IOException + { + File dummyFile1 = new File(rootExtensionsDir, "dummy.txt"); + File dummyFile2 = new File(rootHadoopDependenciesDir, "dummy.txt"); + Assert.assertTrue(dummyFile1.createNewFile()); + Assert.assertTrue(dummyFile2.createNewFile()); + + pullDependencies.clean = true; + pullDependencies.run(); + + Assert.assertFalse(dummyFile1.exists()); + Assert.assertFalse(dummyFile2.exists()); + } + + @Test + public void testPullDependenciesNoDefaultRemoteRepositories() + { + pullDependencies.noDefaultRemoteRepositories = true; + pullDependencies.remoteRepositories = ImmutableList.of("https://custom.repo"); + + pullDependencies.run(); + + List repositories = pullDependencies.getRemoteRepositories(); + Assert.assertEquals(1, repositories.size()); + Assert.assertEquals("https://custom.repo", repositories.get(0).getUrl()); + } + + @Test + public void testPullDependenciesDirectoryCreationFailure() throws IOException + { + if (rootExtensionsDir.exists()) { + rootExtensionsDir.delete(); + } + Assert.assertTrue(rootExtensionsDir.createNewFile()); + + Assert.assertThrows(IllegalArgumentException.class, () -> pullDependencies.run()); + } + + @Test + public void testGetArtifactWithValidCoordinate() + { + String coordinate = "groupX:artifactX:1.0.0"; + DefaultArtifact artifact = (DefaultArtifact) pullDependencies.getArtifact(coordinate); + Assert.assertEquals("groupX", artifact.getGroupId()); + Assert.assertEquals("artifactX", artifact.getArtifactId()); + Assert.assertEquals("1.0.0", artifact.getVersion()); + } + + @Test + public void testGetArtifactwithCoordinateWithoutDefaultVersion() + { + String coordinate = "groupY:artifactY"; + Assert.assertThrows( + "Bad artifact coordinates groupY:artifactY, expected format is :[:[:]]:", + IllegalArgumentException.class, + () -> pullDependencies.getArtifact(coordinate) + ); + + } + + @Test + public void testGetArtifactWithCoordinateWithoutVersion() + { + pullDependencies.defaultVersion = "2.0.0"; + String coordinate = "groupY:artifactY"; + DefaultArtifact artifact = (DefaultArtifact) pullDependencies.getArtifact(coordinate); + Assert.assertEquals("groupY", artifact.getGroupId()); + Assert.assertEquals("artifactY", artifact.getArtifactId()); + Assert.assertEquals("2.0.0", artifact.getVersion()); + } + + @Test + public void testGetRemoteRepositoriesWithDefaultRepositories() + { + pullDependencies.noDefaultRemoteRepositories = false; // Use default remote repositories + pullDependencies.remoteRepositories = ImmutableList.of("https://custom.repo"); + + List repositories = pullDependencies.getRemoteRepositories(); + Assert.assertEquals(2, repositories.size()); 
+ Assert.assertEquals("https://repo1.maven.org/maven2/", repositories.get(0).getUrl()); + Assert.assertEquals("https://custom.repo", repositories.get(1).getUrl()); + } + + @Test + public void testGetRepositorySystemSessionWithProxyConfiguration() + { + pullDependencies.useProxy = true; + pullDependencies.proxyType = "http"; + pullDependencies.proxyHost = "localhost"; + pullDependencies.proxyPort = 8080; + pullDependencies.proxyUsername = "user"; + pullDependencies.proxyPassword = "password"; + + DefaultRepositorySystemSession session = (DefaultRepositorySystemSession) pullDependencies.getRepositorySystemSession(); + + LocalRepository localRepo = session.getLocalRepositoryManager().getRepository(); + Assert.assertEquals(pullDependencies.localRepository, localRepo.getBasedir().getAbsolutePath()); + + Proxy proxy = session.getProxySelector().getProxy( + new RemoteRepository.Builder("test", "default", "http://example.com").build() + ); + RemoteRepository testRepository = new RemoteRepository.Builder("test", "default", "http://example.com") + .setProxy(proxy) + .build(); + + Assert.assertNotNull(proxy); + Assert.assertEquals("localhost", proxy.getHost()); + Assert.assertEquals(8080, proxy.getPort()); + Assert.assertEquals("http", proxy.getType()); + + Authentication auth = new AuthenticationBuilder().addUsername("user").addPassword("password").build(); + Assert.assertEquals(auth, proxy.getAuthentication()); + } + + @Test + public void testGetRepositorySystemSessionWithoutProxyConfiguration() + { + pullDependencies.useProxy = false; + DefaultRepositorySystemSession session = (DefaultRepositorySystemSession) pullDependencies.getRepositorySystemSession(); + LocalRepository localRepo = session.getLocalRepositoryManager().getRepository(); + Assert.assertEquals(pullDependencies.localRepository, localRepo.getBasedir().getAbsolutePath()); + Proxy proxy = session.getProxySelector().getProxy( + new RemoteRepository.Builder("test", "default", "http://example.com").build() + ); + Assert.assertNull(proxy); + } + + private static class RealRepositorySystemUtil + { + public static RepositorySystem newRepositorySystem() + { + DefaultServiceLocator locator = MavenRepositorySystemUtils.newServiceLocator(); + locator.addService(RepositoryConnectorFactory.class, BasicRepositoryConnectorFactory.class); + locator.addService(TransporterFactory.class, HttpTransporterFactory.class); + return locator.getService(RepositorySystem.class); + } + + public static DefaultRepositorySystemSession newRepositorySystemSession( + RepositorySystem system, + String localRepoPath + ) + { + DefaultRepositorySystemSession session = MavenRepositorySystemUtils.newSession(); + + LocalRepository localRepo = new LocalRepository(localRepoPath); + session.setLocalRepositoryManager(system.newLocalRepositoryManager(session, localRepo)); + + return session; + } + } + } From 5cb2156774d423bf0f3f5b2c82263f398458e453 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Mon, 16 Sep 2024 05:30:09 -0700 Subject: [PATCH 29/47] MSQ: Wake up the main controller thread on workerError. (#17075) This isn't necessary when using MSQWorkerTaskLauncher as the WorkerManager implementation, because in that case, task failure also wakes up the main thread. However, when using workers that are not task-based, we don't want to rely on the WorkerManager for this. 
---
 .../main/java/org/apache/druid/msq/exec/ControllerImpl.java | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java
index 6d1ef21abbf2..2a29d40b9fea 100644
--- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java
+++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/ControllerImpl.java
@@ -117,6 +117,7 @@
 import org.apache.druid.msq.indexing.error.TooManySegmentsInTimeChunkFault;
 import org.apache.druid.msq.indexing.error.TooManyWarningsFault;
 import org.apache.druid.msq.indexing.error.UnknownFault;
+import org.apache.druid.msq.indexing.error.WorkerFailedFault;
 import org.apache.druid.msq.indexing.error.WorkerRpcFailedFault;
 import org.apache.druid.msq.indexing.processor.SegmentGeneratorFrameProcessorFactory;
 import org.apache.druid.msq.indexing.report.MSQSegmentReport;
@@ -754,6 +755,11 @@ public void workerError(MSQErrorReport errorReport)
     }
 
     workerErrorRef.compareAndSet(null, mapQueryColumnNameToOutputColumnName(errorReport));
+
+    // Wake up the main controller thread.
+    addToKernelManipulationQueue(kernel -> {
+      throw new MSQException(new WorkerFailedFault(errorReport.getTaskId(), null));
+    });
   }
 
   /**

From 483f009f7addc3c2521dede574c331515fa84420 Mon Sep 17 00:00:00 2001
From: Gian Merlino
Date: Mon, 16 Sep 2024 05:30:47 -0700
Subject: [PATCH 30/47] Remove unused WorkerManagerClient interface. (#17073)

---
 .../druid/msq/exec/WorkerManagerClient.java   |  57 ----------
 .../client/IndexerWorkerManagerClient.java    | 105 ------------------
 .../IndexerWorkerManagerClientTest.java       | 104 -----------------
 3 files changed, 266 deletions(-)
 delete mode 100644 extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerManagerClient.java
 delete mode 100644 extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/client/IndexerWorkerManagerClient.java
 delete mode 100644 extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/client/IndexerWorkerManagerClientTest.java

diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerManagerClient.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerManagerClient.java
deleted file mode 100644
index 415c93a85999..000000000000
--- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/WorkerManagerClient.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.druid.msq.exec;
-
-import org.apache.druid.indexer.TaskLocation;
-import org.apache.druid.indexer.TaskStatus;
-import org.apache.druid.msq.indexing.MSQWorkerTask;
-
-import java.io.Closeable;
-import java.util.Map;
-import java.util.Set;
-
-/**
- * Generic interface to the "worker manager" mechanism which starts, cancels and monitors worker tasks.
- */
-public interface WorkerManagerClient extends Closeable
-{
-  String run(String taskId, MSQWorkerTask task);
-
-  /**
-   * @param workerId the task ID
-   *
-   * @return a {@code TaskLocation} associated with the task or
-   * {@code TaskLocation.unknown()} if no associated entry could be found
-   */
-  TaskLocation location(String workerId);
-
-  /**
-   * Fetches status map corresponding to a group of task ids
-   */
-  Map<String, TaskStatus> statuses(Set<String> taskIds);
-
-  /**
-   * Cancel the task corresponding to the provided workerId
-   */
-  void cancel(String workerId);
-
-  @Override
-  void close();
-}

diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/client/IndexerWorkerManagerClient.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/client/IndexerWorkerManagerClient.java
deleted file mode 100644
index 927130e0ca7c..000000000000
--- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/client/IndexerWorkerManagerClient.java
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.druid.msq.indexing.client;
-
-import com.google.common.collect.ImmutableSet;
-import org.apache.druid.client.indexing.TaskStatusResponse;
-import org.apache.druid.common.guava.FutureUtils;
-import org.apache.druid.indexer.TaskLocation;
-import org.apache.druid.indexer.TaskStatus;
-import org.apache.druid.msq.exec.WorkerManagerClient;
-import org.apache.druid.msq.indexing.MSQWorkerTask;
-import org.apache.druid.rpc.indexing.OverlordClient;
-
-import java.util.Map;
-import java.util.Set;
-
-/**
- * Worker manager client backed by the Indexer service. Glues together
- * three different mechanisms to provide the single multi-stage query interface.
- */
-public class IndexerWorkerManagerClient implements WorkerManagerClient
-{
-  private final OverlordClient overlordClient;
-  private final TaskLocationFetcher locationFetcher = new TaskLocationFetcher();
-
-  public IndexerWorkerManagerClient(final OverlordClient overlordClient)
-  {
-    this.overlordClient = overlordClient;
-  }
-
-  @Override
-  public String run(String taskId, MSQWorkerTask task)
-  {
-    FutureUtils.getUnchecked(overlordClient.runTask(taskId, task), true);
-    return taskId;
-  }
-
-  @Override
-  public void cancel(String taskId)
-  {
-    FutureUtils.getUnchecked(overlordClient.cancelTask(taskId), true);
-  }
-
-  @Override
-  public Map<String, TaskStatus> statuses(Set<String> taskIds)
-  {
-    return FutureUtils.getUnchecked(overlordClient.taskStatuses(taskIds), true);
-  }
-
-  @Override
-  public TaskLocation location(String workerId)
-  {
-    return locationFetcher.getLocation(workerId);
-  }
-
-  @Override
-  public void close()
-  {
-    // Nothing to do. The OverlordServiceClient is closed by the JVM lifecycle.
-  }
-
-  private class TaskLocationFetcher
-  {
-    TaskLocation getLocation(String workerId)
-    {
-      final TaskStatus taskStatus = FutureUtils.getUnchecked(
-          overlordClient.taskStatuses(ImmutableSet.of(workerId)),
-          true
-      ).get(workerId);
-
-      if (taskStatus != null
-          && !TaskLocation.unknown().equals(taskStatus.getLocation())) {
-        return taskStatus.getLocation();
-      }
-
-      // Retry with the single status API
-      final TaskStatusResponse statusResponse = FutureUtils.getUnchecked(
-          overlordClient.taskStatus(workerId),
-          true
-      );
-      if (statusResponse == null || statusResponse.getStatus() == null) {
-        return TaskLocation.unknown();
-      } else {
-        return statusResponse.getStatus().getLocation();
-      }
-    }
-  }
-}

diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/client/IndexerWorkerManagerClientTest.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/client/IndexerWorkerManagerClientTest.java
deleted file mode 100644
index 4b53420cbb9d..000000000000
--- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/client/IndexerWorkerManagerClientTest.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.druid.msq.indexing.client;
-
-import com.google.common.util.concurrent.Futures;
-import org.apache.druid.client.indexing.TaskStatusResponse;
-import org.apache.druid.indexer.TaskLocation;
-import org.apache.druid.indexer.TaskState;
-import org.apache.druid.indexer.TaskStatus;
-import org.apache.druid.indexer.TaskStatusPlus;
-import org.apache.druid.java.util.common.DateTimes;
-import org.apache.druid.rpc.indexing.OverlordClient;
-import org.junit.Assert;
-import org.junit.Test;
-import org.mockito.ArgumentMatchers;
-import org.mockito.Mockito;
-
-import java.util.Collections;
-
-public class IndexerWorkerManagerClientTest
-{
-
-  @Test
-  public void testGetLocationCallsMultiStatusApiByDefault()
-  {
-    final OverlordClient overlordClient = Mockito.mock(OverlordClient.class);
-
-    final String taskId = "worker1";
-    final TaskLocation expectedLocation = new TaskLocation("localhost", 1000, 1100, null);
-    Mockito.when(overlordClient.taskStatuses(Collections.singleton(taskId))).thenReturn(
-        Futures.immediateFuture(
-            Collections.singletonMap(
-                taskId,
-                new TaskStatus(taskId, TaskState.RUNNING, 100L, null, expectedLocation)
-            )
-        )
-    );
-
-    final IndexerWorkerManagerClient managerClient = new IndexerWorkerManagerClient(overlordClient);
-    Assert.assertEquals(managerClient.location(taskId), expectedLocation);
-
-    Mockito.verify(overlordClient, Mockito.times(1)).taskStatuses(ArgumentMatchers.anySet());
-    Mockito.verify(overlordClient, Mockito.never()).taskStatus(ArgumentMatchers.anyString());
-  }
-
-  @Test
-  public void testGetLocationFallsBackToSingleTaskApiIfLocationIsUnknown()
-  {
-    final OverlordClient overlordClient = Mockito.mock(OverlordClient.class);
-
-    final String taskId = "worker1";
-    Mockito.when(overlordClient.taskStatuses(Collections.singleton(taskId))).thenReturn(
-        Futures.immediateFuture(
-            Collections.singletonMap(
-                taskId,
-                new TaskStatus(taskId, TaskState.RUNNING, 100L, null, TaskLocation.unknown())
-            )
-        )
-    );
-
-    final TaskLocation expectedLocation = new TaskLocation("localhost", 1000, 1100, null);
-    final TaskStatusPlus taskStatus = new TaskStatusPlus(
-        taskId,
-        null,
-        null,
-        DateTimes.nowUtc(),
-        DateTimes.nowUtc(),
-        TaskState.RUNNING,
-        null,
-        100L,
-        expectedLocation,
-        "wiki",
-        null
-    );
-
-    Mockito.when(overlordClient.taskStatus(taskId)).thenReturn(
-        Futures.immediateFuture(new TaskStatusResponse(taskId, taskStatus))
-    );
-
-    final IndexerWorkerManagerClient managerClient = new IndexerWorkerManagerClient(overlordClient);
-    Assert.assertEquals(managerClient.location(taskId), expectedLocation);
-
-    Mockito.verify(overlordClient, Mockito.times(1)).taskStatuses(ArgumentMatchers.anySet());
-    Mockito.verify(overlordClient, Mockito.times(1)).taskStatus(ArgumentMatchers.anyString());
-  }
-
-}

From ad7f0be49609a86362ec0dec4863056f008c5f9b Mon Sep 17 00:00:00 2001
From: Gian Merlino
Date: Mon, 16 Sep 2024 06:08:36 -0700
Subject: [PATCH 31/47] Remove close method on MSQWarningReportPublisher. (#17071)

It didn't do anything and also wasn't called.
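A side note (illustrative only, not something this patch does): with close()
gone, the interface is left with a single abstract method, so an
implementation could even be supplied as a lambda at a hypothetical call site:

    // Hypothetical usage. publishException(int, Throwable) is now the
    // interface's only abstract method, so a lambda satisfies it.
    MSQWarningReportPublisher publisher =
        (stageNumber, e) -> System.err.println("Warning in stage " + stageNumber + ": " + e);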
---
 .../indexing/error/MSQWarningReportLimiterPublisher.java | 6 ------
 .../msq/indexing/error/MSQWarningReportPublisher.java    | 8 +-------
 .../indexing/error/MSQWarningReportSimplePublisher.java  | 6 ------
 3 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/MSQWarningReportLimiterPublisher.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/MSQWarningReportLimiterPublisher.java
index 9a8b3f79f6d2..ffc74077502e 100644
--- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/MSQWarningReportLimiterPublisher.java
+++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/MSQWarningReportLimiterPublisher.java
@@ -117,10 +117,4 @@ public void publishException(int stageNumber, Throwable e)
     }
     delegate.publishException(stageNumber, e);
   }
-
-  @Override
-  public void close() throws IOException
-  {
-    delegate.close();
-  }
 }

diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/MSQWarningReportPublisher.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/MSQWarningReportPublisher.java
index 9dbb51b65698..882ce4e98a64 100644
--- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/MSQWarningReportPublisher.java
+++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/MSQWarningReportPublisher.java
@@ -19,17 +19,11 @@
 
 package org.apache.druid.msq.indexing.error;
 
-import java.io.Closeable;
-import java.io.IOException;
-
 /**
  * Provides an interface for a worker to publish warnings to an external source.
  * For example, the worker uses this interface to send warnings to the controller.
 */
-public interface MSQWarningReportPublisher extends Closeable
+public interface MSQWarningReportPublisher
 {
   void publishException(int stageNumber, Throwable e);
-
-  @Override
-  void close() throws IOException;
 }

diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/MSQWarningReportSimplePublisher.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/MSQWarningReportSimplePublisher.java
index 1353f4040412..0a7554045daf 100644
--- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/MSQWarningReportSimplePublisher.java
+++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/MSQWarningReportSimplePublisher.java
@@ -63,10 +63,4 @@ public void publishException(int stageNumber, Throwable e)
       throw new RuntimeException(e2);
     }
   }
-
-  @Override
-  public void close()
-  {
-
-  }
 }

From fd055b2f3b1076aadfbae52ce527c577b88bdcc3 Mon Sep 17 00:00:00 2001
From: Victoria Lim
Date: Mon, 16 Sep 2024 15:52:37 -0700
Subject: [PATCH 32/47] docs: Refresh docs for SQL input source (#17031)

Co-authored-by: Charles Smith
---
 .../extensions-core/druid-lookups.md          |  4 +-
 docs/development/extensions-core/mysql.md     | 73 +++++++++++-------
 .../development/extensions-core/postgresql.md | 13 ++--
 docs/ingestion/input-sources.md               | 74 +++++++++----------
 docs/querying/lookups-cached-global.md        |  2 +-
 5 files changed, 95 insertions(+), 71 deletions(-)

diff --git a/docs/development/extensions-core/druid-lookups.md b/docs/development/extensions-core/druid-lookups.md
index d6219b8c7428..06283ec4d722 100644
--- a/docs/development/extensions-core/druid-lookups.md
+++ b/docs/development/extensions-core/druid-lookups.md
@@ -31,9 +31,9 @@ This module can be used side to side with other lookup module like the global cached lookup module.
 To use this Apache Druid extension, [include](../../configuration/extensions.md#loading-extensions) `druid-lookups-cached-single` in the extensions load list.
 
 :::info
- If using JDBC, you will need to add your database's client JAR files to the extension's directory.
+To use JDBC, you must add your database client JAR files to the extension's directory. For Postgres, the connector JAR is already included.
 See the MySQL extension documentation for instructions to obtain [MySQL](./mysql.md#install-mysql-connectorj) or [MariaDB](./mysql.md#install-mariadb-connectorj) connector libraries.
 Copy or symlink the downloaded file to `extensions/druid-lookups-cached-single` under the distribution root directory.
 :::

diff --git a/docs/development/extensions-core/mysql.md b/docs/development/extensions-core/mysql.md
index bc6012dbb5a3..a3678f65056f 100644
--- a/docs/development/extensions-core/mysql.md
+++ b/docs/development/extensions-core/mysql.md
@@ -1,6 +1,6 @@
 ---
 id: mysql
-title: "MySQL Metadata Store"
+title: "MySQL metadata store"
 ---
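Since the refreshed docs center on getting a JDBC connector JAR onto the
classpath, a quick way to sanity-check an installation is a tiny probe like
the one below. This is an editor's sketch, not part of the patch; the default
driver class name is Connector/J 8.x's well-known com.mysql.cj.jdbc.Driver
(older setups use com.mysql.jdbc.Driver instead).

    public class JdbcDriverProbe
    {
      public static void main(String[] args)
      {
        // Allow overriding the driver class, e.g. for MariaDB Connector/J.
        final String driverClass = args.length > 0 ? args[0] : "com.mysql.cj.jdbc.Driver";
        try {
          Class.forName(driverClass);
          System.out.println("Found JDBC driver: " + driverClass);
        }
        catch (ClassNotFoundException e) {
          System.out.println("Missing driver [" + driverClass
                             + "]; copy the connector JAR into the extension directory.");
        }
      }
    }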