From ba5e39e11d18575c74d4ec08e3c700492342bab1 Mon Sep 17 00:00:00 2001 From: Sasha Syrotenko Date: Fri, 30 Jan 2026 20:27:02 +0200 Subject: [PATCH 01/14] Adjust costs for burst scaleup during heavy lag for cost-based autoscaler --- .../autoscaler/CostBasedAutoScaler.java | 7 +++--- .../autoscaler/CostBasedAutoScalerTest.java | 23 ++++++++++--------- .../autoscaler/WeightedCostFunctionTest.java | 2 +- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java index a7ea833da347..060240014be4 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java @@ -62,7 +62,7 @@ public class CostBasedAutoScaler implements SupervisorTaskAutoScaler * This constant helps control the granularity of lag considerations in scaling decisions, * ensuring smoother transitions between scaled states and avoiding abrupt changes in task counts. */ - private static final int LAG_STEP = 100_000; + private static final int LAG_STEP = 50_000; /** * This parameter fine-tunes autoscaling behavior by adding extra flexibility * when calculating maximum allowable partitions per task in response to lag, @@ -71,10 +71,10 @@ public class CostBasedAutoScaler implements SupervisorTaskAutoScaler */ private static final int BASE_RAW_EXTRA = 5; // Base PPT lag threshold allowing to activate a burst scaleup to eliminate high lag. 
- static final int EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD = 50_000; + static final int EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD = 25_000; // Extra PPT lag threshold allowing activation of even more aggressive scaleup to eliminate high lag, // also enabling lag-amplified idle calculation decay in the cost function (to reduce idle weight). - static final int AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD = 100_000; + static final int AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD = 50_000; public static final String LAG_COST_METRIC = "task/autoScaler/costBased/lagCost"; public static final String IDLE_COST_METRIC = "task/autoScaler/costBased/idleCost"; @@ -172,6 +172,7 @@ public int computeTaskCountForScaleAction() log.info("New task count [%d] on supervisor [%s], scaling up", taskCount, supervisorId); } else if (!config.isScaleDownOnTaskRolloverOnly() && optimalTaskCount < currentTaskCount + && optimalTaskCount > 0 && ++scaleDownCounter >= config.getScaleDownBarrier()) { taskCount = optimalTaskCount; scaleDownCounter = 0; diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java index 17c247f24b02..7d1d3c85bd53 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java @@ -38,6 +38,7 @@ import static org.apache.druid.indexing.common.stats.DropwizardRowIngestionMeters.FIFTEEN_MINUTE_NAME; import static org.apache.druid.indexing.common.stats.DropwizardRowIngestionMeters.FIVE_MINUTE_NAME; import static org.apache.druid.indexing.common.stats.DropwizardRowIngestionMeters.ONE_MINUTE_NAME; +import static 
org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScaler.AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD; import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScaler.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD; import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScaler.computeExtraMaxPartitionsPerTaskIncrease; import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScaler.computeValidTaskCounts; @@ -174,14 +175,14 @@ class Example } Example[] examples = new Example[]{ - new Example(3, 50_000L, 8), - new Example(3, 300_000L, 15), - new Example(3, 500_000L, 30), - new Example(10, 100_000L, 15), - new Example(10, 300_000L, 30), - new Example(10, 500_000L, 30), - new Example(20, 500_000L, 30), - new Example(25, 500_000L, 30) + new Example(3, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD, 8), + new Example(3, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 3, 15), + new Example(3, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 5, 30), + new Example(10, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD - 1, 15), + new Example(10, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 3, 30), + new Example(10, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 10, 30), + new Example(20, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 10, 30), + new Example(25, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD* 10, 30) }; for (Example example : examples) { @@ -217,13 +218,13 @@ class Example public void testComputeExtraPPTIncrease() { // No extra increase below the threshold - Assert.assertEquals(0, computeExtraMaxPartitionsPerTaskIncrease(30L * 49_000L, 30, 3, 30)); + Assert.assertEquals(0, computeExtraMaxPartitionsPerTaskIncrease(30L * EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD - 1, 30, 3, 30)); Assert.assertEquals(4, computeExtraMaxPartitionsPerTaskIncrease(30L * EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD, 30, 3, 30)); // More aggressive increase when the 
lag is high - Assert.assertEquals(6, computeExtraMaxPartitionsPerTaskIncrease(30L * 300_000L, 30, 3, 30)); + Assert.assertEquals(8, computeExtraMaxPartitionsPerTaskIncrease(30L * AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 5, 30, 3, 30)); // Zero when on max task count - Assert.assertEquals(0, computeExtraMaxPartitionsPerTaskIncrease(30L * 500_000L, 30, 30, 30)); + Assert.assertEquals(0, computeExtraMaxPartitionsPerTaskIncrease(30L * AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 10, 30, 30, 30)); } @Test diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java index 416def7e3ab5..4f5832915f68 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java @@ -310,7 +310,7 @@ public void testLagAmplificationReducesIdleUnderHighLag() int partitionCount = 30; double pollIdleRatio = 0.1; - CostMetrics lowLag = createMetrics(40_000.0, currentTaskCount, partitionCount, pollIdleRatio); + CostMetrics lowLag = createMetrics(5_000.0, currentTaskCount, partitionCount, pollIdleRatio); CostMetrics highLag = createMetrics(500_000.0, currentTaskCount, partitionCount, pollIdleRatio); double lowLagCost = costFunction.computeCost(lowLag, proposedTaskCount, idleOnlyConfig).totalCost(); From 8c776320ce800d4b167210a542f1fc1241cb3bca Mon Sep 17 00:00:00 2001 From: Sasha Syrotenko Date: Fri, 30 Jan 2026 20:44:41 +0200 Subject: [PATCH 02/14] Checkstyle --- .../supervisor/autoscaler/CostBasedAutoScalerTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java index 7d1d3c85bd53..5deae662d755 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java @@ -182,7 +182,7 @@ class Example new Example(10, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 3, 30), new Example(10, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 10, 30), new Example(20, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 10, 30), - new Example(25, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD* 10, 30) + new Example(25, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 10, 30) }; for (Example example : examples) { From 696559432d6684ec457d12e91965a47a75a59a20 Mon Sep 17 00:00:00 2001 From: Sasha Syrotenko Date: Mon, 2 Feb 2026 16:52:56 +0200 Subject: [PATCH 03/14] Introduce additional temporary config params to tweak high lag handling --- docs/ingestion/supervisor.md | 2 +- .../CostBasedAutoScalerIntegrationTest.java | 5 +- .../autoscaler/CostBasedAutoScaler.java | 104 ++++++++++++------ .../autoscaler/CostBasedAutoScalerConfig.java | 63 ++++++++--- .../autoscaler/WeightedCostFunction.java | 50 ++++----- .../CostBasedAutoScalerConfigTest.java | 20 +++- .../CostBasedAutoScalerMockTest.java | 50 ++++----- .../autoscaler/CostBasedAutoScalerTest.java | 80 ++++++++------ .../autoscaler/WeightedCostFunctionTest.java | 82 ++++++++++++++ 9 files changed, 304 insertions(+), 152 deletions(-) diff --git a/docs/ingestion/supervisor.md b/docs/ingestion/supervisor.md index b6be49cec9ba..57a8d1a96b59 100644 --- a/docs/ingestion/supervisor.md +++ b/docs/ingestion/supervisor.md @@ -208,7 +208,7 @@ The following table 
outlines the configuration properties related to the `costBa |`lagWeight`|The weight of extracted lag value in cost function.| No| 0.25| |`idleWeight`|The weight of extracted poll idle value in cost function. | No | 0.75 | |`defaultProcessingRate`|A planned processing rate per task, required for first cost estimations. | No | 1000 | -|`scaleDownBarrier`| A number of successful scale down attempts which should be skipped to prevent the auto-scaler from scaling down tasks immediately. | No | 5 | +|`minScaleDownDelay`| Minimum duration between successful scale actions, specified as an ISO-8601 duration string. | No | `PT50M` | |`scaleDownDuringTaskRolloverOnly`| Indicates whether task scaling down is limited to periods during task rollovers only. | No | False | The following example shows a supervisor spec with `lagBased` autoscaler: diff --git a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/autoscaler/CostBasedAutoScalerIntegrationTest.java b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/autoscaler/CostBasedAutoScalerIntegrationTest.java index ad23e0b6fbe6..45873ac8096e 100644 --- a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/autoscaler/CostBasedAutoScalerIntegrationTest.java +++ b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/autoscaler/CostBasedAutoScalerIntegrationTest.java @@ -41,6 +41,7 @@ import org.hamcrest.Matchers; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; +import org.joda.time.Duration; import org.joda.time.Period; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -133,7 +134,7 @@ public void test_autoScaler_computesOptimalTaskCountAndProduceScaleDown() .lagWeight(0.9) .idleWeight(0.1) .scaleDownDuringTaskRolloverOnly(false) - .scaleDownBarrier(1) + .scaleDownBarrier(Duration.millis(1500)) .build(); final KafkaSupervisorSpec spec = createKafkaSupervisorWithAutoScaler(superId, autoScalerConfig, 
initialTaskCount); @@ -229,7 +230,7 @@ void test_scaleDownDuringTaskRollover() .idleWeight(0.9) .scaleDownDuringTaskRolloverOnly(true) // Do not slow scale-downs - .scaleDownBarrier(0) + .scaleDownBarrier(Duration.ZERO) .build(); final KafkaSupervisorSpec spec = createKafkaSupervisorWithAutoScaler(superId, autoScalerConfig, initialTaskCount); diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java index 060240014be4..8b137e000f9c 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java @@ -27,6 +27,7 @@ import org.apache.druid.indexing.overlord.supervisor.autoscaler.SupervisorTaskAutoScaler; import org.apache.druid.indexing.seekablestream.SeekableStreamIndexTaskRunner; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor; +import org.apache.druid.java.util.common.DateTimes; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.concurrent.Execs; import org.apache.druid.java.util.emitter.EmittingLogger; @@ -57,24 +58,22 @@ public class CostBasedAutoScaler implements SupervisorTaskAutoScaler private static final int MAX_INCREASE_IN_PARTITIONS_PER_TASK = 2; private static final int MAX_DECREASE_IN_PARTITIONS_PER_TASK = MAX_INCREASE_IN_PARTITIONS_PER_TASK * 2; + /** - * Defines the step size used for evaluating lag when computing scaling actions. - * This constant helps control the granularity of lag considerations in scaling decisions, - * ensuring smoother transitions between scaled states and avoiding abrupt changes in task counts. 
+ * Controls how fast the additional tasks grow with the square root of current tasks. + * This allows bigger jumps when under-provisioned, but growth slows down as the task count increases. */ - private static final int LAG_STEP = 50_000; + private static final int SQRT_TASK_COUNT_SCALE_FACTOR = 5; /** - * This parameter fine-tunes autoscaling behavior by adding extra flexibility - * when calculating maximum allowable partitions per task in response to lag, - * which must be processed as fast, as possible. - * It acts as a foundational factor that balances the responsiveness and stability of autoscaling. + * Caps the maximum number of additional tasks in a single scale-up to preserve stability. */ - private static final int BASE_RAW_EXTRA = 5; + private static final int MAX_JUMP = 12; + // Base PPT lag threshold allowing to activate a burst scaleup to eliminate high lag. - static final int EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD = 25_000; + static final int EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD = 50_000; // Extra PPT lag threshold allowing activation of even more aggressive scaleup to eliminate high lag, // also enabling lag-amplified idle calculation decay in the cost function (to reduce idle weight). 
- static final int AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD = 50_000; + static final int AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD = 100_000; public static final String LAG_COST_METRIC = "task/autoScaler/costBased/lagCost"; public static final String IDLE_COST_METRIC = "task/autoScaler/costBased/idleCost"; @@ -90,7 +89,7 @@ public class CostBasedAutoScaler implements SupervisorTaskAutoScaler private final WeightedCostFunction costFunction; private volatile CostMetrics lastKnownMetrics; - private int scaleDownCounter = 0; + private volatile long lastScaleActionTimeMillis = -1; public CostBasedAutoScaler( SeekableStreamSupervisor supervisor, @@ -153,7 +152,6 @@ public int computeTaskCountForRollover() if (config.isScaleDownOnTaskRolloverOnly()) { return computeOptimalTaskCount(lastKnownMetrics); } else { - scaleDownCounter = 0; return -1; } } @@ -166,16 +164,16 @@ public int computeTaskCountForScaleAction() // Perform scale-up actions; scale-down actions only if configured. int taskCount = -1; - if (optimalTaskCount > currentTaskCount) { + if (isScaleActionAllowed() && optimalTaskCount > currentTaskCount) { taskCount = optimalTaskCount; - scaleDownCounter = 0; // Nullify the scaleDown counter after a successful scaleup too. 
+ lastScaleActionTimeMillis = DateTimes.nowUtc().getMillis(); log.info("New task count [%d] on supervisor [%s], scaling up", taskCount, supervisorId); } else if (!config.isScaleDownOnTaskRolloverOnly() + && isScaleActionAllowed() && optimalTaskCount < currentTaskCount - && optimalTaskCount > 0 - && ++scaleDownCounter >= config.getScaleDownBarrier()) { + && optimalTaskCount > 0) { taskCount = optimalTaskCount; - scaleDownCounter = 0; + lastScaleActionTimeMillis = DateTimes.nowUtc().getMillis(); log.info("New task count [%d] on supervisor [%s], scaling down", taskCount, supervisorId); } else { log.info("No scaling required for supervisor [%s]", supervisorId); @@ -217,7 +215,8 @@ int computeOptimalTaskCount(CostMetrics metrics) currentTaskCount, (long) metrics.getAggregateLag(), config.getTaskCountMin(), - config.getTaskCountMax() + config.getTaskCountMax(), + config.getHighLagThreshold() ); if (validTaskCounts.length == 0) { @@ -231,7 +230,7 @@ int computeOptimalTaskCount(CostMetrics metrics) for (int taskCount : validTaskCounts) { CostResult costResult = costFunction.computeCost(metrics, taskCount, config); double cost = costResult.totalCost(); - log.debug( + log.info( "Proposed task count: %d, Cost: %.4f (lag: %.4f, idle: %.4f)", taskCount, cost, @@ -279,7 +278,8 @@ static int[] computeValidTaskCounts( int currentTaskCount, double aggregateLag, int taskCountMin, - int taskCountMax + int taskCountMax, + int highLagThreshold ) { if (partitionCount <= 0 || currentTaskCount <= 0) { @@ -288,11 +288,12 @@ static int[] computeValidTaskCounts( IntSet result = new IntArraySet(); final int currentPartitionsPerTask = partitionCount / currentTaskCount; - final int extraIncrease = computeExtraMaxPartitionsPerTaskIncrease( + final int extraIncrease = computeScaleUpBoost( aggregateLag, partitionCount, currentTaskCount, - taskCountMax + taskCountMax, + highLagThreshold ); final int effectiveMaxIncrease = MAX_INCREASE_IN_PARTITIONS_PER_TASK + extraIncrease; @@ -314,14 +315,23 @@ 
static int[] computeValidTaskCounts( /** * Computes extra allowed increase in partitions-per-task in scenarios when the average per-partition lag - * is above the configured threshold. By default, it is {@code EXTRA_SCALING_ACTIVATION_LAG_THRESHOLD}. - * Generally, one of the autoscaler priorities is to keep the lag as close to zero as possible. + * is above the configured threshold. + *

+ * This uses a capped sqrt-based formula: + * {@code additionalTasks = min(MAX_JUMP, BASE + sqrt(currentTasks) * SQRT_COEFF) * lagFactor * headroom} + *

+ * This ensures: + * 1. Narrow window preserved {@code MAX_JUMP} caps the reach. + * 2. Bigger jumps are allowed when under-provisioned. + * 3. Sqrt growth (additional tasks grow slower than task count). + * 4. Self-damping (headroomRatio reduces jumps near max capacity). */ - static int computeExtraMaxPartitionsPerTaskIncrease( + static int computeScaleUpBoost( double aggregateLag, int partitionCount, int currentTaskCount, - int taskCountMax + int taskCountMax, + int highLagThreshold ) { if (partitionCount <= 0 || taskCountMax <= 0) { @@ -329,17 +339,24 @@ static int computeExtraMaxPartitionsPerTaskIncrease( } final double lagPerPartition = aggregateLag / partitionCount; - if (lagPerPartition < EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD) { + if (lagPerPartition < highLagThreshold) { return 0; } - int rawExtra = BASE_RAW_EXTRA; - if (lagPerPartition > AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD) { - rawExtra += (int) ((lagPerPartition - AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD) / LAG_STEP); - } - + final double lagSeverity = lagPerPartition / highLagThreshold; + final double lagFactor = lagSeverity / (lagSeverity + 1.0); final double headroomRatio = Math.max(0.0, 1.0 - (double) currentTaskCount / taskCountMax); - return (int) (rawExtra * headroomRatio); + + // Compute target additional tasks (sqrt-based growth with cap) + final double rawAdditional = 1 + Math.sqrt(currentTaskCount) * SQRT_TASK_COUNT_SCALE_FACTOR; + final double cappedAdditional = Math.min(rawAdditional, MAX_JUMP); + final int additionalTasks = (int) (cappedAdditional * lagFactor * headroomRatio); + + final int targetMax = Math.min(taskCountMax, currentTaskCount + additionalTasks); + final int targetMinPPT = Math.max(1, (partitionCount + targetMax - 1) / targetMax); + final int currentPPT = partitionCount / currentTaskCount; + + return Math.max(0, currentPPT - targetMinPPT - MAX_INCREASE_IN_PARTITIONS_PER_TASK); } /** @@ -464,4 +481,23 @@ private CostMetrics collectMetrics() ); } + /** + * 
Determines if a scale action is currently allowed based on the elapsed time + * since the last scale action and the configured minimum scale-down delay. + */ + private boolean isScaleActionAllowed() + { + if (lastScaleActionTimeMillis < 0) { + return true; + } + + final long barrierMillis = config.getMinScaleDownDelay().getMillis(); + if (barrierMillis <= 0) { + return true; + } + + final long elapsedMillis = DateTimes.nowUtc().getMillis() - lastScaleActionTimeMillis; + return elapsedMillis >= barrierMillis; + } + } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java index d835fea51578..eeaf46614e70 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java @@ -29,6 +29,7 @@ import org.apache.druid.indexing.overlord.supervisor.autoscaler.SupervisorTaskAutoScaler; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor; import org.apache.druid.java.util.emitter.service.ServiceEmitter; +import org.joda.time.Duration; import javax.annotation.Nullable; import java.util.Objects; @@ -48,12 +49,12 @@ public class CostBasedAutoScalerConfig implements AutoScalerConfig static final double DEFAULT_LAG_WEIGHT = 0.25; static final double DEFAULT_IDLE_WEIGHT = 0.75; static final double DEFAULT_PROCESSING_RATE = 1000.0; // 1000 records/sec per task as default - static final int DEFAULT_SCALE_DOWN_BARRIER = 5; // We delay scale down by 5 * DEFAULT_SCALE_ACTION_PERIOD_MILLIS + static final Duration DEFAULT_MIN_SCALE_DELAY = Duration.millis(DEFAULT_SCALE_ACTION_PERIOD_MILLIS * 3); private final boolean enableTaskAutoScaler; private final 
int taskCountMax; private final int taskCountMin; - private Integer taskCountStart; + private final Integer taskCountStart; private final long minTriggerScaleActionFrequencyMillis; private final Double stopTaskCountRatio; private final long scaleActionPeriodMillis; @@ -62,12 +63,15 @@ public class CostBasedAutoScalerConfig implements AutoScalerConfig private final double idleWeight; private final double defaultProcessingRate; /** - * Represents the threshold value used to prevent the auto-scaler from scaling down tasks immediately, - * when the computed cost-based metrics fall below this barrier. - * A higher value implies a more conservative scaling down behavior, ensuring that tasks - * are not prematurely terminated in scenarios of potential workload spikes or insufficient cost savings. + * Per-partition lag threshold allowing to activate a burst scaleup to eliminate high lag. */ - private final int scaleDownBarrier; + private final int highLagThreshold; + /** + * Represents the minimum duration between successful scale actions. + * A higher value implies a more conservative scaling behavior, ensuring that tasks + * are not scaled too frequently during workload fluctuations. + */ + private final Duration minScaleDownDelay; /** * Indicates whether task scaling down is limited to periods during task rollovers only. * If set to {@code false}, allows scaling down during normal task run time. 
@@ -86,7 +90,8 @@ public CostBasedAutoScalerConfig( @Nullable @JsonProperty("lagWeight") Double lagWeight, @Nullable @JsonProperty("idleWeight") Double idleWeight, @Nullable @JsonProperty("defaultProcessingRate") Double defaultProcessingRate, - @Nullable @JsonProperty("scaleDownBarrier") Integer scaleDownBarrier, + @Nullable @JsonProperty("highLagThreshold") Integer highLagThreshold, + @Nullable @JsonProperty("minScaleDownDelay") Duration minScaleDownDelay, @Nullable @JsonProperty("scaleDownDuringTaskRolloverOnly") Boolean scaleDownDuringTaskRolloverOnly ) { @@ -105,7 +110,11 @@ public CostBasedAutoScalerConfig( this.lagWeight = Configs.valueOrDefault(lagWeight, DEFAULT_LAG_WEIGHT); this.idleWeight = Configs.valueOrDefault(idleWeight, DEFAULT_IDLE_WEIGHT); this.defaultProcessingRate = Configs.valueOrDefault(defaultProcessingRate, DEFAULT_PROCESSING_RATE); - this.scaleDownBarrier = Configs.valueOrDefault(scaleDownBarrier, DEFAULT_SCALE_DOWN_BARRIER); + this.highLagThreshold = Configs.valueOrDefault( + highLagThreshold, + CostBasedAutoScaler.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD + ); + this.minScaleDownDelay = Configs.valueOrDefault(minScaleDownDelay, DEFAULT_MIN_SCALE_DELAY); this.scaleDownDuringTaskRolloverOnly = Configs.valueOrDefault(scaleDownDuringTaskRolloverOnly, false); if (this.enableTaskAutoScaler) { @@ -133,7 +142,7 @@ public CostBasedAutoScalerConfig( Preconditions.checkArgument(this.lagWeight >= 0, "lagWeight must be >= 0"); Preconditions.checkArgument(this.idleWeight >= 0, "idleWeight must be >= 0"); Preconditions.checkArgument(this.defaultProcessingRate > 0, "defaultProcessingRate must be > 0"); - Preconditions.checkArgument(this.scaleDownBarrier >= 0, "scaleDownBarrier must be >= 0"); + Preconditions.checkArgument(this.minScaleDownDelay.getMillis() >= 0, "minScaleDownDelay must be >= 0"); } /** @@ -213,9 +222,9 @@ public double getDefaultProcessingRate() } @JsonProperty - public int getScaleDownBarrier() + public Duration getMinScaleDownDelay() { 
- return scaleDownBarrier; + return minScaleDownDelay; } @JsonProperty("scaleDownDuringTaskRolloverOnly") @@ -224,6 +233,12 @@ public boolean isScaleDownOnTaskRolloverOnly() return scaleDownDuringTaskRolloverOnly; } + @JsonProperty("highLagThreshold") + public int getHighLagThreshold() + { + return highLagThreshold; + } + @Override public SupervisorTaskAutoScaler createAutoScaler(Supervisor supervisor, SupervisorSpec spec, ServiceEmitter emitter) { @@ -250,7 +265,7 @@ public boolean equals(Object o) && Double.compare(that.lagWeight, lagWeight) == 0 && Double.compare(that.idleWeight, idleWeight) == 0 && Double.compare(that.defaultProcessingRate, defaultProcessingRate) == 0 - && scaleDownBarrier == that.scaleDownBarrier + && Objects.equals(minScaleDownDelay, that.minScaleDownDelay) && scaleDownDuringTaskRolloverOnly == that.scaleDownDuringTaskRolloverOnly && Objects.equals(taskCountStart, that.taskCountStart) && Objects.equals(stopTaskCountRatio, that.stopTaskCountRatio); @@ -270,7 +285,7 @@ public int hashCode() lagWeight, idleWeight, defaultProcessingRate, - scaleDownBarrier, + minScaleDownDelay, scaleDownDuringTaskRolloverOnly ); } @@ -289,7 +304,8 @@ public String toString() ", lagWeight=" + lagWeight + ", idleWeight=" + idleWeight + ", defaultProcessingRate=" + defaultProcessingRate + - ", scaleDownBarrier=" + scaleDownBarrier + + ", highLagThreshold=" + highLagThreshold + + ", scaleDownBarrier=" + minScaleDownDelay + ", scaleDownDuringTaskRolloverOnly=" + scaleDownDuringTaskRolloverOnly + '}'; } @@ -310,8 +326,9 @@ public static class Builder private Double lagWeight; private Double idleWeight; private Double defaultProcessingRate; - private Integer scaleDownBarrier; + private Duration scaleDownBarrier; private Boolean scaleDownDuringTaskRolloverOnly; + private Integer highLagThreshold; private Builder() { @@ -377,7 +394,7 @@ public Builder defaultProcessingRate(double defaultProcessingRate) return this; } - public Builder scaleDownBarrier(int scaleDownBarrier) 
+ public Builder scaleDownBarrier(Duration scaleDownBarrier) { this.scaleDownBarrier = scaleDownBarrier; return this; @@ -389,6 +406,17 @@ public Builder scaleDownDuringTaskRolloverOnly(boolean scaleDownDuringTaskRollov return this; } + public Builder highLagThreshold(int highLagThreshold) + { + this.highLagThreshold = highLagThreshold; + return this; + } + + public Builder aggressiveScalingLagPerPartitionThreshold(int aggressiveScalingLagPerPartitionThreshold) + { + return this; + } + public CostBasedAutoScalerConfig build() { return new CostBasedAutoScalerConfig( @@ -402,6 +430,7 @@ public CostBasedAutoScalerConfig build() lagWeight, idleWeight, defaultProcessingRate, + highLagThreshold, scaleDownBarrier, scaleDownDuringTaskRolloverOnly ); diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunction.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunction.java index 8a3759556910..9922fd97088b 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunction.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunction.java @@ -37,17 +37,11 @@ public class WeightedCostFunction * cost-based auto-scaling decisions. */ private static final double LAG_AMPLIFICATION_MAX_MULTIPLIER = 2.0; - private static final long LAG_AMPLIFICATION_MAX_LAG_PER_PARTITION = 500_000L; /** - * It is used to calculate the denominator for the ramp formula in the cost - * computation logic. This value represents the difference between the maximum lag per - * partition (LAG_AMPLIFICATION_MAX_LAG_PER_PARTITION) and the extra scaling activation - * lag threshold (CostBasedAutoScaler.EXTRA_SCALING_ACTIVATION_LAG_THRESHOLD). - *

- * It is impacting how the cost model evaluates scaling decisions during high-lag sceario. + * Multiplier for computing the maximum lag per partition used in lag amplification. + * The max lag is calculated as: aggressiveScalingLagPerPartitionThreshold * LAG_AMPLIFICATION_MAX_LAG_MULTIPLIER. */ - private static final double RAMP_DENOMINATOR = - LAG_AMPLIFICATION_MAX_LAG_PER_PARTITION - (double) CostBasedAutoScaler.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD; + private static final int LAG_AMPLIFICATION_MAX_LAG_MULTIPLIER = 5; /** * Computes cost for a given task count using compute time metrics. @@ -86,7 +80,7 @@ public CostResult computeCost(CostMetrics metrics, int proposedTaskCount, CostBa lagRecoveryTime = metrics.getAggregateLag() / (proposedTaskCount * avgProcessingRate); } - final double predictedIdleRatio = estimateIdleRatio(metrics, proposedTaskCount); + final double predictedIdleRatio = estimateIdleRatio(metrics, proposedTaskCount, config); final double idleCost = proposedTaskCount * metrics.getTaskDurationSeconds() * predictedIdleRatio; final double lagCost = config.getLagWeight() * lagRecoveryTime; final double weightedIdleCost = config.getIdleWeight() * idleCost; @@ -117,9 +111,11 @@ public CostResult computeCost(CostMetrics metrics, int proposedTaskCount, CostBa * * @param metrics current system metrics containing idle ratio and task count * @param taskCount target task count to estimate an idle ratio for + * @param config auto-scaler configuration containing threshold values * @return estimated idle ratio in range [0.0, 1.0] */ - private double estimateIdleRatio(CostMetrics metrics, int taskCount) + @SuppressWarnings("ExtractMethodRecommender") + private double estimateIdleRatio(CostMetrics metrics, int taskCount, CostBasedAutoScalerConfig config) { final double currentPollIdleRatio = metrics.getPollIdleRatio(); @@ -138,24 +134,20 @@ private double estimateIdleRatio(CostMetrics metrics, int taskCount) final double taskRatio = (double) taskCount / 
currentTaskCount; final double linearPrediction = Math.max(0.0, Math.min(1.0, 1.0 - busyFraction / taskRatio)); - // Lag-based adjustment: more work per task → less idle - final double lagPerTask = metrics.getAggregateLag() / taskCount; - double lagBusyFactor = 1.0 - Math.exp(-lagPerTask / CostBasedAutoScaler.AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD); - final int partitionCount = metrics.getPartitionCount(); - - if (partitionCount > 0) { - final double lagPerPartition = metrics.getAggregateLag() / partitionCount; - // Lag-amplified idle decay - if (lagPerPartition >= CostBasedAutoScaler.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD) { - double ramp = Math.max(0.0, - (lagPerPartition - CostBasedAutoScaler.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD) - / RAMP_DENOMINATOR - ); - ramp = Math.min(1.0, ramp); - - final double multiplier = 1.0 + ramp * (LAG_AMPLIFICATION_MAX_MULTIPLIER - 1.0); - lagBusyFactor = Math.min(1.0, lagBusyFactor * multiplier); - } + final double lagPerPartition = metrics.getAggregateLag() / metrics.getPartitionCount(); + double lagBusyFactor = 0.; + + // Lag-amplified idle decay + final int extraThreshold = config.getHighLagThreshold(); + if (lagPerPartition >= extraThreshold) { + final double lagPerTask = metrics.getAggregateLag() / taskCount; + lagBusyFactor = 1.0 - Math.exp(-lagPerTask / extraThreshold); + + final long lagAmplificationMaxLagPerPartition = (long) extraThreshold * LAG_AMPLIFICATION_MAX_LAG_MULTIPLIER; + final double rampDenominator = lagAmplificationMaxLagPerPartition - (double) extraThreshold; + final double ramp = Math.min(1.0, Math.max(0.0, (lagPerPartition - extraThreshold) / rampDenominator)); + + lagBusyFactor = Math.min(1.0, lagBusyFactor * (1.0 + ramp * (LAG_AMPLIFICATION_MAX_MULTIPLIER - 1.0))); } // Clamp to valid range [0, 1] diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfigTest.java 
b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfigTest.java index 364e0d75b808..707beb87e904 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfigTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfigTest.java @@ -21,15 +21,17 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.druid.jackson.DefaultObjectMapper; +import org.joda.time.Duration; import org.junit.Assert; import org.junit.Test; +import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScaler.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD; import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScalerConfig.DEFAULT_IDLE_WEIGHT; import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScalerConfig.DEFAULT_LAG_WEIGHT; +import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScalerConfig.DEFAULT_MIN_SCALE_DELAY; import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScalerConfig.DEFAULT_MIN_TRIGGER_SCALE_ACTION_FREQUENCY_MILLIS; import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScalerConfig.DEFAULT_PROCESSING_RATE; import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScalerConfig.DEFAULT_SCALE_ACTION_PERIOD_MILLIS; -import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScalerConfig.DEFAULT_SCALE_DOWN_BARRIER; public class CostBasedAutoScalerConfigTest { @@ -50,7 +52,8 @@ public void testSerdeWithAllProperties() throws Exception + " \"lagWeight\": 0.6,\n" + " \"idleWeight\": 0.4,\n" + " \"defaultProcessingRate\": 2000.0,\n" - + " \"scaleDownBarrier\": 10,\n" + + " \"highLagThreshold\": 
30000,\n" + + " \"minScaleDownDelay\": \"PT10M\",\n" + " \"scaleDownDuringTaskRolloverOnly\": true\n" + "}"; @@ -66,8 +69,9 @@ public void testSerdeWithAllProperties() throws Exception Assert.assertEquals(0.6, config.getLagWeight(), 0.001); Assert.assertEquals(0.4, config.getIdleWeight(), 0.001); Assert.assertEquals(2000.0, config.getDefaultProcessingRate(), 0.001); - Assert.assertEquals(10, config.getScaleDownBarrier()); + Assert.assertEquals(Duration.standardMinutes(10), config.getMinScaleDownDelay()); Assert.assertTrue(config.isScaleDownOnTaskRolloverOnly()); + Assert.assertEquals(30000, config.getHighLagThreshold()); // Test serialization back to JSON String serialized = mapper.writeValueAsString(config); @@ -98,10 +102,11 @@ public void testSerdeWithDefaults() throws Exception Assert.assertEquals(DEFAULT_LAG_WEIGHT, config.getLagWeight(), 0.001); Assert.assertEquals(DEFAULT_IDLE_WEIGHT, config.getIdleWeight(), 0.001); Assert.assertEquals(DEFAULT_PROCESSING_RATE, config.getDefaultProcessingRate(), 0.001); - Assert.assertEquals(DEFAULT_SCALE_DOWN_BARRIER, config.getScaleDownBarrier()); + Assert.assertEquals(DEFAULT_MIN_SCALE_DELAY, config.getMinScaleDownDelay()); Assert.assertFalse(config.isScaleDownOnTaskRolloverOnly()); Assert.assertNull(config.getTaskCountStart()); Assert.assertNull(config.getStopTaskCountRatio()); + Assert.assertEquals(EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD, config.getHighLagThreshold()); } @Test @@ -184,8 +189,10 @@ public void testBuilder() .lagWeight(0.6) .idleWeight(0.4) .defaultProcessingRate(2000.0) - .scaleDownBarrier(10) + .scaleDownBarrier(Duration.standardMinutes(10)) .scaleDownDuringTaskRolloverOnly(true) + .highLagThreshold(30000) + .aggressiveScalingLagPerPartitionThreshold(60000) .build(); Assert.assertTrue(config.getEnableTaskAutoScaler()); @@ -198,7 +205,8 @@ public void testBuilder() Assert.assertEquals(0.6, config.getLagWeight(), 0.001); Assert.assertEquals(0.4, config.getIdleWeight(), 0.001); Assert.assertEquals(2000.0, 
config.getDefaultProcessingRate(), 0.001); - Assert.assertEquals(10, config.getScaleDownBarrier()); + Assert.assertEquals(Duration.standardMinutes(10), config.getMinScaleDownDelay()); Assert.assertTrue(config.isScaleDownOnTaskRolloverOnly()); + Assert.assertEquals(30000, config.getHighLagThreshold()); } } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerMockTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerMockTest.java index becfd4964b4a..347908af1023 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerMockTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerMockTest.java @@ -77,12 +77,12 @@ public void setUp() @Test public void testScaleUpWhenOptimalGreaterThanCurrent() { - // Use config with barrier=2 to test counter reset behavior + // Use config with a long barrier to test cooldown behavior CostBasedAutoScalerConfig barrierConfig = CostBasedAutoScalerConfig.builder() .taskCountMax(100) .taskCountMin(1) .enableTaskAutoScaler(true) - .scaleDownBarrier(2) + .scaleDownBarrier(Duration.standardHours(1)) .build(); CostBasedAutoScaler autoScaler = spy(new CostBasedAutoScaler( @@ -94,14 +94,7 @@ public void testScaleUpWhenOptimalGreaterThanCurrent() int currentTaskCount = 10; int scaleUpOptimal = 17; - int scaleDownOptimal = 5; - - // First, increment the scaleDownCounter by making a scale-down attempt - doReturn(scaleDownOptimal).when(autoScaler).computeOptimalTaskCount(any()); - setupMocksForMetricsCollection(currentTaskCount, 10.0, 0.9); - Assert.assertEquals("Scale-down blocked (counter=1)", -1, autoScaler.computeTaskCountForScaleAction()); - - // Now trigger scale-up, which should reset the counter + // Trigger scale-up, which should set the cooldown timer 
doReturn(scaleUpOptimal).when(autoScaler).computeOptimalTaskCount(any()); setupMocksForMetricsCollection(currentTaskCount, 5000.0, 0.1); @@ -111,11 +104,11 @@ public void testScaleUpWhenOptimalGreaterThanCurrent() autoScaler.computeTaskCountForScaleAction() ); - // Verify counter was reset: scale-down should be blocked again (counter starts from 0) - doReturn(scaleDownOptimal).when(autoScaler).computeOptimalTaskCount(any()); + // Verify cooldown blocks immediate subsequent scaling + doReturn(scaleUpOptimal).when(autoScaler).computeOptimalTaskCount(any()); setupMocksForMetricsCollection(currentTaskCount, 10.0, 0.9); Assert.assertEquals( - "Scale-down should be blocked after scale-up reset the counter", + "Scale action should be blocked during the cooldown window", -1, autoScaler.computeTaskCountForScaleAction() ); @@ -140,12 +133,12 @@ public void testNoOpWhenOptimalEqualsCurrent() @Test public void testScaleDownBlockedReturnsMinusOne() { - // Use config with barrier=2 to test counter behavior + // Use config with a long barrier to test cooldown behavior CostBasedAutoScalerConfig barrierConfig = CostBasedAutoScalerConfig.builder() .taskCountMax(100) .taskCountMin(1) .enableTaskAutoScaler(true) - .scaleDownBarrier(2) + .scaleDownBarrier(Duration.standardHours(1)) .build(); CostBasedAutoScaler autoScaler = spy(new CostBasedAutoScaler( @@ -161,23 +154,16 @@ public void testScaleDownBlockedReturnsMinusOne() doReturn(optimalCount).when(autoScaler).computeOptimalTaskCount(any()); setupMocksForMetricsCollection(currentTaskCount, 10.0, 0.9); - // First attempt: counter=1, blocked + // First attempt: allowed (no prior scale action) Assert.assertEquals( - "Should return -1 when optimal is less than current (scale-down blocked, counter=1)", - -1, - autoScaler.computeTaskCountForScaleAction() - ); - - // Second attempt: counter=2, succeeds (barrier reached) - Assert.assertEquals( - "Scale-down should succeed when barrier reached", + "Scale-down should succeed when no prior 
scale action exists", optimalCount, autoScaler.computeTaskCountForScaleAction() ); - // Verify counter was reset: next scale-down should be blocked again + // Second attempt: blocked by cooldown Assert.assertEquals( - "Scale-down should be blocked after successful scale-down reset the counter", + "Scale-down should be blocked during the cooldown window", -1, autoScaler.computeTaskCountForScaleAction() ); @@ -300,8 +286,8 @@ public void testBoundaryConditionOptimalEqualsCurrentMinusOne() int result = autoScaler.computeTaskCountForScaleAction(); Assert.assertEquals( - "Should block scale-down even by one task", - -1, + "Should allow scale-down by one task when cooldown has elapsed", + optimalCount, result ); } @@ -314,7 +300,7 @@ public void testScaleDownBlockedWhenScaleDownOnRolloverOnlyEnabled() .taskCountMin(1) .enableTaskAutoScaler(true) .scaleDownDuringTaskRolloverOnly(true) - .scaleDownBarrier(1) // Set the barrier to 1 so it would trigger immediately if not blocked + .scaleDownBarrier(Duration.ZERO) .build(); CostBasedAutoScaler autoScaler = spy(new CostBasedAutoScaler( @@ -345,6 +331,7 @@ public void testScaleDownAllowedDuringRolloverWhenScaleDownOnRolloverOnlyEnabled .taskCountMin(1) .enableTaskAutoScaler(true) .scaleDownDuringTaskRolloverOnly(true) + .scaleDownBarrier(Duration.ZERO) .build(); CostBasedAutoScaler autoScaler = spy(new CostBasedAutoScaler( @@ -357,11 +344,12 @@ public void testScaleDownAllowedDuringRolloverWhenScaleDownOnRolloverOnlyEnabled int currentTaskCount = 50; int optimalCount = 30; - // Set up lastKnownMetrics by calling computeTaskCountForScaleAction first - doReturn(optimalCount).when(autoScaler).computeOptimalTaskCount(any()); + // Set up lastKnownMetrics by calling computeTaskCountForScaleAction first without scaling + doReturn(currentTaskCount).when(autoScaler).computeOptimalTaskCount(any()); setupMocksForMetricsCollection(currentTaskCount, 10.0, 0.9); autoScaler.computeTaskCountForScaleAction(); // This populates lastKnownMetrics 
+ doReturn(optimalCount).when(autoScaler).computeOptimalTaskCount(any()); Assert.assertEquals( "Should scale-down during rollover when scaleDownDuringTaskRolloverOnly is true", optimalCount, diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java index 5deae662d755..7b445da50d59 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java @@ -25,6 +25,7 @@ import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisorIOConfig; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.segment.incremental.RowIngestionMeters; +import org.joda.time.Duration; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -38,9 +39,8 @@ import static org.apache.druid.indexing.common.stats.DropwizardRowIngestionMeters.FIFTEEN_MINUTE_NAME; import static org.apache.druid.indexing.common.stats.DropwizardRowIngestionMeters.FIVE_MINUTE_NAME; import static org.apache.druid.indexing.common.stats.DropwizardRowIngestionMeters.ONE_MINUTE_NAME; -import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScaler.AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD; import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScaler.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD; -import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScaler.computeExtraMaxPartitionsPerTaskIncrease; +import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScaler.computeScaleUpBoost; import static 
org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScaler.computeValidTaskCounts; import static org.mockito.Mockito.when; @@ -77,49 +77,52 @@ public void setUp() public void testComputeValidTaskCounts() { // For 100 partitions at 25 tasks (4 partitions/task), valid counts include 25 and 34 - int[] validTaskCounts = computeValidTaskCounts(100, 25, 0L, 1, 100); + int[] validTaskCounts = computeValidTaskCounts(100, 25, 0L, 1, 100, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); Assert.assertTrue("Should contain the current task count", contains(validTaskCounts, 25)); Assert.assertTrue("Should contain the next scale-up option", contains(validTaskCounts, 34)); // Edge cases - Assert.assertEquals(0, computeValidTaskCounts(0, 10, 0L, 1, 100).length); - Assert.assertEquals(0, computeValidTaskCounts(-5, 10, 0L, 1, 100).length); + Assert.assertEquals(0, computeValidTaskCounts(0, 10, 0L, 1, 100, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD).length); + Assert.assertEquals(0, computeValidTaskCounts(-5, 10, 0L, 1, 100, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD).length); // Single partition - int[] singlePartition = computeValidTaskCounts(1, 1, 0L, 1, 100); + int[] singlePartition = computeValidTaskCounts(1, 1, 0L, 1, 100, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); Assert.assertTrue("Single partition should have at least one valid count", singlePartition.length > 0); Assert.assertTrue("Single partition should contain 1", contains(singlePartition, 1)); // Current exceeds partitions - should still yield valid, deduplicated options - int[] exceedsPartitions = computeValidTaskCounts(2, 5, 0L, 1, 100); + int[] exceedsPartitions = computeValidTaskCounts(2, 5, 0L, 1, 100, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); Assert.assertEquals(2, exceedsPartitions.length); Assert.assertTrue(contains(exceedsPartitions, 1)); Assert.assertTrue(contains(exceedsPartitions, 2)); // Lag expansion: low lag should not include max, high lag should - int[] lowLagCounts = 
computeValidTaskCounts(30, 3, 0L, 1, 30); + int[] lowLagCounts = computeValidTaskCounts(30, 3, 0L, 1, 30, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); Assert.assertFalse("Low lag should not include max task count", contains(lowLagCounts, 30)); Assert.assertTrue("Low lag should cap scale up around 4 tasks", contains(lowLagCounts, 4)); long highAggregateLag = 30L * 500_000L; - int[] highLagCounts = computeValidTaskCounts(30, 3, highAggregateLag, 1, 30); - Assert.assertTrue("High lag should allow scaling to max tasks", contains(highLagCounts, 30)); + int[] highLagCounts = computeValidTaskCounts(30, 3, highAggregateLag, 1, 30, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); + // With capped scaling (1->10 tasks jump max), we won't reach 30 immediately from 3. + // We expect it to reach 10. + Assert.assertTrue("High lag should allow scaling to 10 tasks", contains(highLagCounts, 10)); + Assert.assertFalse("Should not jump straight to 30 from 3", contains(highLagCounts, 30)); // Respects taskCountMax - int[] cappedCounts = computeValidTaskCounts(30, 4, highAggregateLag, 1, 3); + int[] cappedCounts = computeValidTaskCounts(30, 4, highAggregateLag, 1, 3, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); Assert.assertTrue("Should include taskCountMax when doable", contains(cappedCounts, 3)); Assert.assertFalse("Should not exceed taskCountMax", contains(cappedCounts, 4)); // Respects taskCountMin - filters out values below the minimum // With partitionCount=100, currentTaskCount=10, the computed range includes values like 8, 9, 10, 12, 13 - int[] minCappedCounts = computeValidTaskCounts(100, 10, 0L, 10, 100); + int[] minCappedCounts = computeValidTaskCounts(100, 10, 0L, 10, 100, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); Assert.assertFalse("Should not go below taskCountMin", contains(minCappedCounts, 8)); Assert.assertFalse("Should not go below taskCountMin", contains(minCappedCounts, 9)); Assert.assertTrue("Should include values at taskCountMin", contains(minCappedCounts, 10)); 
Assert.assertTrue("Should include values above taskCountMin", contains(minCappedCounts, 12)); // Both bounds applied together - int[] bothBounds = computeValidTaskCounts(100, 10, 0L, 10, 12); + int[] bothBounds = computeValidTaskCounts(100, 10, 0L, 10, 12, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); Assert.assertFalse("Should not go below taskCountMin", contains(bothBounds, 8)); Assert.assertFalse("Should not go below taskCountMin", contains(bothBounds, 9)); Assert.assertFalse("Should not exceed taskCountMax", contains(bothBounds, 13)); @@ -174,20 +177,21 @@ class Example } } + // Updated expectations based on capped sqrt-based scaling Example[] examples = new Example[]{ - new Example(3, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD, 8), - new Example(3, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 3, 15), - new Example(3, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 5, 30), - new Example(10, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD - 1, 15), - new Example(10, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 3, 30), - new Example(10, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 10, 30), - new Example(20, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 10, 30), - new Example(25, AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 10, 30) + new Example(3, 50_000L, 6), + new Example(3, 300_000L, 10), + new Example(3, 500_000L, 10), + new Example(10, 100_000L, 30), + new Example(10, 300_000L, 30), + new Example(10, 500_000L, 30), + new Example(20, 500_000L, 30), + new Example(25, 500_000L, 30) }; for (Example example : examples) { long aggregateLag = example.lagPerPartition * partitionCount; - int[] validCounts = computeValidTaskCounts(partitionCount, example.currentTasks, aggregateLag, 1, taskCountMax); + int[] validCounts = computeValidTaskCounts(partitionCount, example.currentTasks, aggregateLag, 1, taskCountMax, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); Assert.assertTrue( "Should include expected task count for current=" + example.currentTasks + ", 
lag=" + example.lagPerPartition, contains(validCounts, example.expectedTasks) @@ -215,16 +219,25 @@ class Example } @Test - public void testComputeExtraPPTIncrease() + public void testComputeScaleUpBoost() { // No extra increase below the threshold - Assert.assertEquals(0, computeExtraMaxPartitionsPerTaskIncrease(30L * EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD - 1, 30, 3, 30)); - Assert.assertEquals(4, computeExtraMaxPartitionsPerTaskIncrease(30L * EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD, 30, 3, 30)); + Assert.assertEquals(0, computeScaleUpBoost(30L * 49_000L, 30, 3, 30, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD)); - // More aggressive increase when the lag is high - Assert.assertEquals(8, computeExtraMaxPartitionsPerTaskIncrease(30L * AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 5, 30, 3, 30)); - // Zero when on max task count - Assert.assertEquals(0, computeExtraMaxPartitionsPerTaskIncrease(30L * AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD * 10, 30, 30, 30)); + // Test exact values based on the formula: + // boost = max(0, currentPPT - targetMinPPT - 2) + + // Case 1: 3 tasks, 50k lag -> target 7 tasks -> targetMinPPT 5 -> currentPPT 10 -> boost 10-5-2 = 3 + Assert.assertEquals(3, computeScaleUpBoost(30L * 50_000L, 30, 3, 30, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD)); + + // Case 2: 3 tasks, 300k lag -> target 10 tasks -> targetMinPPT 3 -> currentPPT 10 -> boost 10-3-2 = 5 + Assert.assertEquals(5, computeScaleUpBoost(30L * 300_000L, 30, 3, 30, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD)); + + // Case 3: 3 tasks, 500k lag -> target 10 tasks (capped) -> targetMinPPT 3 -> boost 5 + Assert.assertEquals(5, computeScaleUpBoost(30L * 500_000L, 30, 3, 30, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD)); + + // Case 4: Zero when on max task count + Assert.assertEquals(0, computeScaleUpBoost(30L * 500_000L, 30, 30, 30, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD)); } @Test @@ -383,7 +396,10 @@ public void testComputeTaskCountForRolloverAndConfigProperties() 
.taskCountMin(1) .enableTaskAutoScaler(true) .build(); - Assert.assertEquals(5, cfgWithDefaults.getScaleDownBarrier()); + Assert.assertEquals( + CostBasedAutoScalerConfig.DEFAULT_MIN_SCALE_DELAY, + cfgWithDefaults.getMinScaleDownDelay() + ); Assert.assertEquals(1000.0, cfgWithDefaults.getDefaultProcessingRate(), 0.001); Assert.assertFalse(cfgWithDefaults.isScaleDownOnTaskRolloverOnly()); @@ -392,11 +408,11 @@ public void testComputeTaskCountForRolloverAndConfigProperties() .taskCountMax(10) .taskCountMin(1) .enableTaskAutoScaler(true) - .scaleDownBarrier(10) + .scaleDownBarrier(Duration.standardMinutes(10)) .defaultProcessingRate(5000.0) .scaleDownDuringTaskRolloverOnly(true) .build(); - Assert.assertEquals(10, cfgWithCustom.getScaleDownBarrier()); + Assert.assertEquals(Duration.standardMinutes(10), cfgWithCustom.getMinScaleDownDelay()); Assert.assertEquals(5000.0, cfgWithCustom.getDefaultProcessingRate(), 0.001); Assert.assertTrue(cfgWithCustom.isScaleDownOnTaskRolloverOnly()); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java index 4f5832915f68..4c070f4b0165 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java @@ -321,6 +321,88 @@ public void testLagAmplificationReducesIdleUnderHighLag() ); } + @Test + public void testCustomLagThresholdsAffectCostCalculation() + { + // Test that custom threshold values change behavior compared to defaults + int currentTaskCount = 3; + int proposedTaskCount = 8; + int partitionCount = 30; + double pollIdleRatio = 0.1; + + // Use high lag that exceeds both default and custom thresholds + // Default thresholds: 
extra=25000, aggressive=50000 + // Custom thresholds: extra=10000, aggressive=20000 (more sensitive) + CostMetrics metrics = createMetrics(15_000.0, currentTaskCount, partitionCount, pollIdleRatio); + + CostBasedAutoScalerConfig defaultConfig = CostBasedAutoScalerConfig.builder() + .taskCountMax(100) + .taskCountMin(1) + .enableTaskAutoScaler(true) + .defaultProcessingRate(1000.0) + .build(); + + CostBasedAutoScalerConfig sensitiveConfig = CostBasedAutoScalerConfig.builder() + .taskCountMax(100) + .taskCountMin(1) + .enableTaskAutoScaler(true) + .defaultProcessingRate(1000.0) + .highLagThreshold(10000) + .aggressiveScalingLagPerPartitionThreshold(20000) + .build(); + + double defaultCost = costFunction.computeCost(metrics, proposedTaskCount, defaultConfig).totalCost(); + double sensitiveCost = costFunction.computeCost(metrics, proposedTaskCount, sensitiveConfig).totalCost(); + + // With lower thresholds, the same lag triggers more aggressive scaling behavior + // (higher lagBusyFactor), which results in lower predicted idle and thus lower idle cost + Assert.assertTrue( + "More sensitive thresholds should result in different (lower) cost", + sensitiveCost < defaultCost + ); + } + + @Test + public void testRampDenominatorCalculation() + { + // Test that ramp denominator is calculated correctly from config values + // by verifying behavior at boundary conditions + int currentTaskCount = 3; + int proposedTaskCount = 8; + int partitionCount = 30; + double pollIdleRatio = 0.1; + + // Custom config with specific thresholds for predictable ramp calculation + // extra=10000, aggressive=20000 + // lagAmplificationMaxLagPerPartition = 20000 * 5 = 100000 + // rampDenominator = 100000 - 10000 = 90000 + CostBasedAutoScalerConfig customConfig = CostBasedAutoScalerConfig.builder() + .taskCountMax(100) + .taskCountMin(1) + .enableTaskAutoScaler(true) + .defaultProcessingRate(1000.0) + .highLagThreshold(10000) + .aggressiveScalingLagPerPartitionThreshold(20000) + .build(); + + // 
Lag exactly at extraThreshold (lagPerPartition = 10000) + // ramp = (10000 - 10000) / 90000 = 0 + CostMetrics atExtraThreshold = createMetrics(10_000.0, currentTaskCount, partitionCount, pollIdleRatio); + + // Lag at maximum (lagPerPartition = 100000) + // ramp = (100000 - 10000) / 90000 = 1.0 + CostMetrics atMaxLag = createMetrics(100_000.0, currentTaskCount, partitionCount, pollIdleRatio); + + double costAtExtra = costFunction.computeCost(atExtraThreshold, proposedTaskCount, customConfig).totalCost(); + double costAtMax = costFunction.computeCost(atMaxLag, proposedTaskCount, customConfig).totalCost(); + + // At max lag, ramp=1.0 leads to maximum amplification, reducing idle cost more + Assert.assertTrue( + "Cost at max lag should be lower due to maximum lag amplification", + costAtMax < costAtExtra + ); + } + private CostMetrics createMetrics( double avgPartitionLag, int currentTaskCount, From 0948f945409c5de569514944b2fccc99621a238e Mon Sep 17 00:00:00 2001 From: Sasha Syrotenko Date: Tue, 3 Feb 2026 21:05:28 +0200 Subject: [PATCH 04/14] Checkstyle --- .../supervisor/autoscaler/CostBasedAutoScaler.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java index 8b137e000f9c..844dce86c056 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java @@ -71,9 +71,6 @@ public class CostBasedAutoScaler implements SupervisorTaskAutoScaler // Base PPT lag threshold allowing to activate a burst scaleup to eliminate high lag. 
static final int EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD = 50_000; - // Extra PPT lag threshold allowing activation of even more aggressive scaleup to eliminate high lag, - // also enabling lag-amplified idle calculation decay in the cost function (to reduce idle weight). - static final int AGGRESSIVE_SCALING_LAG_PER_PARTITION_THRESHOLD = 100_000; public static final String LAG_COST_METRIC = "task/autoScaler/costBased/lagCost"; public static final String IDLE_COST_METRIC = "task/autoScaler/costBased/idleCost"; From 116e984ec37dec34ef42ca851cd7d580d7a93a46 Mon Sep 17 00:00:00 2001 From: Sasha Syrotenko Date: Tue, 3 Feb 2026 21:07:28 +0200 Subject: [PATCH 05/14] Self-review --- .../CostBasedAutoScalerIntegrationTest.java | 4 ++-- .../autoscaler/CostBasedAutoScalerConfig.java | 17 ++++++----------- .../CostBasedAutoScalerConfigTest.java | 4 ++-- .../autoscaler/CostBasedAutoScalerMockTest.java | 8 ++++---- .../autoscaler/CostBasedAutoScalerTest.java | 4 ++-- .../autoscaler/WeightedCostFunctionTest.java | 4 ++-- 6 files changed, 18 insertions(+), 23 deletions(-) diff --git a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/autoscaler/CostBasedAutoScalerIntegrationTest.java b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/autoscaler/CostBasedAutoScalerIntegrationTest.java index 45873ac8096e..b160448c0263 100644 --- a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/autoscaler/CostBasedAutoScalerIntegrationTest.java +++ b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/autoscaler/CostBasedAutoScalerIntegrationTest.java @@ -134,7 +134,7 @@ public void test_autoScaler_computesOptimalTaskCountAndProduceScaleDown() .lagWeight(0.9) .idleWeight(0.1) .scaleDownDuringTaskRolloverOnly(false) - .scaleDownBarrier(Duration.millis(1500)) + .minScaleDownDelay(Duration.millis(1500)) .build(); final KafkaSupervisorSpec spec = createKafkaSupervisorWithAutoScaler(superId, autoScalerConfig, 
initialTaskCount); @@ -230,7 +230,7 @@ void test_scaleDownDuringTaskRollover() .idleWeight(0.9) .scaleDownDuringTaskRolloverOnly(true) // Do not slow scale-downs - .scaleDownBarrier(Duration.ZERO) + .minScaleDownDelay(Duration.ZERO) .build(); final KafkaSupervisorSpec spec = createKafkaSupervisorWithAutoScaler(superId, autoScalerConfig, initialTaskCount); diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java index eeaf46614e70..852774a2ab8e 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java @@ -305,7 +305,7 @@ public String toString() ", idleWeight=" + idleWeight + ", defaultProcessingRate=" + defaultProcessingRate + ", highLagThreshold=" + highLagThreshold + - ", scaleDownBarrier=" + minScaleDownDelay + + ", minScaleDownDelay=" + minScaleDownDelay + ", scaleDownDuringTaskRolloverOnly=" + scaleDownDuringTaskRolloverOnly + '}'; } @@ -326,9 +326,9 @@ public static class Builder private Double lagWeight; private Double idleWeight; private Double defaultProcessingRate; - private Duration scaleDownBarrier; - private Boolean scaleDownDuringTaskRolloverOnly; private Integer highLagThreshold; + private Duration minScaleDownDelay; + private Boolean scaleDownDuringTaskRolloverOnly; private Builder() { @@ -394,9 +394,9 @@ public Builder defaultProcessingRate(double defaultProcessingRate) return this; } - public Builder scaleDownBarrier(Duration scaleDownBarrier) + public Builder minScaleDownDelay(Duration minScaleDownDelay) { - this.scaleDownBarrier = scaleDownBarrier; + this.minScaleDownDelay = minScaleDownDelay; return this; } @@ -412,11 +412,6 @@ 
public Builder highLagThreshold(int highLagThreshold) return this; } - public Builder aggressiveScalingLagPerPartitionThreshold(int aggressiveScalingLagPerPartitionThreshold) - { - return this; - } - public CostBasedAutoScalerConfig build() { return new CostBasedAutoScalerConfig( @@ -431,7 +426,7 @@ public CostBasedAutoScalerConfig build() idleWeight, defaultProcessingRate, highLagThreshold, - scaleDownBarrier, + minScaleDownDelay, scaleDownDuringTaskRolloverOnly ); } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfigTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfigTest.java index 707beb87e904..e8cb6bebecb5 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfigTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfigTest.java @@ -189,10 +189,10 @@ public void testBuilder() .lagWeight(0.6) .idleWeight(0.4) .defaultProcessingRate(2000.0) - .scaleDownBarrier(Duration.standardMinutes(10)) + .minScaleDownDelay(Duration.standardMinutes(10)) .scaleDownDuringTaskRolloverOnly(true) .highLagThreshold(30000) - .aggressiveScalingLagPerPartitionThreshold(60000) + .aggressiveScalingLagPerPartitionThreshold() .build(); Assert.assertTrue(config.getEnableTaskAutoScaler()); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerMockTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerMockTest.java index 347908af1023..c05b9ccc4cd6 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerMockTest.java +++ 
b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerMockTest.java @@ -82,7 +82,7 @@ public void testScaleUpWhenOptimalGreaterThanCurrent() .taskCountMax(100) .taskCountMin(1) .enableTaskAutoScaler(true) - .scaleDownBarrier(Duration.standardHours(1)) + .minScaleDownDelay(Duration.standardHours(1)) .build(); CostBasedAutoScaler autoScaler = spy(new CostBasedAutoScaler( @@ -138,7 +138,7 @@ public void testScaleDownBlockedReturnsMinusOne() .taskCountMax(100) .taskCountMin(1) .enableTaskAutoScaler(true) - .scaleDownBarrier(Duration.standardHours(1)) + .minScaleDownDelay(Duration.standardHours(1)) .build(); CostBasedAutoScaler autoScaler = spy(new CostBasedAutoScaler( @@ -300,7 +300,7 @@ public void testScaleDownBlockedWhenScaleDownOnRolloverOnlyEnabled() .taskCountMin(1) .enableTaskAutoScaler(true) .scaleDownDuringTaskRolloverOnly(true) - .scaleDownBarrier(Duration.ZERO) + .minScaleDownDelay(Duration.ZERO) .build(); CostBasedAutoScaler autoScaler = spy(new CostBasedAutoScaler( @@ -331,7 +331,7 @@ public void testScaleDownAllowedDuringRolloverWhenScaleDownOnRolloverOnlyEnabled .taskCountMin(1) .enableTaskAutoScaler(true) .scaleDownDuringTaskRolloverOnly(true) - .scaleDownBarrier(Duration.ZERO) + .minScaleDownDelay(Duration.ZERO) .build(); CostBasedAutoScaler autoScaler = spy(new CostBasedAutoScaler( diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java index 7b445da50d59..16c2abf57cdb 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java @@ -390,7 +390,7 @@ public void 
testComputeTaskCountForRolloverAndConfigProperties() when(supervisor.getIoConfig()).thenReturn(ioConfig); when(ioConfig.getStream()).thenReturn("stream"); - // Test config defaults for scaleDownBarrier, defaultProcessingRate, scaleDownDuringTaskRolloverOnly + // Test config defaults for minScaleDownDelay, defaultProcessingRate, scaleDownDuringTaskRolloverOnly CostBasedAutoScalerConfig cfgWithDefaults = CostBasedAutoScalerConfig.builder() .taskCountMax(10) .taskCountMin(1) @@ -408,7 +408,7 @@ public void testComputeTaskCountForRolloverAndConfigProperties() .taskCountMax(10) .taskCountMin(1) .enableTaskAutoScaler(true) - .scaleDownBarrier(Duration.standardMinutes(10)) + .minScaleDownDelay(Duration.standardMinutes(10)) .defaultProcessingRate(5000.0) .scaleDownDuringTaskRolloverOnly(true) .build(); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java index 4c070f4b0165..e2835219e272 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java @@ -348,7 +348,7 @@ public void testCustomLagThresholdsAffectCostCalculation() .enableTaskAutoScaler(true) .defaultProcessingRate(1000.0) .highLagThreshold(10000) - .aggressiveScalingLagPerPartitionThreshold(20000) + .aggressiveScalingLagPerPartitionThreshold() .build(); double defaultCost = costFunction.computeCost(metrics, proposedTaskCount, defaultConfig).totalCost(); @@ -382,7 +382,7 @@ public void testRampDenominatorCalculation() .enableTaskAutoScaler(true) .defaultProcessingRate(1000.0) .highLagThreshold(10000) - .aggressiveScalingLagPerPartitionThreshold(20000) + .aggressiveScalingLagPerPartitionThreshold() 
.build(); // Lag exactly at extraThreshold (lagPerPartition = 10000) From 16d4d78b9998d9d1c24f9b0ae710711dbc095e50 Mon Sep 17 00:00:00 2001 From: Sasha Syrotenko Date: Wed, 4 Feb 2026 15:21:50 +0200 Subject: [PATCH 06/14] Refactor CostBasedAutoScaler: add plugin system to pure cost function --- docs/ingestion/supervisor.md | 3 + .../autoscaler/CostBasedAutoScaler.java | 121 +++++---------- .../autoscaler/CostBasedAutoScalerConfig.java | 63 +++++++- .../autoscaler/WeightedCostFunction.java | 17 ++- .../plugins/BurstScaleUpOnHighLagPlugin.java | 91 +++++++++++ .../OptimalTaskCountBoundariesPlugin.java | 33 ++++ .../CostBasedAutoScalerConfigTest.java | 38 ++--- .../CostBasedAutoScalerMockTest.java | 20 +-- .../autoscaler/CostBasedAutoScalerTest.java | 142 +++++++++--------- .../autoscaler/WeightedCostFunctionTest.java | 84 ++++++----- 10 files changed, 382 insertions(+), 230 deletions(-) create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPlugin.java create mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/OptimalTaskCountBoundariesPlugin.java diff --git a/docs/ingestion/supervisor.md b/docs/ingestion/supervisor.md index 57a8d1a96b59..016b0887ed1e 100644 --- a/docs/ingestion/supervisor.md +++ b/docs/ingestion/supervisor.md @@ -208,6 +208,9 @@ The following table outlines the configuration properties related to the `costBa |`lagWeight`|The weight of extracted lag value in cost function.| No| 0.25| |`idleWeight`|The weight of extracted poll idle value in cost function. | No | 0.75 | |`defaultProcessingRate`|A planned processing rate per task, required for first cost estimations. 
| No | 1000 | +|`useTaskCountBoundaries`|Enables the bounded partitions-per-task window when selecting task counts.|No|`false`| +|`useBurstScaleOnHeavyLag`|Enables burst scale-up when per-partition lag is high.|No|`false`| +|`highLagThreshold`|Per-partition lag threshold that triggers burst scale-up when `useBurstScaleOnHeavyLag` is enabled.|No|50000| |`minScaleDownDelay`| Minimum duration between successful scale actions, specified as an ISO-8601 duration string. | No | `PT50M` | |`scaleDownDuringTaskRolloverOnly`| Indicates whether task scaling down is limited to periods during task rollovers only. | No | False | diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java index 844dce86c056..5fabaa6dcae8 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java @@ -27,6 +27,8 @@ import org.apache.druid.indexing.overlord.supervisor.autoscaler.SupervisorTaskAutoScaler; import org.apache.druid.indexing.seekablestream.SeekableStreamIndexTaskRunner; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor; +import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins.BurstScaleUpOnHighLagPlugin; +import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins.OptimalTaskCountBoundariesPlugin; import org.apache.druid.java.util.common.DateTimes; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.concurrent.Execs; @@ -36,6 +38,7 @@ import org.apache.druid.query.DruidMetrics; import org.apache.druid.segment.incremental.RowIngestionMeters; +import javax.annotation.Nullable; import 
java.util.Map; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; @@ -56,22 +59,6 @@ public class CostBasedAutoScaler implements SupervisorTaskAutoScaler { private static final EmittingLogger log = new EmittingLogger(CostBasedAutoScaler.class); - private static final int MAX_INCREASE_IN_PARTITIONS_PER_TASK = 2; - private static final int MAX_DECREASE_IN_PARTITIONS_PER_TASK = MAX_INCREASE_IN_PARTITIONS_PER_TASK * 2; - - /** - * Controls how fast the additional tasks grow with the square root of current tasks. - * This allows bigger jumps when under-provisioned, but growth slows down as the task count increases. - */ - private static final int SQRT_TASK_COUNT_SCALE_FACTOR = 5; - /** - * Caps the maximum number of additional tasks in a single scale-up to preserve stability. - */ - private static final int MAX_JUMP = 12; - - // Base PPT lag threshold allowing to activate a burst scaleup to eliminate high lag. - static final int EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD = 50_000; - public static final String LAG_COST_METRIC = "task/autoScaler/costBased/lagCost"; public static final String IDLE_COST_METRIC = "task/autoScaler/costBased/idleCost"; public static final String OPTIMAL_TASK_COUNT_METRIC = "task/autoScaler/costBased/optimalTaskCount"; @@ -84,6 +71,8 @@ public class CostBasedAutoScaler implements SupervisorTaskAutoScaler private final ServiceMetricEvent.Builder metricBuilder; private final ScheduledExecutorService autoscalerExecutor; private final WeightedCostFunction costFunction; + private OptimalTaskCountBoundariesPlugin boundariesPlugin = null; + private BurstScaleUpOnHighLagPlugin burstScaleUpPlugin = null; private volatile CostMetrics lastKnownMetrics; private volatile long lastScaleActionTimeMillis = -1; @@ -102,7 +91,6 @@ public CostBasedAutoScaler( this.emitter = emitter; this.costFunction = new WeightedCostFunction(); - this.autoscalerExecutor = Execs.scheduledSingleThreaded("CostBasedAutoScaler-" + 
StringUtils.encodeForFormat(spec.getId())); this.metricBuilder = ServiceMetricEvent.builder() @@ -111,6 +99,14 @@ public CostBasedAutoScaler( DruidMetrics.STREAM, this.supervisor.getIoConfig().getStream() ); + if (config.shouldUseTaskCountBoundaries()) { + //noinspection InstantiationOfUtilityClass + this.boundariesPlugin = new OptimalTaskCountBoundariesPlugin(); + } + + if (config.shouldUseBurstScaleOnHeavyLag()) { + this.burstScaleUpPlugin = new BurstScaleUpOnHighLagPlugin(config.getHighLagThreshold()); + } } @Override @@ -213,7 +209,8 @@ int computeOptimalTaskCount(CostMetrics metrics) (long) metrics.getAggregateLag(), config.getTaskCountMin(), config.getTaskCountMax(), - config.getHighLagThreshold() + boundariesPlugin, + burstScaleUpPlugin ); if (validTaskCounts.length == 0) { @@ -225,7 +222,7 @@ int computeOptimalTaskCount(CostMetrics metrics) CostResult optimalCost = new CostResult(); for (int taskCount : validTaskCounts) { - CostResult costResult = costFunction.computeCost(metrics, taskCount, config); + CostResult costResult = costFunction.computeCost(metrics, taskCount, config, burstScaleUpPlugin); double cost = costResult.totalCost(); log.info( "Proposed task count: %d, Cost: %.4f (lag: %.4f, idle: %.4f)", @@ -264,19 +261,19 @@ int computeOptimalTaskCount(CostMetrics metrics) } /** - * Generates valid task counts based on partitions-per-task ratios and lag-driven PPT relaxation. - * This enables gradual scaling and avoids large jumps. - * Limits the range of task counts considered to avoid excessive computation. + * Generates valid task counts based on partitions-per-task ratios. 
* * @return sorted list of valid task counts within bounds */ + @SuppressWarnings({"VariableNotUsedInsideIf", "ReassignedVariable"}) static int[] computeValidTaskCounts( int partitionCount, int currentTaskCount, double aggregateLag, int taskCountMin, int taskCountMax, - int highLagThreshold + @Nullable OptimalTaskCountBoundariesPlugin taskCountBoundariesPlugin, + @Nullable BurstScaleUpOnHighLagPlugin highLagPlugin ) { if (partitionCount <= 0 || currentTaskCount <= 0) { @@ -285,23 +282,26 @@ static int[] computeValidTaskCounts( IntSet result = new IntArraySet(); final int currentPartitionsPerTask = partitionCount / currentTaskCount; - final int extraIncrease = computeScaleUpBoost( - aggregateLag, - partitionCount, - currentTaskCount, - taskCountMax, - highLagThreshold - ); - final int effectiveMaxIncrease = MAX_INCREASE_IN_PARTITIONS_PER_TASK + extraIncrease; // Minimum partitions per task correspond to the maximum number of tasks (scale up) and vice versa. - final int minPartitionsPerTask = Math.max(1, currentPartitionsPerTask - effectiveMaxIncrease); - final int maxPartitionsPerTask = Math.min( - partitionCount, - currentPartitionsPerTask + MAX_DECREASE_IN_PARTITIONS_PER_TASK - ); + int minPartitionsPerTask = partitionCount / taskCountMax; + int maxPartitionsPerTask = partitionCount / taskCountMin; + + if (taskCountBoundariesPlugin != null) { + maxPartitionsPerTask = Math.min( + partitionCount, + currentPartitionsPerTask + OptimalTaskCountBoundariesPlugin.MAX_DECREASE_IN_PARTITIONS_PER_TASK + ); + + int extraIncrease = 0; + if (highLagPlugin != null && highLagPlugin.lagThreshold() > 0) { + extraIncrease = highLagPlugin.computeScaleUpBoost(aggregateLag, partitionCount, currentTaskCount, taskCountMax); + } + int effectiveMaxIncrease = OptimalTaskCountBoundariesPlugin.MAX_INCREASE_IN_PARTITIONS_PER_TASK + extraIncrease; + minPartitionsPerTask = Math.max(minPartitionsPerTask, currentPartitionsPerTask - effectiveMaxIncrease); + } - for (int partitionsPerTask = 
maxPartitionsPerTask; partitionsPerTask >= minPartitionsPerTask; partitionsPerTask--) { + for (int partitionsPerTask = maxPartitionsPerTask; partitionsPerTask >= minPartitionsPerTask && partitionsPerTask != 0; partitionsPerTask--) { final int taskCount = (partitionCount + partitionsPerTask - 1) / partitionsPerTask; if (taskCount >= taskCountMin && taskCount <= taskCountMax) { result.add(taskCount); @@ -310,52 +310,6 @@ static int[] computeValidTaskCounts( return result.toIntArray(); } - /** - * Computes extra allowed increase in partitions-per-task in scenarios when the average per-partition lag - * is above the configured threshold. - *

- * This uses a capped sqrt-based formula: - * {@code additionalTasks = min(MAX_JUMP, BASE + sqrt(currentTasks) * SQRT_COEFF) * lagFactor * headroom} - *

- * This ensures: - * 1. Narrow window preserved {@code MAX_JUMP} caps the reach. - * 2. Bigger jumps are allowed when under-provisioned. - * 3. Sqrt growth (additional tasks grow slower than task count). - * 4. Self-damping (headroomRatio reduces jumps near max capacity). - */ - static int computeScaleUpBoost( - double aggregateLag, - int partitionCount, - int currentTaskCount, - int taskCountMax, - int highLagThreshold - ) - { - if (partitionCount <= 0 || taskCountMax <= 0) { - return 0; - } - - final double lagPerPartition = aggregateLag / partitionCount; - if (lagPerPartition < highLagThreshold) { - return 0; - } - - final double lagSeverity = lagPerPartition / highLagThreshold; - final double lagFactor = lagSeverity / (lagSeverity + 1.0); - final double headroomRatio = Math.max(0.0, 1.0 - (double) currentTaskCount / taskCountMax); - - // Compute target additional tasks (sqrt-based growth with cap) - final double rawAdditional = 1 + Math.sqrt(currentTaskCount) * SQRT_TASK_COUNT_SCALE_FACTOR; - final double cappedAdditional = Math.min(rawAdditional, MAX_JUMP); - final int additionalTasks = (int) (cappedAdditional * lagFactor * headroomRatio); - - final int targetMax = Math.min(taskCountMax, currentTaskCount + additionalTasks); - final int targetMinPPT = Math.max(1, (partitionCount + targetMax - 1) / targetMax); - final int currentPPT = partitionCount / currentTaskCount; - - return Math.max(0, currentPPT - targetMinPPT - MAX_INCREASE_IN_PARTITIONS_PER_TASK); - } - /** * Extracts the average poll-idle-ratio metric from task stats. * This metric indicates how much time the consumer spends idle waiting for data. 
@@ -496,5 +450,4 @@ private boolean isScaleActionAllowed() final long elapsedMillis = DateTimes.nowUtc().getMillis() - lastScaleActionTimeMillis; return elapsedMillis >= barrierMillis; } - } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java index 852774a2ab8e..606fbea570c3 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java @@ -28,6 +28,7 @@ import org.apache.druid.indexing.overlord.supervisor.SupervisorSpec; import org.apache.druid.indexing.overlord.supervisor.autoscaler.SupervisorTaskAutoScaler; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor; +import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins.BurstScaleUpOnHighLagPlugin; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.joda.time.Duration; @@ -62,6 +63,16 @@ public class CostBasedAutoScalerConfig implements AutoScalerConfig private final double lagWeight; private final double idleWeight; private final double defaultProcessingRate; + /** + * Enables or disables {@code OptimalTaskCountBoundariesPlugin} which allows + * considering only task counts within a certain PPT-based window around the current PPT. + */ + private final boolean useTaskCountBoundaries; + /** + * Enables or disables {@code BurstScaleUpOnHighLagPlugin} which allows + * applying burst scale-up when high lag is detected. + */ + private final boolean useBurstScaleOnHeavyLag; /** * Per-partition lag threshold allowing to activate a burst scaleup to eliminate high lag. 
*/ @@ -90,6 +101,8 @@ public CostBasedAutoScalerConfig( @Nullable @JsonProperty("lagWeight") Double lagWeight, @Nullable @JsonProperty("idleWeight") Double idleWeight, @Nullable @JsonProperty("defaultProcessingRate") Double defaultProcessingRate, + @Nullable @JsonProperty("useTaskCountBoundaries") Boolean useTaskCountBoundaries, + @Nullable @JsonProperty("useBurstScaleOnHeavyLag") Boolean useBurstScaleOnHeavyLag, @Nullable @JsonProperty("highLagThreshold") Integer highLagThreshold, @Nullable @JsonProperty("minScaleDownDelay") Duration minScaleDownDelay, @Nullable @JsonProperty("scaleDownDuringTaskRolloverOnly") Boolean scaleDownDuringTaskRolloverOnly @@ -110,9 +123,11 @@ public CostBasedAutoScalerConfig( this.lagWeight = Configs.valueOrDefault(lagWeight, DEFAULT_LAG_WEIGHT); this.idleWeight = Configs.valueOrDefault(idleWeight, DEFAULT_IDLE_WEIGHT); this.defaultProcessingRate = Configs.valueOrDefault(defaultProcessingRate, DEFAULT_PROCESSING_RATE); + this.useTaskCountBoundaries = Configs.valueOrDefault(useTaskCountBoundaries, false); + this.useBurstScaleOnHeavyLag = Configs.valueOrDefault(useBurstScaleOnHeavyLag, false); this.highLagThreshold = Configs.valueOrDefault( highLagThreshold, - CostBasedAutoScaler.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD + this.useBurstScaleOnHeavyLag ? 
BurstScaleUpOnHighLagPlugin.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD : -1 ); this.minScaleDownDelay = Configs.valueOrDefault(minScaleDownDelay, DEFAULT_MIN_SCALE_DELAY); this.scaleDownDuringTaskRolloverOnly = Configs.valueOrDefault(scaleDownDuringTaskRolloverOnly, false); @@ -221,16 +236,16 @@ public double getDefaultProcessingRate() return defaultProcessingRate; } - @JsonProperty - public Duration getMinScaleDownDelay() + @JsonProperty("useTaskCountBoundaries") + public boolean shouldUseTaskCountBoundaries() { - return minScaleDownDelay; + return useTaskCountBoundaries; } - @JsonProperty("scaleDownDuringTaskRolloverOnly") - public boolean isScaleDownOnTaskRolloverOnly() + @JsonProperty("useBurstScaleOnHeavyLag") + public boolean shouldUseBurstScaleOnHeavyLag() { - return scaleDownDuringTaskRolloverOnly; + return useBurstScaleOnHeavyLag; } @JsonProperty("highLagThreshold") @@ -239,6 +254,18 @@ public int getHighLagThreshold() return highLagThreshold; } + @JsonProperty + public Duration getMinScaleDownDelay() + { + return minScaleDownDelay; + } + + @JsonProperty("scaleDownDuringTaskRolloverOnly") + public boolean isScaleDownOnTaskRolloverOnly() + { + return scaleDownDuringTaskRolloverOnly; + } + @Override public SupervisorTaskAutoScaler createAutoScaler(Supervisor supervisor, SupervisorSpec spec, ServiceEmitter emitter) { @@ -265,6 +292,8 @@ public boolean equals(Object o) && Double.compare(that.lagWeight, lagWeight) == 0 && Double.compare(that.idleWeight, idleWeight) == 0 && Double.compare(that.defaultProcessingRate, defaultProcessingRate) == 0 + && useTaskCountBoundaries == that.useTaskCountBoundaries + && useBurstScaleOnHeavyLag == that.useBurstScaleOnHeavyLag && Objects.equals(minScaleDownDelay, that.minScaleDownDelay) && scaleDownDuringTaskRolloverOnly == that.scaleDownDuringTaskRolloverOnly && Objects.equals(taskCountStart, that.taskCountStart) @@ -285,6 +314,8 @@ public int hashCode() lagWeight, idleWeight, defaultProcessingRate, + useTaskCountBoundaries, + 
useBurstScaleOnHeavyLag, minScaleDownDelay, scaleDownDuringTaskRolloverOnly ); @@ -304,6 +335,8 @@ public String toString() ", lagWeight=" + lagWeight + ", idleWeight=" + idleWeight + ", defaultProcessingRate=" + defaultProcessingRate + + ", useTaskCountBoundaries=" + useTaskCountBoundaries + + ", useBurstScaleOnHeavyLag=" + useBurstScaleOnHeavyLag + ", highLagThreshold=" + highLagThreshold + ", minScaleDownDelay=" + minScaleDownDelay + ", scaleDownDuringTaskRolloverOnly=" + scaleDownDuringTaskRolloverOnly + @@ -326,6 +359,8 @@ public static class Builder private Double lagWeight; private Double idleWeight; private Double defaultProcessingRate; + private Boolean useTaskCountBoundaries; + private Boolean useBurstScaleOnHeavyLag; private Integer highLagThreshold; private Duration minScaleDownDelay; private Boolean scaleDownDuringTaskRolloverOnly; @@ -406,6 +441,18 @@ public Builder scaleDownDuringTaskRolloverOnly(boolean scaleDownDuringTaskRollov return this; } + public Builder useTaskCountBoundaries(boolean useTaskCountBoundaries) + { + this.useTaskCountBoundaries = useTaskCountBoundaries; + return this; + } + + public Builder useBurstScaleOnHeavyLag(boolean useBurstScaleOnHeavyLag) + { + this.useBurstScaleOnHeavyLag = useBurstScaleOnHeavyLag; + return this; + } + public Builder highLagThreshold(int highLagThreshold) { this.highLagThreshold = highLagThreshold; @@ -425,6 +472,8 @@ public CostBasedAutoScalerConfig build() lagWeight, idleWeight, defaultProcessingRate, + useTaskCountBoundaries, + useBurstScaleOnHeavyLag, highLagThreshold, minScaleDownDelay, scaleDownDuringTaskRolloverOnly diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunction.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunction.java index 9922fd97088b..ef7dcb8a8828 100644 --- 
a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunction.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunction.java @@ -19,6 +19,7 @@ package org.apache.druid.indexing.seekablestream.supervisor.autoscaler; +import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins.BurstScaleUpOnHighLagPlugin; import org.apache.druid.java.util.common.logger.Logger; /** @@ -58,7 +59,12 @@ public class WeightedCostFunction * @return CostResult containing totalCost, lagCost, and idleCost, * or result with {@link Double#POSITIVE_INFINITY} for invalid inputs */ - public CostResult computeCost(CostMetrics metrics, int proposedTaskCount, CostBasedAutoScalerConfig config) + public CostResult computeCost( + CostMetrics metrics, + int proposedTaskCount, + CostBasedAutoScalerConfig config, + BurstScaleUpOnHighLagPlugin highLagPlugin + ) { if (metrics == null || config == null || proposedTaskCount <= 0 || metrics.getPartitionCount() <= 0) { return new CostResult(Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY); @@ -80,7 +86,7 @@ public CostResult computeCost(CostMetrics metrics, int proposedTaskCount, CostBa lagRecoveryTime = metrics.getAggregateLag() / (proposedTaskCount * avgProcessingRate); } - final double predictedIdleRatio = estimateIdleRatio(metrics, proposedTaskCount, config); + final double predictedIdleRatio = estimateIdleRatio(metrics, proposedTaskCount, highLagPlugin); final double idleCost = proposedTaskCount * metrics.getTaskDurationSeconds() * predictedIdleRatio; final double lagCost = config.getLagWeight() * lagRecoveryTime; final double weightedIdleCost = config.getIdleWeight() * idleCost; @@ -111,11 +117,10 @@ public CostResult computeCost(CostMetrics metrics, int proposedTaskCount, CostBa * * @param metrics current system metrics containing idle ratio and task count * @param taskCount target task 
count to estimate an idle ratio for - * @param config auto-scaler configuration containing threshold values * @return estimated idle ratio in range [0.0, 1.0] */ @SuppressWarnings("ExtractMethodRecommender") - private double estimateIdleRatio(CostMetrics metrics, int taskCount, CostBasedAutoScalerConfig config) + private double estimateIdleRatio(CostMetrics metrics, int taskCount, BurstScaleUpOnHighLagPlugin highLagPlugin) { final double currentPollIdleRatio = metrics.getPollIdleRatio(); @@ -138,8 +143,8 @@ private double estimateIdleRatio(CostMetrics metrics, int taskCount, CostBasedAu double lagBusyFactor = 0.; // Lag-amplified idle decay - final int extraThreshold = config.getHighLagThreshold(); - if (lagPerPartition >= extraThreshold) { + if (highLagPlugin != null && lagPerPartition >= highLagPlugin.lagThreshold()) { + int extraThreshold = highLagPlugin.lagThreshold(); final double lagPerTask = metrics.getAggregateLag() / taskCount; lagBusyFactor = 1.0 - Math.exp(-lagPerTask / extraThreshold); diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPlugin.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPlugin.java new file mode 100644 index 000000000000..b77f98f8f3cf --- /dev/null +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPlugin.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins; + +public final class BurstScaleUpOnHighLagPlugin +{ + + // Base PPT lag threshold allowing to activate a burst scaleup to eliminate high lag. + public static final int EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD = 50_000; + + /** + * Controls how fast the additional tasks grow with the square root of current tasks. + * This allows bigger jumps when under-provisioned, but growth slows down as the task count increases. + */ + private static final int SQRT_TASK_COUNT_SCALE_FACTOR = 3; + + private final int lagThreshold; + + public BurstScaleUpOnHighLagPlugin(int lagThreshold) + { + this.lagThreshold = lagThreshold; + } + + public int lagThreshold() + { + return lagThreshold; + } + + /** + * Computes extra allowed increase in partitions-per-task in scenarios when the average per-partition lag + * is above the configured threshold. + *

+ * This uses a capped sqrt-based formula: + * {@code additionalTasks = min(MAX_JUMP, BASE + sqrt(currentTasks) * SQRT_COEFF) * lagFactor * headroom} + *

+ * This ensures: + * 1. Bigger jumps are allowed when under-provisioned. + * 2. Sqrt growth (additional tasks grow slower than task count). + * 3. Self-damping (headroomRatio reduces jumps near max capacity). + */ + public int computeScaleUpBoost( + double aggregateLag, + int partitionCount, + int currentTaskCount, + int taskCountMax + ) + { + if (partitionCount <= 0 || taskCountMax <= 0 || currentTaskCount <= 0) { + return 0; + } + + final double lagPerPartition = aggregateLag / partitionCount; + if (lagPerPartition < lagThreshold) { + return 0; + } + + final double lagSeverity = lagPerPartition / lagThreshold; + final double lagFactor = lagSeverity / (lagSeverity + 1.0); + // Use quadratic headroom damping to maintain higher pressure near capacity + final double headroomRatio = Math.max(0.0, 1.0 - Math.pow((double) currentTaskCount / taskCountMax, 2)); + + // Compute target additional tasks (sqrt-based growth) + final double rawAdditional = 1.0 + Math.sqrt(currentTaskCount) * SQRT_TASK_COUNT_SCALE_FACTOR; + final double deltaTasks = rawAdditional * lagFactor * headroomRatio; + + final double targetTaskCount = Math.min((double) taskCountMax, (double) currentTaskCount + deltaTasks); + + // Compute precise PPT reduction to avoid early integer truncation artifacts + final double currentPPT = (double) partitionCount / currentTaskCount; + final double targetPPT = (double) partitionCount / targetTaskCount; + + return Math.max(0, (int) Math.floor(currentPPT - targetPPT)); + } +} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/OptimalTaskCountBoundariesPlugin.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/OptimalTaskCountBoundariesPlugin.java new file mode 100644 index 000000000000..7c0ea398aee4 --- /dev/null +++ 
b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/OptimalTaskCountBoundariesPlugin.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins; + +/** + * Holds the partitions-per-task increase/decrease bounds applied when computing valid candidate task counts. + */ +public class OptimalTaskCountBoundariesPlugin +{ + public static final int MAX_INCREASE_IN_PARTITIONS_PER_TASK = 2; + public static final int MAX_DECREASE_IN_PARTITIONS_PER_TASK = MAX_INCREASE_IN_PARTITIONS_PER_TASK * 2; + + public OptimalTaskCountBoundariesPlugin() + { + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfigTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfigTest.java index e8cb6bebecb5..89b9742b7be0 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfigTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfigTest.java @@ -25,7 +25,6 @@ import org.junit.Assert; import 
org.junit.Test; -import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScaler.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD; import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScalerConfig.DEFAULT_IDLE_WEIGHT; import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScalerConfig.DEFAULT_LAG_WEIGHT; import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScalerConfig.DEFAULT_MIN_SCALE_DELAY; @@ -98,7 +97,10 @@ public void testSerdeWithDefaults() throws Exception // Check defaults Assert.assertEquals(DEFAULT_SCALE_ACTION_PERIOD_MILLIS, config.getScaleActionPeriodMillis()); - Assert.assertEquals(DEFAULT_MIN_TRIGGER_SCALE_ACTION_FREQUENCY_MILLIS, config.getMinTriggerScaleActionFrequencyMillis()); + Assert.assertEquals( + DEFAULT_MIN_TRIGGER_SCALE_ACTION_FREQUENCY_MILLIS, + config.getMinTriggerScaleActionFrequencyMillis() + ); Assert.assertEquals(DEFAULT_LAG_WEIGHT, config.getLagWeight(), 0.001); Assert.assertEquals(DEFAULT_IDLE_WEIGHT, config.getIdleWeight(), 0.001); Assert.assertEquals(DEFAULT_PROCESSING_RATE, config.getDefaultProcessingRate(), 0.001); @@ -106,7 +108,8 @@ public void testSerdeWithDefaults() throws Exception Assert.assertFalse(config.isScaleDownOnTaskRolloverOnly()); Assert.assertNull(config.getTaskCountStart()); Assert.assertNull(config.getStopTaskCountRatio()); - Assert.assertEquals(EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD, config.getHighLagThreshold()); + // When useBurstScaleOnHeavyLag is not set (default: false), highLagThreshold defaults to -1 + Assert.assertEquals(-1, config.getHighLagThreshold()); } @Test @@ -179,21 +182,20 @@ public void testValidation_InvalidStopTaskCountRatio() public void testBuilder() { CostBasedAutoScalerConfig config = CostBasedAutoScalerConfig.builder() - .taskCountMax(100) - .taskCountMin(5) - .taskCountStart(10) - .enableTaskAutoScaler(true) - 
.minTriggerScaleActionFrequencyMillis(600000L) - .stopTaskCountRatio(0.8) - .scaleActionPeriodMillis(60000L) - .lagWeight(0.6) - .idleWeight(0.4) - .defaultProcessingRate(2000.0) - .minScaleDownDelay(Duration.standardMinutes(10)) - .scaleDownDuringTaskRolloverOnly(true) - .highLagThreshold(30000) - .aggressiveScalingLagPerPartitionThreshold() - .build(); + .taskCountMax(100) + .taskCountMin(5) + .taskCountStart(10) + .enableTaskAutoScaler(true) + .minTriggerScaleActionFrequencyMillis(600000L) + .stopTaskCountRatio(0.8) + .scaleActionPeriodMillis(60000L) + .lagWeight(0.6) + .idleWeight(0.4) + .defaultProcessingRate(2000.0) + .minScaleDownDelay(Duration.standardMinutes(10)) + .scaleDownDuringTaskRolloverOnly(true) + .highLagThreshold(30000) + .build(); Assert.assertTrue(config.getEnableTaskAutoScaler()); Assert.assertEquals(100, config.getTaskCountMax()); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerMockTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerMockTest.java index c05b9ccc4cd6..d0e9f90f5844 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerMockTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerMockTest.java @@ -79,11 +79,11 @@ public void testScaleUpWhenOptimalGreaterThanCurrent() { // Use config with a long barrier to test cooldown behavior CostBasedAutoScalerConfig barrierConfig = CostBasedAutoScalerConfig.builder() - .taskCountMax(100) - .taskCountMin(1) - .enableTaskAutoScaler(true) - .minScaleDownDelay(Duration.standardHours(1)) - .build(); + .taskCountMax(100) + .taskCountMin(1) + .enableTaskAutoScaler(true) + .minScaleDownDelay(Duration.standardHours(1)) + .build(); CostBasedAutoScaler autoScaler = spy(new CostBasedAutoScaler( mockSupervisor, @@ 
-135,11 +135,11 @@ public void testScaleDownBlockedReturnsMinusOne() { // Use config with a long barrier to test cooldown behavior CostBasedAutoScalerConfig barrierConfig = CostBasedAutoScalerConfig.builder() - .taskCountMax(100) - .taskCountMin(1) - .enableTaskAutoScaler(true) - .minScaleDownDelay(Duration.standardHours(1)) - .build(); + .taskCountMax(100) + .taskCountMin(1) + .enableTaskAutoScaler(true) + .minScaleDownDelay(Duration.standardHours(1)) + .build(); CostBasedAutoScaler autoScaler = spy(new CostBasedAutoScaler( mockSupervisor, diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java index 16c2abf57cdb..d8f5591333c6 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java @@ -23,11 +23,14 @@ import org.apache.druid.indexing.seekablestream.SeekableStreamIndexTaskRunner; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisorIOConfig; +import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins.BurstScaleUpOnHighLagPlugin; +import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins.OptimalTaskCountBoundariesPlugin; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.segment.incremental.RowIngestionMeters; import org.joda.time.Duration; import org.junit.Assert; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; import org.mockito.Mockito; @@ -39,16 +42,13 @@ import static 
org.apache.druid.indexing.common.stats.DropwizardRowIngestionMeters.FIFTEEN_MINUTE_NAME; import static org.apache.druid.indexing.common.stats.DropwizardRowIngestionMeters.FIVE_MINUTE_NAME; import static org.apache.druid.indexing.common.stats.DropwizardRowIngestionMeters.ONE_MINUTE_NAME; -import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScaler.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD; -import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScaler.computeScaleUpBoost; import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScaler.computeValidTaskCounts; import static org.mockito.Mockito.when; -@SuppressWarnings("SameParameterValue") +@SuppressWarnings({"SameParameterValue", "InstantiationOfUtilityClass"}) public class CostBasedAutoScalerTest { private CostBasedAutoScaler autoScaler; - private CostBasedAutoScalerConfig config; @Before public void setUp() @@ -62,67 +62,67 @@ public void setUp() when(mockSupervisor.getIoConfig()).thenReturn(mockIoConfig); when(mockIoConfig.getStream()).thenReturn("test-stream"); - config = CostBasedAutoScalerConfig.builder() - .taskCountMax(100) - .taskCountMin(1) - .enableTaskAutoScaler(true) - .lagWeight(0.6) - .idleWeight(0.4) - .build(); + CostBasedAutoScalerConfig config = CostBasedAutoScalerConfig.builder() + .taskCountMax(100) + .taskCountMin(1) + .enableTaskAutoScaler(true) + .lagWeight(0.6) + .idleWeight(0.4) + .build(); autoScaler = new CostBasedAutoScaler(mockSupervisor, config, mockSupervisorSpec, mockEmitter); } + @SuppressWarnings("InstantiationOfUtilityClass") @Test public void testComputeValidTaskCounts() { + OptimalTaskCountBoundariesPlugin boundariesPlugin = new OptimalTaskCountBoundariesPlugin(); + BurstScaleUpOnHighLagPlugin highLagPlugin = new BurstScaleUpOnHighLagPlugin(BurstScaleUpOnHighLagPlugin.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); + // For 100 partitions at 25 tasks (4 partitions/task), valid 
counts include 25 and 34 - int[] validTaskCounts = computeValidTaskCounts(100, 25, 0L, 1, 100, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); + int[] validTaskCounts = computeValidTaskCounts(100, 25, 0L, 1, 100, boundariesPlugin, highLagPlugin); Assert.assertTrue("Should contain the current task count", contains(validTaskCounts, 25)); Assert.assertTrue("Should contain the next scale-up option", contains(validTaskCounts, 34)); - // Edge cases - Assert.assertEquals(0, computeValidTaskCounts(0, 10, 0L, 1, 100, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD).length); - Assert.assertEquals(0, computeValidTaskCounts(-5, 10, 0L, 1, 100, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD).length); - // Single partition - int[] singlePartition = computeValidTaskCounts(1, 1, 0L, 1, 100, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); + int[] singlePartition = computeValidTaskCounts(1, 1, 0L, 1, 100, boundariesPlugin, highLagPlugin); Assert.assertTrue("Single partition should have at least one valid count", singlePartition.length > 0); Assert.assertTrue("Single partition should contain 1", contains(singlePartition, 1)); // Current exceeds partitions - should still yield valid, deduplicated options - int[] exceedsPartitions = computeValidTaskCounts(2, 5, 0L, 1, 100, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); + int[] exceedsPartitions = computeValidTaskCounts(2, 5, 0L, 1, 100, boundariesPlugin, highLagPlugin); Assert.assertEquals(2, exceedsPartitions.length); Assert.assertTrue(contains(exceedsPartitions, 1)); Assert.assertTrue(contains(exceedsPartitions, 2)); // Lag expansion: low lag should not include max, high lag should - int[] lowLagCounts = computeValidTaskCounts(30, 3, 0L, 1, 30, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); + int[] lowLagCounts = computeValidTaskCounts(30, 3, 0L, 1, 30, boundariesPlugin, highLagPlugin); Assert.assertFalse("Low lag should not include max task count", contains(lowLagCounts, 30)); Assert.assertTrue("Low lag should cap scale up around 4 tasks", 
contains(lowLagCounts, 4)); long highAggregateLag = 30L * 500_000L; - int[] highLagCounts = computeValidTaskCounts(30, 3, highAggregateLag, 1, 30, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); + int[] highLagCounts = computeValidTaskCounts(30, 3, highAggregateLag, 1, 30, boundariesPlugin, highLagPlugin); // With capped scaling (1->10 tasks jump max), we won't reach 30 immediately from 3. // We expect it to reach 10. Assert.assertTrue("High lag should allow scaling to 10 tasks", contains(highLagCounts, 10)); Assert.assertFalse("Should not jump straight to 30 from 3", contains(highLagCounts, 30)); // Respects taskCountMax - int[] cappedCounts = computeValidTaskCounts(30, 4, highAggregateLag, 1, 3, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); + int[] cappedCounts = computeValidTaskCounts(30, 4, highAggregateLag, 1, 3, boundariesPlugin, highLagPlugin); Assert.assertTrue("Should include taskCountMax when doable", contains(cappedCounts, 3)); Assert.assertFalse("Should not exceed taskCountMax", contains(cappedCounts, 4)); // Respects taskCountMin - filters out values below the minimum // With partitionCount=100, currentTaskCount=10, the computed range includes values like 8, 9, 10, 12, 13 - int[] minCappedCounts = computeValidTaskCounts(100, 10, 0L, 10, 100, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); + int[] minCappedCounts = computeValidTaskCounts(100, 10, 0L, 10, 100, boundariesPlugin, highLagPlugin); Assert.assertFalse("Should not go below taskCountMin", contains(minCappedCounts, 8)); Assert.assertFalse("Should not go below taskCountMin", contains(minCappedCounts, 9)); Assert.assertTrue("Should include values at taskCountMin", contains(minCappedCounts, 10)); Assert.assertTrue("Should include values above taskCountMin", contains(minCappedCounts, 12)); // Both bounds applied together - int[] bothBounds = computeValidTaskCounts(100, 10, 0L, 10, 12, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); + int[] bothBounds = computeValidTaskCounts(100, 10, 0L, 10, 12, boundariesPlugin, 
highLagPlugin); Assert.assertFalse("Should not go below taskCountMin", contains(bothBounds, 8)); Assert.assertFalse("Should not go below taskCountMin", contains(bothBounds, 9)); Assert.assertFalse("Should not exceed taskCountMax", contains(bothBounds, 13)); @@ -131,6 +131,7 @@ public void testComputeValidTaskCounts() } @Test + @Ignore public void testScalingExamplesTable() { int partitionCount = 30; @@ -149,12 +150,12 @@ public void testScalingExamplesTable() when(mockIoConfig.getStream()).thenReturn("test-stream"); CostBasedAutoScalerConfig localConfig = CostBasedAutoScalerConfig.builder() - .taskCountMax(taskCountMax) - .taskCountMin(1) - .enableTaskAutoScaler(true) - .lagWeight(0.6) - .idleWeight(0.4) - .build(); + .taskCountMax(taskCountMax) + .taskCountMin(1) + .enableTaskAutoScaler(true) + .lagWeight(0.6) + .idleWeight(0.4) + .build(); CostBasedAutoScaler localAutoScaler = new CostBasedAutoScaler( mockSupervisor, @@ -189,9 +190,20 @@ class Example new Example(25, 500_000L, 30) }; + OptimalTaskCountBoundariesPlugin boundariesPlugin = new OptimalTaskCountBoundariesPlugin(); + BurstScaleUpOnHighLagPlugin highLagPlugin = new BurstScaleUpOnHighLagPlugin(BurstScaleUpOnHighLagPlugin.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); + for (Example example : examples) { long aggregateLag = example.lagPerPartition * partitionCount; - int[] validCounts = computeValidTaskCounts(partitionCount, example.currentTasks, aggregateLag, 1, taskCountMax, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); + int[] validCounts = computeValidTaskCounts( + partitionCount, + example.currentTasks, + aggregateLag, + 1, + taskCountMax, + boundariesPlugin, + highLagPlugin + ); Assert.assertTrue( "Should include expected task count for current=" + example.currentTasks + ", lag=" + example.lagPerPartition, contains(validCounts, example.expectedTasks) @@ -218,28 +230,6 @@ class Example } } - @Test - public void testComputeScaleUpBoost() - { - // No extra increase below the threshold - 
Assert.assertEquals(0, computeScaleUpBoost(30L * 49_000L, 30, 3, 30, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD)); - - // Test exact values based on the formula: - // boost = max(0, currentPPT - targetMinPPT - 2) - - // Case 1: 3 tasks, 50k lag -> target 7 tasks -> targetMinPPT 5 -> currentPPT 10 -> boost 10-5-2 = 3 - Assert.assertEquals(3, computeScaleUpBoost(30L * 50_000L, 30, 3, 30, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD)); - - // Case 2: 3 tasks, 300k lag -> target 10 tasks -> targetMinPPT 3 -> currentPPT 10 -> boost 10-3-2 = 5 - Assert.assertEquals(5, computeScaleUpBoost(30L * 300_000L, 30, 3, 30, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD)); - - // Case 3: 3 tasks, 500k lag -> target 10 tasks (capped) -> targetMinPPT 3 -> boost 5 - Assert.assertEquals(5, computeScaleUpBoost(30L * 500_000L, 30, 3, 30, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD)); - - // Case 4: Zero when on max task count - Assert.assertEquals(0, computeScaleUpBoost(30L * 500_000L, 30, 30, 30, EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD)); - } - @Test public void testComputeOptimalTaskCount() { @@ -333,22 +323,40 @@ public void testExtractMovingAverage() // Interval fallback: 15-minute preferred, then 5-minute, then 1-minute Map> fifteenMin = new HashMap<>(); - fifteenMin.put("0", Collections.singletonMap("task-0", buildTaskStatsWithMovingAverageForInterval(FIFTEEN_MINUTE_NAME, 1500.0))); + fifteenMin.put( + "0", + Collections.singletonMap( + "task-0", + buildTaskStatsWithMovingAverageForInterval(FIFTEEN_MINUTE_NAME, 1500.0) + ) + ); Assert.assertEquals(1500.0, CostBasedAutoScaler.extractMovingAverage(fifteenMin), 0.0001); // 1-minute as a final fallback Map> oneMin = new HashMap<>(); - oneMin.put("0", Collections.singletonMap("task-0", buildTaskStatsWithMovingAverageForInterval(ONE_MINUTE_NAME, 500.0))); + oneMin.put( + "0", + Collections.singletonMap("task-0", buildTaskStatsWithMovingAverageForInterval(ONE_MINUTE_NAME, 500.0)) + ); Assert.assertEquals(500.0, 
CostBasedAutoScaler.extractMovingAverage(oneMin), 0.0001); // 15-minute preferred over 5-minute when both available Map> allIntervals = new HashMap<>(); - allIntervals.put("0", Collections.singletonMap("task-0", buildTaskStatsWithMultipleMovingAverages(1500.0, 1000.0, 500.0))); + allIntervals.put( + "0", + Collections.singletonMap("task-0", buildTaskStatsWithMultipleMovingAverages(1500.0, 1000.0, 500.0)) + ); Assert.assertEquals(1500.0, CostBasedAutoScaler.extractMovingAverage(allIntervals), 0.0001); // Falls back to 5-minute when 15-minute is null Map> nullFifteen = new HashMap<>(); - nullFifteen.put("0", Collections.singletonMap("task-0", buildTaskStatsWithNullInterval(FIFTEEN_MINUTE_NAME, FIVE_MINUTE_NAME, 750.0))); + nullFifteen.put( + "0", + Collections.singletonMap( + "task-0", + buildTaskStatsWithNullInterval(FIFTEEN_MINUTE_NAME, FIVE_MINUTE_NAME, 750.0) + ) + ); Assert.assertEquals(750.0, CostBasedAutoScaler.extractMovingAverage(nullFifteen), 0.0001); // Falls back to 1-minute when both 15 and 5 are null @@ -392,10 +400,10 @@ public void testComputeTaskCountForRolloverAndConfigProperties() // Test config defaults for minScaleDownDelay, defaultProcessingRate, scaleDownDuringTaskRolloverOnly CostBasedAutoScalerConfig cfgWithDefaults = CostBasedAutoScalerConfig.builder() - .taskCountMax(10) - .taskCountMin(1) - .enableTaskAutoScaler(true) - .build(); + .taskCountMax(10) + .taskCountMin(1) + .enableTaskAutoScaler(true) + .build(); Assert.assertEquals( CostBasedAutoScalerConfig.DEFAULT_MIN_SCALE_DELAY, cfgWithDefaults.getMinScaleDownDelay() @@ -405,13 +413,13 @@ public void testComputeTaskCountForRolloverAndConfigProperties() // Test custom config values CostBasedAutoScalerConfig cfgWithCustom = CostBasedAutoScalerConfig.builder() - .taskCountMax(10) - .taskCountMin(1) - .enableTaskAutoScaler(true) - .minScaleDownDelay(Duration.standardMinutes(10)) - .defaultProcessingRate(5000.0) - .scaleDownDuringTaskRolloverOnly(true) - .build(); + .taskCountMax(10) + 
.taskCountMin(1) + .enableTaskAutoScaler(true) + .minScaleDownDelay(Duration.standardMinutes(10)) + .defaultProcessingRate(5000.0) + .scaleDownDuringTaskRolloverOnly(true) + .build(); Assert.assertEquals(Duration.standardMinutes(10), cfgWithCustom.getMinScaleDownDelay()); Assert.assertEquals(5000.0, cfgWithCustom.getDefaultProcessingRate(), 0.001); Assert.assertTrue(cfgWithCustom.isScaleDownOnTaskRolloverOnly()); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java index e2835219e272..7a50ff2cb0f7 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java @@ -19,6 +19,7 @@ package org.apache.druid.indexing.seekablestream.supervisor.autoscaler; +import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins.BurstScaleUpOnHighLagPlugin; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -47,13 +48,13 @@ public void testComputeCostInvalidInputs() { CostMetrics validMetrics = createMetrics(100000.0, 10, 100, 0.3); - Assert.assertEquals(Double.POSITIVE_INFINITY, costFunction.computeCost(null, 10, config).totalCost(), 0.0); - Assert.assertEquals(Double.POSITIVE_INFINITY, costFunction.computeCost(validMetrics, 10, null).totalCost(), 0.0); - Assert.assertEquals(Double.POSITIVE_INFINITY, costFunction.computeCost(validMetrics, 0, config).totalCost(), 0.0); - Assert.assertEquals(Double.POSITIVE_INFINITY, costFunction.computeCost(validMetrics, -5, config).totalCost(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, costFunction.computeCost(null, 10, config, null).totalCost(), 0.0); + 
Assert.assertEquals(Double.POSITIVE_INFINITY, costFunction.computeCost(validMetrics, 10, null, null).totalCost(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, costFunction.computeCost(validMetrics, 0, config, null).totalCost(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, costFunction.computeCost(validMetrics, -5, config, null).totalCost(), 0.0); Assert.assertEquals( Double.POSITIVE_INFINITY, - costFunction.computeCost(createMetrics(0.0, 10, 0, 0.3), 10, config).totalCost(), + costFunction.computeCost(createMetrics(0.0, 10, 0, 0.3), 10, config, null).totalCost(), 0.0 ); } @@ -72,8 +73,8 @@ public void testScaleDownHasHigherLagCostThanCurrent() CostMetrics metrics = createMetrics(200000.0, 10, 200, 0.3); - double costCurrent = costFunction.computeCost(metrics, 10, lagOnlyConfig).totalCost(); - double costScaleDown = costFunction.computeCost(metrics, 5, lagOnlyConfig).totalCost(); + double costCurrent = costFunction.computeCost(metrics, 10, lagOnlyConfig, null).totalCost(); + double costScaleDown = costFunction.computeCost(metrics, 5, lagOnlyConfig, null).totalCost(); // Scale down uses absolute model: lag / (5 * rate) = higher recovery time // Current uses absolute model: lag / (10 * rate) = lower recovery time @@ -101,15 +102,15 @@ public void testLagCostWithMarginalModel() CostMetrics metrics = createMetrics(100000.0, 10, 100, 0.3); // Current (10 tasks): uses absolute model = 10M / (10 * 1000) = 1000s - double costCurrent = costFunction.computeCost(metrics, 10, lagOnlyConfig).totalCost(); + double costCurrent = costFunction.computeCost(metrics, 10, lagOnlyConfig, null).totalCost(); Assert.assertEquals("Cost at current tasks", 1000., costCurrent, 0.1); // Scale up by 5 (to 15): marginal model = 10M / (15 * 1000) = 666 - double costUp5 = costFunction.computeCost(metrics, 15, lagOnlyConfig).totalCost(); + double costUp5 = costFunction.computeCost(metrics, 15, lagOnlyConfig, null).totalCost(); Assert.assertEquals("Cost when scaling up by 5", 666.7, 
costUp5, 0.1); // Scale up by 10 (to 20): marginal model = 10M / (20 * 1000) = 500s - double costUp10 = costFunction.computeCost(metrics, 20, lagOnlyConfig).totalCost(); + double costUp10 = costFunction.computeCost(metrics, 20, lagOnlyConfig, null).totalCost(); Assert.assertEquals("Cost when scaling up by 10", 500.0, costUp10, 0.01); // Adding more tasks reduces lag recovery time @@ -121,8 +122,8 @@ public void testBalancedWeightsFavorStabilityOverScaleUpOnSmallLag() { // Validate idle ratio estimation and ensure balanced weights still favor stability. CostMetrics metrics = createMetrics(100.0, 10, 100, 0.3); - double costCurrent = costFunction.computeCost(metrics, 10, config).totalCost(); - double costScaleUp = costFunction.computeCost(metrics, 20, config).totalCost(); + double costCurrent = costFunction.computeCost(metrics, 10, config, null).totalCost(); + double costScaleUp = costFunction.computeCost(metrics, 20, config, null).totalCost(); Assert.assertTrue( "With balanced weights, staying at current count is cheaper than scale-up", @@ -153,8 +154,8 @@ public void testWeightsAffectCost() CostMetrics metrics = createMetrics(100000.0, 10, 100, 0.1); - double costLag = costFunction.computeCost(metrics, 10, lagOnly).totalCost(); - double costIdle = costFunction.computeCost(metrics, 10, idleOnly).totalCost(); + double costLag = costFunction.computeCost(metrics, 10, lagOnly, null).totalCost(); + double costIdle = costFunction.computeCost(metrics, 10, idleOnly, null).totalCost(); Assert.assertNotEquals("Different weights should produce different costs", costLag, costIdle, 0.0001); Assert.assertTrue("Lag-only cost should be positive", costLag > 0.0); @@ -169,9 +170,9 @@ public void testNoProcessingRateFavorsCurrentTaskCount() int currentTaskCount = 10; CostMetrics metricsNoRate = createMetricsWithRate(50000.0, currentTaskCount, 100, 0.3, 0.0); - double costAtCurrent = costFunction.computeCost(metricsNoRate, currentTaskCount, config).totalCost(); - double costScaleUp = 
costFunction.computeCost(metricsNoRate, currentTaskCount + 5, config).totalCost(); - double costScaleDown = costFunction.computeCost(metricsNoRate, currentTaskCount - 5, config).totalCost(); + double costAtCurrent = costFunction.computeCost(metricsNoRate, currentTaskCount, config, null).totalCost(); + double costScaleUp = costFunction.computeCost(metricsNoRate, currentTaskCount + 5, config, null).totalCost(); + double costScaleDown = costFunction.computeCost(metricsNoRate, currentTaskCount - 5, config, null).totalCost(); Assert.assertTrue( "Cost at current should be less than cost for scale up", @@ -200,8 +201,8 @@ public void testNoProcessingRateDeviationPenaltyIsSymmetric() .defaultProcessingRate(1000.0) .build(); - double costUp5 = costFunction.computeCost(metricsNoRate, currentTaskCount + 5, lagOnlyConfig).totalCost(); - double costDown5 = costFunction.computeCost(metricsNoRate, currentTaskCount - 5, lagOnlyConfig).totalCost(); + double costUp5 = costFunction.computeCost(metricsNoRate, currentTaskCount + 5, lagOnlyConfig, null).totalCost(); + double costDown5 = costFunction.computeCost(metricsNoRate, currentTaskCount - 5, lagOnlyConfig, null).totalCost(); Assert.assertEquals( "Lag cost for +5 and -5 deviation should be equal", @@ -228,10 +229,10 @@ public void testIdleCostMonotonicWithTaskCount() // Current: 10 tasks with 40% idle (60% busy) CostMetrics metrics = createMetrics(0.0, 10, 100, 0.4); - double costAt5 = costFunction.computeCost(metrics, 5, idleOnlyConfig).totalCost(); - double costAt10 = costFunction.computeCost(metrics, 10, idleOnlyConfig).totalCost(); - double costAt15 = costFunction.computeCost(metrics, 15, idleOnlyConfig).totalCost(); - double costAt20 = costFunction.computeCost(metrics, 20, idleOnlyConfig).totalCost(); + double costAt5 = costFunction.computeCost(metrics, 5, idleOnlyConfig, null).totalCost(); + double costAt10 = costFunction.computeCost(metrics, 10, idleOnlyConfig, null).totalCost(); + double costAt15 = 
costFunction.computeCost(metrics, 15, idleOnlyConfig, null).totalCost(); + double costAt20 = costFunction.computeCost(metrics, 20, idleOnlyConfig, null).totalCost(); // Monotonically increasing idle cost as tasks increase Assert.assertTrue("cost(5) < cost(10)", costAt5 < costAt10); @@ -255,7 +256,7 @@ public void testIdleRatioClampingAtBoundaries() // busyFraction = 0.6, taskRatio = 0.2 // predictedIdle = 1 - 0.6/0.2 = 1 - 3 = -2 → clamped to 0 CostMetrics metrics = createMetrics(0.0, 10, 100, 0.4); - double costAt2 = costFunction.computeCost(metrics, 2, idleOnlyConfig).totalCost(); + double costAt2 = costFunction.computeCost(metrics, 2, idleOnlyConfig, null).totalCost(); // idlenessCost = taskCount * taskDuration * 0.0 (clamped) = 0 Assert.assertEquals("Idle cost should be 0 when predicted idle is clamped to 0", 0.0, costAt2, 0.0001); @@ -265,7 +266,7 @@ public void testIdleRatioClampingAtBoundaries() // busyFraction = 0.9, taskRatio = 10 // predictedIdle = 1 - 0.9/10 = 1 - 0.09 = 0.91 (within bounds) CostMetrics lowIdle = createMetrics(0.0, 10, 100, 0.1); - double costAt100 = costFunction.computeCost(lowIdle, 100, idleOnlyConfig).totalCost(); + double costAt100 = costFunction.computeCost(lowIdle, 100, idleOnlyConfig, null).totalCost(); // idlenessCost = 100 * 3600 * 0.91 = 327600 Assert.assertTrue("Cost should be finite and positive", Double.isFinite(costAt100) && costAt100 > 0); } @@ -285,8 +286,8 @@ public void testIdleRatioWithMissingData() // Negative idle ratio indicates missing data → should default to 0.5 CostMetrics missingIdleData = createMetrics(0.0, 10, 100, -1.0); - double cost10 = costFunction.computeCost(missingIdleData, 10, idleOnlyConfig).totalCost(); - double cost20 = costFunction.computeCost(missingIdleData, 20, idleOnlyConfig).totalCost(); + double cost10 = costFunction.computeCost(missingIdleData, 10, idleOnlyConfig, null).totalCost(); + double cost20 = costFunction.computeCost(missingIdleData, 20, idleOnlyConfig, null).totalCost(); // With 
missing data, predicted idle = 0.5 for all task counts // idlenessCost at 10 = 10 * 3600 * 0.5 = 18000 @@ -310,11 +311,14 @@ public void testLagAmplificationReducesIdleUnderHighLag() int partitionCount = 30; double pollIdleRatio = 0.1; + // Plugin with threshold between lowLag (5000) and highLag (500000) + BurstScaleUpOnHighLagPlugin highLagPlugin = new BurstScaleUpOnHighLagPlugin(10_000); + CostMetrics lowLag = createMetrics(5_000.0, currentTaskCount, partitionCount, pollIdleRatio); CostMetrics highLag = createMetrics(500_000.0, currentTaskCount, partitionCount, pollIdleRatio); - double lowLagCost = costFunction.computeCost(lowLag, proposedTaskCount, idleOnlyConfig).totalCost(); - double highLagCost = costFunction.computeCost(highLag, proposedTaskCount, idleOnlyConfig).totalCost(); + double lowLagCost = costFunction.computeCost(lowLag, proposedTaskCount, idleOnlyConfig, highLagPlugin).totalCost(); + double highLagCost = costFunction.computeCost(highLag, proposedTaskCount, idleOnlyConfig, highLagPlugin).totalCost(); Assert.assertTrue( "Higher lag should reduce predicted idle more aggressively", lowLagCost > highLagCost @@ -330,9 +334,7 @@ public void testCustomLagThresholdsAffectCostCalculation() int partitionCount = 30; double pollIdleRatio = 0.1; - // Use high lag that exceeds both default and custom thresholds - // Default thresholds: extra=25000, aggressive=50000 - // Custom thresholds: extra=10000, aggressive=20000 (more sensitive) + // Use lag that exceeds sensitive threshold (10000) but not default threshold (50000) CostMetrics metrics = createMetrics(15_000.0, currentTaskCount, partitionCount, pollIdleRatio); CostBasedAutoScalerConfig defaultConfig = CostBasedAutoScalerConfig.builder() @@ -348,11 +350,15 @@ public void testCustomLagThresholdsAffectCostCalculation() .enableTaskAutoScaler(true) .defaultProcessingRate(1000.0) .highLagThreshold(10000) - .aggressiveScalingLagPerPartitionThreshold() .build(); - double defaultCost = 
costFunction.computeCost(metrics, proposedTaskCount, defaultConfig).totalCost(); - double sensitiveCost = costFunction.computeCost(metrics, proposedTaskCount, sensitiveConfig).totalCost(); + // Default plugin: threshold 50000, lag 15000 < 50000, no amplification + BurstScaleUpOnHighLagPlugin defaultPlugin = new BurstScaleUpOnHighLagPlugin(50000); + // Sensitive plugin: threshold 10000, lag 15000 > 10000, amplification happens + BurstScaleUpOnHighLagPlugin sensitivePlugin = new BurstScaleUpOnHighLagPlugin(10000); + + double defaultCost = costFunction.computeCost(metrics, proposedTaskCount, defaultConfig, defaultPlugin).totalCost(); + double sensitiveCost = costFunction.computeCost(metrics, proposedTaskCount, sensitiveConfig, sensitivePlugin).totalCost(); // With lower thresholds, the same lag triggers more aggressive scaling behavior // (higher lagBusyFactor), which results in lower predicted idle and thus lower idle cost @@ -382,9 +388,11 @@ public void testRampDenominatorCalculation() .enableTaskAutoScaler(true) .defaultProcessingRate(1000.0) .highLagThreshold(10000) - .aggressiveScalingLagPerPartitionThreshold() .build(); + // Plugin with threshold 10000 + BurstScaleUpOnHighLagPlugin highLagPlugin = new BurstScaleUpOnHighLagPlugin(10000); + // Lag exactly at extraThreshold (lagPerPartition = 10000) // ramp = (10000 - 10000) / 90000 = 0 CostMetrics atExtraThreshold = createMetrics(10_000.0, currentTaskCount, partitionCount, pollIdleRatio); @@ -393,8 +401,8 @@ public void testRampDenominatorCalculation() // ramp = (100000 - 10000) / 90000 = 1.0 CostMetrics atMaxLag = createMetrics(100_000.0, currentTaskCount, partitionCount, pollIdleRatio); - double costAtExtra = costFunction.computeCost(atExtraThreshold, proposedTaskCount, customConfig).totalCost(); - double costAtMax = costFunction.computeCost(atMaxLag, proposedTaskCount, customConfig).totalCost(); + double costAtExtra = costFunction.computeCost(atExtraThreshold, proposedTaskCount, customConfig, 
highLagPlugin).totalCost(); + double costAtMax = costFunction.computeCost(atMaxLag, proposedTaskCount, customConfig, highLagPlugin).totalCost(); // At max lag, ramp=1.0 leads to maximum amplification, reducing idle cost more Assert.assertTrue( From fe1b605d5d8334edc136e6d4219117c893056329 Mon Sep 17 00:00:00 2001 From: Sasha Syrotenko Date: Wed, 4 Feb 2026 15:27:58 +0200 Subject: [PATCH 07/14] Update indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java Co-authored-by: Kashif Faraz --- .../supervisor/autoscaler/CostBasedAutoScaler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java index 5fabaa6dcae8..3464e6a0475d 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java @@ -225,7 +225,7 @@ int computeOptimalTaskCount(CostMetrics metrics) CostResult costResult = costFunction.computeCost(metrics, taskCount, config, burstScaleUpPlugin); double cost = costResult.totalCost(); log.info( - "Proposed task count: %d, Cost: %.4f (lag: %.4f, idle: %.4f)", + "Proposed task count[%d] has total Cost[%.4f] = lagCost[%.4f] + idleCost[%.4f]", taskCount, cost, costResult.lagCost(), From 504966fc1f49813db9b79201b8f2b32c6bc68f63 Mon Sep 17 00:00:00 2001 From: Sasha Syrotenko Date: Wed, 4 Feb 2026 15:31:04 +0200 Subject: [PATCH 08/14] Clarify minScaleDownDelay --- docs/ingestion/supervisor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ingestion/supervisor.md b/docs/ingestion/supervisor.md index 016b0887ed1e..3e4757253e6d 100644 --- 
a/docs/ingestion/supervisor.md +++ b/docs/ingestion/supervisor.md @@ -211,7 +211,7 @@ The following table outlines the configuration properties related to the `costBa |`useTaskCountBoundaries`|Enables the bounded partitions-per-task window when selecting task counts.|No|`false`| |`useBurstScaleOnHeavyLag`|Enables burst scale-up when per-partition lag is high.|No|`false`| |`highLagThreshold`|Per-partition lag threshold that triggers burst scale-up when `useBurstScaleOnHeavyLag` is enabled.|No|50000| -|`minScaleDownDelay`| Minimum duration between successful scale actions, specified as an ISO-8601 duration string. | No | `PT50M` | +|`minScaleDownDelay`| Minimum duration between successful scale actions, specified as an ISO-8601 duration string. | No | `PT30M` | |`scaleDownDuringTaskRolloverOnly`| Indicates whether task scaling down is limited to periods during task rollovers only. | No | False | The following example shows a supervisor spec with `lagBased` autoscaler: From 75b15d67b903d2b8541721c599c5e44a1a44ee85 Mon Sep 17 00:00:00 2001 From: Sasha Syrotenko Date: Wed, 4 Feb 2026 17:15:39 +0200 Subject: [PATCH 09/14] Move BurstScaleUpOnHighLagPlugin on logarithm base instead of sqrt --- .../plugins/BurstScaleUpOnHighLagPlugin.java | 31 ++--- .../autoscaler/CostBasedAutoScalerTest.java | 110 +---------------- .../BurstScaleUpOnHighLagPluginTest.java | 114 ++++++++++++++++++ 3 files changed, 135 insertions(+), 120 deletions(-) create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPluginTest.java diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPlugin.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPlugin.java index b77f98f8f3cf..15f7da8ce882 100644 --- 
a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPlugin.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPlugin.java @@ -19,6 +19,7 @@ package org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins; +@SuppressWarnings("ClassCanBeRecord") public final class BurstScaleUpOnHighLagPlugin { @@ -26,10 +27,11 @@ public final class BurstScaleUpOnHighLagPlugin public static final int EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD = 50_000; /** - * Controls how fast the additional tasks grow with the square root of current tasks. - * This allows bigger jumps when under-provisioned, but growth slows down as the task count increases. + * Divisor for partition count in the K formula: K = (partitionCount / K_PARTITION_DIVISOR) / sqrt(currentTaskCount). + * This controls how aggressive the scaling is relative to partition count. + * That value was chosen by carefully analyzing the math model behind the implementation. */ - private static final int SQRT_TASK_COUNT_SCALE_FACTOR = 3; + private static final double K_PARTITION_DIVISOR = 6.4; private final int lagThreshold; @@ -47,13 +49,14 @@ public int lagThreshold() * Computes extra allowed increase in partitions-per-task in scenarios when the average per-partition lag * is above the configured threshold. *

- * This uses a capped sqrt-based formula: - * {@code additionalTasks = min(MAX_JUMP, BASE + sqrt(currentTasks) * SQRT_COEFF) * lagFactor * headroom} + * This uses a logarithmic formula for consistent absolute growth: + * {@code deltaTasks = K * ln(lagSeverity)} + * where {@code K = (partitionCount / 6.4) / sqrt(currentTaskCount)} *

 * This ensures: - * 1. Bigger jumps are allowed when under-provisioned. - * 2. Sqrt growth (additional tasks grow slower than task count). - * 3. Self-damping (headroomRatio reduces jumps near max capacity). + * 1. Partition-aware scaling: larger datasets get more aggressive scaling. + * 2. Small taskCounts get a massive relative boost, while large taskCounts receive more measured, stable increases. + * 3. Logarithmic lag response: diminishing returns at extreme lag values. */ public int computeScaleUpBoost( double aggregateLag, @@ -72,15 +75,13 @@ public int computeScaleUpBoost( } final double lagSeverity = lagPerPartition / lagThreshold; - final double lagFactor = lagSeverity / (lagSeverity + 1.0); - // Use quadratic headroom damping to maintain higher pressure near capacity - final double headroomRatio = Math.max(0.0, 1.0 - Math.pow((double) currentTaskCount / taskCountMax, 2)); - // Compute target additional tasks (sqrt-based growth) - final double rawAdditional = 1.0 + Math.sqrt(currentTaskCount) * SQRT_TASK_COUNT_SCALE_FACTOR; - final double deltaTasks = rawAdditional * lagFactor * headroomRatio; - final double targetTaskCount = Math.min((double) taskCountMax, (double) currentTaskCount + deltaTasks); + // Logarithmic growth: ln(lagSeverity) is positive when lagSeverity > 1 + // First multiplier decreases with sqrt(currentTaskCount): aggressive when small, conservative when large + final double deltaTasks = (partitionCount / K_PARTITION_DIVISOR) / Math.sqrt(currentTaskCount) * Math.log(lagSeverity); + + final double targetTaskCount = Math.min(taskCountMax, (double) currentTaskCount + deltaTasks); // Compute precise PPT reduction to avoid early integer truncation artifacts final double currentPPT = (double) partitionCount / currentTaskCount; diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java
b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java index d8f5591333c6..522be184ba7e 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java @@ -30,11 +30,9 @@ import org.joda.time.Duration; import org.junit.Assert; import org.junit.Before; -import org.junit.Ignore; import org.junit.Test; import org.mockito.Mockito; -import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.Map; @@ -96,16 +94,18 @@ public void testComputeValidTaskCounts() Assert.assertTrue(contains(exceedsPartitions, 1)); Assert.assertTrue(contains(exceedsPartitions, 2)); - // Lag expansion: low lag should not include max, high lag should + // Lag expansion: low lag should not include max, high lag should allow aggressive scaling int[] lowLagCounts = computeValidTaskCounts(30, 3, 0L, 1, 30, boundariesPlugin, highLagPlugin); Assert.assertFalse("Low lag should not include max task count", contains(lowLagCounts, 30)); Assert.assertTrue("Low lag should cap scale up around 4 tasks", contains(lowLagCounts, 4)); + // High lag uses logarithmic formula: K * ln(lagSeverity) where K = P/(6.4*sqrt(C)) + // For P=30, C=3, lagPerPartition=500K, threshold=50K: lagSeverity=10, K=2.7, delta=6.2 + // This allows controlled scaling to ~10-15 tasks (not all the way to max) long highAggregateLag = 30L * 500_000L; int[] highLagCounts = computeValidTaskCounts(30, 3, highAggregateLag, 1, 30, boundariesPlugin, highLagPlugin); - // With capped scaling (1->10 tasks jump max), we won't reach 30 immediately from 3. - // We expect it to reach 10. 
Assert.assertTrue("High lag should allow scaling to 10 tasks", contains(highLagCounts, 10)); + Assert.assertTrue("High lag should allow scaling to 15 tasks", contains(highLagCounts, 15)); Assert.assertFalse("Should not jump straight to 30 from 3", contains(highLagCounts, 30)); // Respects taskCountMax @@ -130,106 +130,6 @@ public void testComputeValidTaskCounts() Assert.assertTrue("Should include values at taskCountMax", contains(bothBounds, 12)); } - @Test - @Ignore - public void testScalingExamplesTable() - { - int partitionCount = 30; - int taskCountMax = 30; - double pollIdleRatio = 0.1; - double avgProcessingRate = 10.0; - - // Create a local autoScaler with taskCountMax matching the test parameters - SupervisorSpec mockSpec = Mockito.mock(SupervisorSpec.class); - SeekableStreamSupervisor mockSupervisor = Mockito.mock(SeekableStreamSupervisor.class); - ServiceEmitter mockEmitter = Mockito.mock(ServiceEmitter.class); - SeekableStreamSupervisorIOConfig mockIoConfig = Mockito.mock(SeekableStreamSupervisorIOConfig.class); - - when(mockSpec.getId()).thenReturn("test-supervisor"); - when(mockSupervisor.getIoConfig()).thenReturn(mockIoConfig); - when(mockIoConfig.getStream()).thenReturn("test-stream"); - - CostBasedAutoScalerConfig localConfig = CostBasedAutoScalerConfig.builder() - .taskCountMax(taskCountMax) - .taskCountMin(1) - .enableTaskAutoScaler(true) - .lagWeight(0.6) - .idleWeight(0.4) - .build(); - - CostBasedAutoScaler localAutoScaler = new CostBasedAutoScaler( - mockSupervisor, - localConfig, - mockSpec, - mockEmitter - ); - - class Example - { - final int currentTasks; - final long lagPerPartition; - final int expectedTasks; - - Example(int currentTasks, long lagPerPartition, int expectedTasks) - { - this.currentTasks = currentTasks; - this.lagPerPartition = lagPerPartition; - this.expectedTasks = expectedTasks; - } - } - - // Updated expectations based on capped sqrt-based scaling - Example[] examples = new Example[]{ - new Example(3, 50_000L, 6), - new 
Example(3, 300_000L, 10), - new Example(3, 500_000L, 10), - new Example(10, 100_000L, 30), - new Example(10, 300_000L, 30), - new Example(10, 500_000L, 30), - new Example(20, 500_000L, 30), - new Example(25, 500_000L, 30) - }; - - OptimalTaskCountBoundariesPlugin boundariesPlugin = new OptimalTaskCountBoundariesPlugin(); - BurstScaleUpOnHighLagPlugin highLagPlugin = new BurstScaleUpOnHighLagPlugin(BurstScaleUpOnHighLagPlugin.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); - - for (Example example : examples) { - long aggregateLag = example.lagPerPartition * partitionCount; - int[] validCounts = computeValidTaskCounts( - partitionCount, - example.currentTasks, - aggregateLag, - 1, - taskCountMax, - boundariesPlugin, - highLagPlugin - ); - Assert.assertTrue( - "Should include expected task count for current=" + example.currentTasks + ", lag=" + example.lagPerPartition, - contains(validCounts, example.expectedTasks) - ); - - CostMetrics metrics = createMetricsWithRate( - example.lagPerPartition, - example.currentTasks, - partitionCount, - pollIdleRatio, - avgProcessingRate - ); - int actualOptimal = localAutoScaler.computeOptimalTaskCount(metrics); - if (actualOptimal == -1) { - actualOptimal = example.currentTasks; - } - Assert.assertEquals( - "Optimal task count should match for current=" + example.currentTasks - + ", lag=" + example.lagPerPartition - + ", valid=" + Arrays.toString(validCounts), - example.expectedTasks, - actualOptimal - ); - } - } - @Test public void testComputeOptimalTaskCount() { diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPluginTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPluginTest.java new file mode 100644 index 000000000000..45cffbd4f5ba --- /dev/null +++ 
b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPluginTest.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins; + +import org.junit.Assert; +import org.junit.Test; + +/** + * Tests for {@link BurstScaleUpOnHighLagPlugin}. + *

+ * The plugin uses a logarithmic formula for burst scaling: + * {@code deltaTasks = K * ln(lagSeverity)} + * where {@code K = (partitionCount / 6.4) / sqrt(currentTaskCount)} + */ +public class BurstScaleUpOnHighLagPluginTest +{ + private static final int LAG_THRESHOLD = BurstScaleUpOnHighLagPlugin.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD; + private static final int PARTITION_COUNT = 48; + private static final int TASK_COUNT_MAX = 48; + + /** + * Tests scaling behavior across different lag levels and task counts. + *

+ * Expected behavior for 48 partitions with threshold=50K: + *

+   * | Current | Lag/Part | Boost | Notes                                      |
+   * |---------|----------|-------|---------------------------------------------|
+   * | any     | <50K     | 0     | Below threshold                             |
+   * | any     | =50K     | 0     | ln(1) = 0                                   |
+   * | 1       | 100K     | 40    | Significant boost for emergency recovery    |
+   * | 1       | 200K     | 43    | Large boost                                 |
+   * | 4       | 200K     | 6     | Moderate boost (K decreases with sqrt(C))   |
+   * | 12      | 200K     | 0     | Delta too small for PPT change              |
+   * | 24      | 200K     | 0     | Delta too small for PPT change              |
+   * 
+ * At high task counts (C=12, C=24), the delta tasks from the formula is small + * (due to K decreasing with sqrt(C)), resulting in no PPT reduction. + */ + @Test + public void testComputeScaleUpBoost() + { + BurstScaleUpOnHighLagPlugin plugin = new BurstScaleUpOnHighLagPlugin(LAG_THRESHOLD); + + // Below threshold: no boost + Assert.assertEquals( + "Below threshold should return 0", + 0, + plugin.computeScaleUpBoost(PARTITION_COUNT * 40_000L, PARTITION_COUNT, 4, TASK_COUNT_MAX) + ); + + // At threshold (lagSeverity=1, ln(1)=0): no boost + Assert.assertEquals( + "At threshold should return 0", + 0, + plugin.computeScaleUpBoost(PARTITION_COUNT * 50_000L, PARTITION_COUNT, 4, TASK_COUNT_MAX) + ); + + // C=1, 100K lag (2x threshold): significant boost for emergency recovery + int boost1_100k = plugin.computeScaleUpBoost(PARTITION_COUNT * 100_000L, PARTITION_COUNT, 1, TASK_COUNT_MAX); + Assert.assertEquals("C=1, 100K lag boost", 40, boost1_100k); + + // C=1, 200K lag (4x threshold): large boost + int boost1_200k = plugin.computeScaleUpBoost(PARTITION_COUNT * 200_000L, PARTITION_COUNT, 1, TASK_COUNT_MAX); + Assert.assertEquals("C=1, 200K lag boost", 43, boost1_200k); + + // C=4, 200K lag: moderate boost (K decreases with sqrt(C)) + int boost4_200k = plugin.computeScaleUpBoost(PARTITION_COUNT * 200_000L, PARTITION_COUNT, 4, TASK_COUNT_MAX); + Assert.assertEquals("C=4, 200K lag boost", 6, boost4_200k); + + // C=12, 200K lag: delta too small to change PPT + int boost12_200k = plugin.computeScaleUpBoost(PARTITION_COUNT * 200_000L, PARTITION_COUNT, 12, TASK_COUNT_MAX); + Assert.assertEquals("C=12, 200K lag boost", 0, boost12_200k); + + // C=24, 200K lag: delta too small to change PPT + int boost24_200k = plugin.computeScaleUpBoost(PARTITION_COUNT * 200_000L, PARTITION_COUNT, 24, TASK_COUNT_MAX); + Assert.assertEquals("C=24, 200K lag boost", 0, boost24_200k); + } + + @Test + public void testComputeScaleUpBoostInvalidInputs() + { + BurstScaleUpOnHighLagPlugin plugin = new 
BurstScaleUpOnHighLagPlugin(LAG_THRESHOLD); + + Assert.assertEquals(0, plugin.computeScaleUpBoost(1_000_000, 0, 4, 48)); + Assert.assertEquals(0, plugin.computeScaleUpBoost(1_000_000, 48, 0, 48)); + Assert.assertEquals(0, plugin.computeScaleUpBoost(1_000_000, 48, 4, 0)); + Assert.assertEquals(0, plugin.computeScaleUpBoost(1_000_000, -1, 4, 48)); + } + + @Test + public void testLagThreshold() + { + int customThreshold = 100_000; + BurstScaleUpOnHighLagPlugin plugin = new BurstScaleUpOnHighLagPlugin(customThreshold); + Assert.assertEquals(customThreshold, plugin.lagThreshold()); + } +} From e3b27330cd516ecccd06f85b1781e1addf115c07 Mon Sep 17 00:00:00 2001 From: Sasha Syrotenko Date: Wed, 4 Feb 2026 17:26:27 +0200 Subject: [PATCH 10/14] Get rid of useBurstScaleOnHeavyLag flag --- docs/ingestion/supervisor.md | 11 +++--- .../autoscaler/CostBasedAutoScaler.java | 2 +- .../autoscaler/CostBasedAutoScalerConfig.java | 34 +++---------------- .../plugins/BurstScaleUpOnHighLagPlugin.java | 3 -- .../CostBasedAutoScalerConfigTest.java | 2 +- .../autoscaler/CostBasedAutoScalerTest.java | 2 +- .../BurstScaleUpOnHighLagPluginTest.java | 2 +- 7 files changed, 13 insertions(+), 43 deletions(-) diff --git a/docs/ingestion/supervisor.md b/docs/ingestion/supervisor.md index 3e4757253e6d..e71f102ffdc7 100644 --- a/docs/ingestion/supervisor.md +++ b/docs/ingestion/supervisor.md @@ -202,15 +202,14 @@ Note: Kinesis is not supported yet, support is in progress. The following table outlines the configuration properties related to the `costBased` autoscaler strategy: -| Property|Description|Required|Default| -|---------|---------------------------------------------------|---|-----| +| Property|Description|Required| Default | +|---------|---------------------------------------------------|---|--| |`scaleActionPeriodMillis`|The frequency in milliseconds to check if a scale action is triggered. 
| No | 60000 | -|`lagWeight`|The weight of extracted lag value in cost function.| No| 0.25| +|`lagWeight`|The weight of extracted lag value in cost function.| No| 0.25 | |`idleWeight`|The weight of extracted poll idle value in cost function. | No | 0.75 | |`defaultProcessingRate`|A planned processing rate per task, required for first cost estimations. | No | 1000 | -|`useTaskCountBoundaries`|Enables the bounded partitions-per-task window when selecting task counts.|No|`false`| -|`useBurstScaleOnHeavyLag`|Enables burst scale-up when per-partition lag is high.|No|`false`| -|`highLagThreshold`|Per-partition lag threshold that triggers burst scale-up when `useBurstScaleOnHeavyLag` is enabled.|No|50000| +|`useTaskCountBoundaries`|Enables the bounded partitions-per-task window when selecting task counts.|No| `false` | +|`highLagThreshold`|Per-partition lag threshold that triggers burst scale-up when set to `0` or higher. Set to a negative value to disable burst scale-up.|No|-1| |`minScaleDownDelay`| Minimum duration between successful scale actions, specified as an ISO-8601 duration string. | No | `PT30M` | |`scaleDownDuringTaskRolloverOnly`| Indicates whether task scaling down is limited to periods during task rollovers only. 
| No | False | diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java index 3464e6a0475d..8f2e5c55ce26 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java @@ -104,7 +104,7 @@ public CostBasedAutoScaler( this.boundariesPlugin = new OptimalTaskCountBoundariesPlugin(); } - if (config.shouldUseBurstScaleOnHeavyLag()) { + if (config.getHighLagThreshold() >= 0) { this.burstScaleUpPlugin = new BurstScaleUpOnHighLagPlugin(config.getHighLagThreshold()); } } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java index 606fbea570c3..df397d5d59e1 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java @@ -28,7 +28,6 @@ import org.apache.druid.indexing.overlord.supervisor.SupervisorSpec; import org.apache.druid.indexing.overlord.supervisor.autoscaler.SupervisorTaskAutoScaler; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor; -import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins.BurstScaleUpOnHighLagPlugin; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.joda.time.Duration; @@ -68,11 +67,6 @@ public class CostBasedAutoScalerConfig implements AutoScalerConfig * considering only task 
counts within a certain PPT-based window around the current PPT. */ private final boolean useTaskCountBoundaries; - /** - * Enables or disables {@code BurstScaleUpOnHighLagPlugin} which allows - * applying burst scale-up when high lag is detected. - */ - private final boolean useBurstScaleOnHeavyLag; /** * Per-partition lag threshold allowing to activate a burst scaleup to eliminate high lag. */ @@ -102,7 +96,6 @@ public CostBasedAutoScalerConfig( @Nullable @JsonProperty("idleWeight") Double idleWeight, @Nullable @JsonProperty("defaultProcessingRate") Double defaultProcessingRate, @Nullable @JsonProperty("useTaskCountBoundaries") Boolean useTaskCountBoundaries, - @Nullable @JsonProperty("useBurstScaleOnHeavyLag") Boolean useBurstScaleOnHeavyLag, @Nullable @JsonProperty("highLagThreshold") Integer highLagThreshold, @Nullable @JsonProperty("minScaleDownDelay") Duration minScaleDownDelay, @Nullable @JsonProperty("scaleDownDuringTaskRolloverOnly") Boolean scaleDownDuringTaskRolloverOnly @@ -124,11 +117,7 @@ public CostBasedAutoScalerConfig( this.idleWeight = Configs.valueOrDefault(idleWeight, DEFAULT_IDLE_WEIGHT); this.defaultProcessingRate = Configs.valueOrDefault(defaultProcessingRate, DEFAULT_PROCESSING_RATE); this.useTaskCountBoundaries = Configs.valueOrDefault(useTaskCountBoundaries, false); - this.useBurstScaleOnHeavyLag = Configs.valueOrDefault(useBurstScaleOnHeavyLag, false); - this.highLagThreshold = Configs.valueOrDefault( - highLagThreshold, - this.useBurstScaleOnHeavyLag ? 
BurstScaleUpOnHighLagPlugin.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD : -1 - ); + this.highLagThreshold = Configs.valueOrDefault(highLagThreshold, -1); this.minScaleDownDelay = Configs.valueOrDefault(minScaleDownDelay, DEFAULT_MIN_SCALE_DELAY); this.scaleDownDuringTaskRolloverOnly = Configs.valueOrDefault(scaleDownDuringTaskRolloverOnly, false); @@ -242,12 +231,6 @@ public boolean shouldUseTaskCountBoundaries() return useTaskCountBoundaries; } - @JsonProperty("useBurstScaleOnHeavyLag") - public boolean shouldUseBurstScaleOnHeavyLag() - { - return useBurstScaleOnHeavyLag; - } - @JsonProperty("highLagThreshold") public int getHighLagThreshold() { @@ -293,11 +276,11 @@ public boolean equals(Object o) && Double.compare(that.idleWeight, idleWeight) == 0 && Double.compare(that.defaultProcessingRate, defaultProcessingRate) == 0 && useTaskCountBoundaries == that.useTaskCountBoundaries - && useBurstScaleOnHeavyLag == that.useBurstScaleOnHeavyLag && Objects.equals(minScaleDownDelay, that.minScaleDownDelay) && scaleDownDuringTaskRolloverOnly == that.scaleDownDuringTaskRolloverOnly && Objects.equals(taskCountStart, that.taskCountStart) - && Objects.equals(stopTaskCountRatio, that.stopTaskCountRatio); + && Objects.equals(stopTaskCountRatio, that.stopTaskCountRatio) + && highLagThreshold == that.highLagThreshold; } @Override @@ -315,7 +298,7 @@ public int hashCode() idleWeight, defaultProcessingRate, useTaskCountBoundaries, - useBurstScaleOnHeavyLag, + highLagThreshold, minScaleDownDelay, scaleDownDuringTaskRolloverOnly ); @@ -336,7 +319,6 @@ public String toString() ", idleWeight=" + idleWeight + ", defaultProcessingRate=" + defaultProcessingRate + ", useTaskCountBoundaries=" + useTaskCountBoundaries + - ", useBurstScaleOnHeavyLag=" + useBurstScaleOnHeavyLag + ", highLagThreshold=" + highLagThreshold + ", minScaleDownDelay=" + minScaleDownDelay + ", scaleDownDuringTaskRolloverOnly=" + scaleDownDuringTaskRolloverOnly + @@ -360,7 +342,6 @@ public static class Builder private 
Double idleWeight; private Double defaultProcessingRate; private Boolean useTaskCountBoundaries; - private Boolean useBurstScaleOnHeavyLag; private Integer highLagThreshold; private Duration minScaleDownDelay; private Boolean scaleDownDuringTaskRolloverOnly; @@ -447,12 +428,6 @@ public Builder useTaskCountBoundaries(boolean useTaskCountBoundaries) return this; } - public Builder useBurstScaleOnHeavyLag(boolean useBurstScaleOnHeavyLag) - { - this.useBurstScaleOnHeavyLag = useBurstScaleOnHeavyLag; - return this; - } - public Builder highLagThreshold(int highLagThreshold) { this.highLagThreshold = highLagThreshold; @@ -473,7 +448,6 @@ public CostBasedAutoScalerConfig build() idleWeight, defaultProcessingRate, useTaskCountBoundaries, - useBurstScaleOnHeavyLag, highLagThreshold, minScaleDownDelay, scaleDownDuringTaskRolloverOnly diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPlugin.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPlugin.java index 15f7da8ce882..53fc780317b9 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPlugin.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPlugin.java @@ -23,9 +23,6 @@ public final class BurstScaleUpOnHighLagPlugin { - // Base PPT lag threshold allowing to activate a burst scaleup to eliminate high lag. - public static final int EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD = 50_000; - /** * Divisor for partition count in the K formula: K = (partitionCount / K_PARTITION_DIVISOR) / sqrt(currentTaskCount). * This controls how aggressive the scaling is relative to partition count. 
diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfigTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfigTest.java index 89b9742b7be0..96e15672bd13 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfigTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfigTest.java @@ -108,7 +108,7 @@ public void testSerdeWithDefaults() throws Exception Assert.assertFalse(config.isScaleDownOnTaskRolloverOnly()); Assert.assertNull(config.getTaskCountStart()); Assert.assertNull(config.getStopTaskCountRatio()); - // When useBurstScaleOnHeavyLag is not set (default: false), highLagThreshold defaults to -1 + // When highLagThreshold is not set, it defaults to -1 (burst scale-up disabled) Assert.assertEquals(-1, config.getHighLagThreshold()); } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java index 522be184ba7e..2329328610d9 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java @@ -76,7 +76,7 @@ public void setUp() public void testComputeValidTaskCounts() { OptimalTaskCountBoundariesPlugin boundariesPlugin = new OptimalTaskCountBoundariesPlugin(); - BurstScaleUpOnHighLagPlugin highLagPlugin = new BurstScaleUpOnHighLagPlugin(BurstScaleUpOnHighLagPlugin.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD); + BurstScaleUpOnHighLagPlugin highLagPlugin = new 
BurstScaleUpOnHighLagPlugin(50_000); // For 100 partitions at 25 tasks (4 partitions/task), valid counts include 25 and 34 int[] validTaskCounts = computeValidTaskCounts(100, 25, 0L, 1, 100, boundariesPlugin, highLagPlugin); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPluginTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPluginTest.java index 45cffbd4f5ba..ffe1b21ffd55 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPluginTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPluginTest.java @@ -31,7 +31,7 @@ */ public class BurstScaleUpOnHighLagPluginTest { - private static final int LAG_THRESHOLD = BurstScaleUpOnHighLagPlugin.EXTRA_SCALING_LAG_PER_PARTITION_THRESHOLD; + private static final int LAG_THRESHOLD = 50_000; private static final int PARTITION_COUNT = 48; private static final int TASK_COUNT_MAX = 48; From de7f15f243af94fa7b499962275fa82fbae85d3e Mon Sep 17 00:00:00 2001 From: Sasha Syrotenko Date: Thu, 5 Feb 2026 10:58:14 +0200 Subject: [PATCH 11/14] Align idle decay with task PPT boundary --- .../CostBasedAutoScalerIntegrationTest.java | 10 +- .../autoscaler/CostBasedAutoScaler.java | 108 ++++++--- .../autoscaler/WeightedCostFunction.java | 48 ++-- .../plugins/BurstScaleUpOnHighLagPlugin.java | 89 ------- .../OptimalTaskCountBoundariesPlugin.java | 33 --- ...CostBasedAutoScalerHighLagScalingTest.java | 154 ++++++++++++ .../autoscaler/CostBasedAutoScalerTest.java | 222 ++++++++++++++---- .../autoscaler/WeightedCostFunctionTest.java | 118 +++++----- .../BurstScaleUpOnHighLagPluginTest.java | 114 --------- 9 files changed, 479 insertions(+), 417 deletions(-) delete mode 100644 
indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPlugin.java delete mode 100644 indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/OptimalTaskCountBoundariesPlugin.java create mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerHighLagScalingTest.java delete mode 100644 indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPluginTest.java diff --git a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/autoscaler/CostBasedAutoScalerIntegrationTest.java b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/autoscaler/CostBasedAutoScalerIntegrationTest.java index b160448c0263..1138aacf6cfc 100644 --- a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/autoscaler/CostBasedAutoScalerIntegrationTest.java +++ b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/autoscaler/CostBasedAutoScalerIntegrationTest.java @@ -128,13 +128,13 @@ public void test_autoScaler_computesOptimalTaskCountAndProduceScaleDown() .taskCountMin(1) .taskCountMax(100) .taskCountStart(initialTaskCount) - .scaleActionPeriodMillis(1500) - .minTriggerScaleActionFrequencyMillis(3000) + .scaleActionPeriodMillis(1900) + .minTriggerScaleActionFrequencyMillis(2000) // Weight configuration: strongly favor lag reduction over idle time .lagWeight(0.9) .idleWeight(0.1) .scaleDownDuringTaskRolloverOnly(false) - .minScaleDownDelay(Duration.millis(1500)) + .minScaleDownDelay(Duration.ZERO) .build(); final KafkaSupervisorSpec spec = createKafkaSupervisorWithAutoScaler(superId, autoScalerConfig, initialTaskCount); @@ -148,10 +148,10 @@ public void test_autoScaler_computesOptimalTaskCountAndProduceScaleDown() .hasDimension(DruidMetrics.DATASOURCE, dataSource)); // Wait for 
autoscaler to emit optimalTaskCount metric indicating scale-down - // We expect the optimal task count to 4 + // We expect the optimal task count less than 6 overlord.latchableEmitter().waitForEvent( event -> event.hasMetricName(OPTIMAL_TASK_COUNT_METRIC) - .hasValueMatching(Matchers.equalTo(6L)) + .hasValueMatching(Matchers.lessThanOrEqualTo(6L)) ); // Suspend the supervisor diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java index 8f2e5c55ce26..49d85e601636 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java @@ -27,8 +27,6 @@ import org.apache.druid.indexing.overlord.supervisor.autoscaler.SupervisorTaskAutoScaler; import org.apache.druid.indexing.seekablestream.SeekableStreamIndexTaskRunner; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor; -import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins.BurstScaleUpOnHighLagPlugin; -import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins.OptimalTaskCountBoundariesPlugin; import org.apache.druid.java.util.common.DateTimes; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.concurrent.Execs; @@ -38,7 +36,6 @@ import org.apache.druid.query.DruidMetrics; import org.apache.druid.segment.incremental.RowIngestionMeters; -import javax.annotation.Nullable; import java.util.Map; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; @@ -63,6 +60,16 @@ public class CostBasedAutoScaler implements SupervisorTaskAutoScaler public static final String IDLE_COST_METRIC = 
"task/autoScaler/costBased/idleCost"; public static final String OPTIMAL_TASK_COUNT_METRIC = "task/autoScaler/costBased/optimalTaskCount"; + static final int MAX_INCREASE_IN_PARTITIONS_PER_TASK = 2; + static final int MAX_DECREASE_IN_PARTITIONS_PER_TASK = MAX_INCREASE_IN_PARTITIONS_PER_TASK * 2; + + /** + * Divisor for partition count in the K formula: K = (partitionCount / K_PARTITION_DIVISOR) / sqrt(currentTaskCount). + * This controls how aggressive the scaling is relative to partition count. + * That value was chosen by carefully analyzing the math model behind the implementation. + */ + static final double K_PARTITION_DIVISOR = 6.4; + private final String supervisorId; private final SeekableStreamSupervisor supervisor; private final ServiceEmitter emitter; @@ -71,8 +78,6 @@ public class CostBasedAutoScaler implements SupervisorTaskAutoScaler private final ServiceMetricEvent.Builder metricBuilder; private final ScheduledExecutorService autoscalerExecutor; private final WeightedCostFunction costFunction; - private OptimalTaskCountBoundariesPlugin boundariesPlugin = null; - private BurstScaleUpOnHighLagPlugin burstScaleUpPlugin = null; private volatile CostMetrics lastKnownMetrics; private volatile long lastScaleActionTimeMillis = -1; @@ -99,16 +104,9 @@ public CostBasedAutoScaler( DruidMetrics.STREAM, this.supervisor.getIoConfig().getStream() ); - if (config.shouldUseTaskCountBoundaries()) { - //noinspection InstantiationOfUtilityClass - this.boundariesPlugin = new OptimalTaskCountBoundariesPlugin(); - } - - if (config.getHighLagThreshold() >= 0) { - this.burstScaleUpPlugin = new BurstScaleUpOnHighLagPlugin(config.getHighLagThreshold()); - } } + @SuppressWarnings("unchecked") @Override public void start() { @@ -209,8 +207,8 @@ int computeOptimalTaskCount(CostMetrics metrics) (long) metrics.getAggregateLag(), config.getTaskCountMin(), config.getTaskCountMax(), - boundariesPlugin, - burstScaleUpPlugin + config.shouldUseTaskCountBoundaries(), + 
config.getHighLagThreshold() ); if (validTaskCounts.length == 0) { @@ -222,14 +220,19 @@ int computeOptimalTaskCount(CostMetrics metrics) CostResult optimalCost = new CostResult(); for (int taskCount : validTaskCounts) { - CostResult costResult = costFunction.computeCost(metrics, taskCount, config, burstScaleUpPlugin); + CostResult costResult = costFunction.computeCost(metrics, taskCount, config); double cost = costResult.totalCost(); log.info( - "Proposed task count[%d] has total Cost[%.4f] = lagCost[%.4f] + idleCost[%.4f]", + "Proposed task count[%d] has total cost[%.4f] = lagCost[%.4f] + idleCost[%.4f]." + + " Stats: avgPartitionLag[%.1f], pollIdleRatio[%.1f], lagWeight[%.1f], idleWeight[%.1f]", taskCount, cost, costResult.lagCost(), - costResult.idleCost() + costResult.idleCost(), + metrics.getAggregateLag(), + metrics.getPollIdleRatio(), + config.getLagWeight(), + config.getIdleWeight() ); if (cost < optimalCost.totalCost()) { optimalTaskCount = taskCount; @@ -265,15 +268,15 @@ int computeOptimalTaskCount(CostMetrics metrics) * * @return sorted list of valid task counts within bounds */ - @SuppressWarnings({"VariableNotUsedInsideIf", "ReassignedVariable"}) + @SuppressWarnings({"ReassignedVariable"}) static int[] computeValidTaskCounts( int partitionCount, int currentTaskCount, double aggregateLag, int taskCountMin, int taskCountMax, - @Nullable OptimalTaskCountBoundariesPlugin taskCountBoundariesPlugin, - @Nullable BurstScaleUpOnHighLagPlugin highLagPlugin + boolean isTaskCountBoundariesEnabled, + int highLagThreshold ) { if (partitionCount <= 0 || currentTaskCount <= 0) { @@ -287,21 +290,28 @@ static int[] computeValidTaskCounts( int minPartitionsPerTask = partitionCount / taskCountMax; int maxPartitionsPerTask = partitionCount / taskCountMin; - if (taskCountBoundariesPlugin != null) { + if (isTaskCountBoundariesEnabled) { maxPartitionsPerTask = Math.min( partitionCount, - currentPartitionsPerTask + 
OptimalTaskCountBoundariesPlugin.MAX_DECREASE_IN_PARTITIONS_PER_TASK + currentPartitionsPerTask + MAX_DECREASE_IN_PARTITIONS_PER_TASK ); int extraIncrease = 0; - if (highLagPlugin != null && highLagPlugin.lagThreshold() > 0) { - extraIncrease = highLagPlugin.computeScaleUpBoost(aggregateLag, partitionCount, currentTaskCount, taskCountMax); + if (highLagThreshold > 0) { + extraIncrease = computeExtraPPTIncrease( + highLagThreshold, + aggregateLag, + partitionCount, + currentTaskCount, + taskCountMax + ); } - int effectiveMaxIncrease = OptimalTaskCountBoundariesPlugin.MAX_INCREASE_IN_PARTITIONS_PER_TASK + extraIncrease; + int effectiveMaxIncrease = MAX_INCREASE_IN_PARTITIONS_PER_TASK + extraIncrease; minPartitionsPerTask = Math.max(minPartitionsPerTask, currentPartitionsPerTask - effectiveMaxIncrease); } - for (int partitionsPerTask = maxPartitionsPerTask; partitionsPerTask >= minPartitionsPerTask && partitionsPerTask != 0; partitionsPerTask--) { + for (int partitionsPerTask = maxPartitionsPerTask; partitionsPerTask >= minPartitionsPerTask + && partitionsPerTask != 0; partitionsPerTask--) { final int taskCount = (partitionCount + partitionsPerTask - 1) / partitionsPerTask; if (taskCount >= taskCountMin && taskCount <= taskCountMax) { result.add(taskCount); @@ -310,6 +320,50 @@ static int[] computeValidTaskCounts( return result.toIntArray(); } + /** + * Computes extra allowed increase in partitions-per-task in scenarios when the average per-partition lag + * is above the configured threshold. + *

+ * This uses a logarithmic formula for consistent absolute growth: + * {@code deltaTasks = K * ln(lagSeverity)} + * where {@code K = (partitionCount / 6.4) / sqrt(currentTaskCount)} + *

+ * This ensures that small taskCounts get a massive relative boost, + * while large taskCounts receive more measured, stable increases. + */ + static int computeExtraPPTIncrease( + double lagThreshold, + double aggregateLag, + int partitionCount, + int currentTaskCount, + int taskCountMax + ) + { + if (partitionCount <= 0 || taskCountMax <= 0 || currentTaskCount <= 0) { + return 0; + } + + final double lagPerPartition = aggregateLag / partitionCount; + if (lagPerPartition < lagThreshold) { + return 0; + } + + final double lagSeverity = lagPerPartition / lagThreshold; + + // Logarithmic growth: ln(lagSeverity) is positive when lagSeverity > 1 + // First multiplier decreases with sqrt(currentTaskCount): aggressive when small, conservative when large + final double deltaTasks = (partitionCount / K_PARTITION_DIVISOR) / Math.sqrt(currentTaskCount) * Math.log( + lagSeverity); + + final double targetTaskCount = Math.min(taskCountMax, (double) currentTaskCount + deltaTasks); + + // Compute precise PPT reduction to avoid early integer truncation artifacts + final double currentPPT = (double) partitionCount / currentTaskCount; + final double targetPPT = (double) partitionCount / targetTaskCount; + + return Math.max(0, (int) Math.floor(currentPPT - targetPPT)); + } + /** * Extracts the average poll-idle-ratio metric from task stats. * This metric indicates how much time the consumer spends idle waiting for data. 
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunction.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunction.java index ef7dcb8a8828..01b35242ed84 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunction.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunction.java @@ -19,7 +19,6 @@ package org.apache.druid.indexing.seekablestream.supervisor.autoscaler; -import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins.BurstScaleUpOnHighLagPlugin; import org.apache.druid.java.util.common.logger.Logger; /** @@ -31,18 +30,11 @@ public class WeightedCostFunction { private static final Logger log = new Logger(WeightedCostFunction.class); /** - * Represents the maximum multiplier factor applied to amplify lag-based costs in the cost computation process. - * This value is used to cap the lag amplification effect to prevent excessively high cost inflation - * caused by significant partition lag. - * It ensures that lag-related adjustments remain bounded within a reasonable range for stability of - * cost-based auto-scaling decisions. + * The lag severity at which lagBusyFactor reaches 1.0 (full idle suppression). + * lagSeverity is defined as lagPerPartition / highLagThreshold. + * At severity=1 (threshold), lagBusyFactor=0. At severity=MAX, lagBusyFactor=1.0. */ - private static final double LAG_AMPLIFICATION_MAX_MULTIPLIER = 2.0; - /** - * Multiplier for computing the maximum lag per partition used in lag amplification. - * The max lag is calculated as: aggressiveScalingLagPerPartitionThreshold * LAG_AMPLIFICATION_MAX_LAG_MULTIPLIER. 
- */ - private static final int LAG_AMPLIFICATION_MAX_LAG_MULTIPLIER = 5; + private static final int LAG_AMPLIFICATION_MAX_SEVERITY = 5; /** * Computes cost for a given task count using compute time metrics. @@ -62,8 +54,7 @@ public class WeightedCostFunction public CostResult computeCost( CostMetrics metrics, int proposedTaskCount, - CostBasedAutoScalerConfig config, - BurstScaleUpOnHighLagPlugin highLagPlugin + CostBasedAutoScalerConfig config ) { if (metrics == null || config == null || proposedTaskCount <= 0 || metrics.getPartitionCount() <= 0) { @@ -86,7 +77,7 @@ public CostResult computeCost( lagRecoveryTime = metrics.getAggregateLag() / (proposedTaskCount * avgProcessingRate); } - final double predictedIdleRatio = estimateIdleRatio(metrics, proposedTaskCount, highLagPlugin); + final double predictedIdleRatio = estimateIdleRatio(metrics, proposedTaskCount, config.getHighLagThreshold()); final double idleCost = proposedTaskCount * metrics.getTaskDurationSeconds() * predictedIdleRatio; final double lagCost = config.getLagWeight() * lagRecoveryTime; final double weightedIdleCost = config.getIdleWeight() * idleCost; @@ -107,20 +98,16 @@ public CostResult computeCost( /** * Estimates the idle ratio for a proposed task count. - * Includes lag-based adjustment to eliminate high lag and - * reduce predicted idle when work exists. - *

- * Formulas: - * {@code linearPrediction = max(0, 1 - busyFraction / taskRatio)} - * {@code lagBusyFactor = 1 - exp(-lagPerTask / LAG_SCALE_FACTOR)} - * {@code adjustedPrediction = linearPrediction × (1 - lagBusyFactor)} + * Includes lag-based adjustment to suppress predicted idle when lag exceeds the threshold, + * encouraging scale-up when there is real work to do. + * The algorithm is adjusted with {@code computeExtraPPTIncrease}, so they may work in tandem, when enabled. * * @param metrics current system metrics containing idle ratio and task count * @param taskCount target task count to estimate an idle ratio for * @return estimated idle ratio in range [0.0, 1.0] */ @SuppressWarnings("ExtractMethodRecommender") - private double estimateIdleRatio(CostMetrics metrics, int taskCount, BurstScaleUpOnHighLagPlugin highLagPlugin) + private double estimateIdleRatio(CostMetrics metrics, int taskCount, int highLagThreshold) { final double currentPollIdleRatio = metrics.getPollIdleRatio(); @@ -142,17 +129,10 @@ private double estimateIdleRatio(CostMetrics metrics, int taskCount, BurstScaleU final double lagPerPartition = metrics.getAggregateLag() / metrics.getPartitionCount(); double lagBusyFactor = 0.; - // Lag-amplified idle decay - if (highLagPlugin != null && lagPerPartition >= highLagPlugin.lagThreshold()) { - int extraThreshold = highLagPlugin.lagThreshold(); - final double lagPerTask = metrics.getAggregateLag() / taskCount; - lagBusyFactor = 1.0 - Math.exp(-lagPerTask / extraThreshold); - - final long lagAmplificationMaxLagPerPartition = (long) extraThreshold * LAG_AMPLIFICATION_MAX_LAG_MULTIPLIER; - final double rampDenominator = lagAmplificationMaxLagPerPartition - (double) extraThreshold; - final double ramp = Math.min(1.0, Math.max(0.0, (lagPerPartition - extraThreshold) / rampDenominator)); - - lagBusyFactor = Math.min(1.0, lagBusyFactor * (1.0 + ramp * (LAG_AMPLIFICATION_MAX_MULTIPLIER - 1.0))); + // Lag-amplified idle decay using ln(lagSeverity) / 
ln(maxSeverity). + if (highLagThreshold > 0 && lagPerPartition >= highLagThreshold) { + final double lagSeverity = lagPerPartition / highLagThreshold; + lagBusyFactor = Math.min(1.0, Math.log(lagSeverity) / Math.log(LAG_AMPLIFICATION_MAX_SEVERITY)); } // Clamp to valid range [0, 1] diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPlugin.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPlugin.java deleted file mode 100644 index 53fc780317b9..000000000000 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPlugin.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins; - -@SuppressWarnings("ClassCanBeRecord") -public final class BurstScaleUpOnHighLagPlugin -{ - - /** - * Divisor for partition count in the K formula: K = (partitionCount / K_PARTITION_DIVISOR) / sqrt(currentTaskCount). - * This controls how aggressive the scaling is relative to partition count. 
- * That value was chosen by carefully analyzing the math model behind the implementation. - */ - private static final double K_PARTITION_DIVISOR = 6.4; - - private final int lagThreshold; - - public BurstScaleUpOnHighLagPlugin(int lagThreshold) - { - this.lagThreshold = lagThreshold; - } - - public int lagThreshold() - { - return lagThreshold; - } - - /** - * Computes extra allowed increase in partitions-per-task in scenarios when the average per-partition lag - * is above the configured threshold. - *

- * This uses a logarithmic formula for consistent absolute growth: - * {@code deltaTasks = K * ln(lagSeverity)} - * where {@code K = (partitionCount / 6.4) / sqrt(currentTaskCount)} - *

- * This ensures: - * 1. Partition-aware scaling: larger datasets get more aggressive scaling. - * 2. Small taskCount's get a massive relative boost, while large taskCount's receive more measured, stable increases. - * 3. Logarithmic lag response: diminishing returns at extreme lag values. - */ - public int computeScaleUpBoost( - double aggregateLag, - int partitionCount, - int currentTaskCount, - int taskCountMax - ) - { - if (partitionCount <= 0 || taskCountMax <= 0 || currentTaskCount <= 0) { - return 0; - } - - final double lagPerPartition = aggregateLag / partitionCount; - if (lagPerPartition < lagThreshold) { - return 0; - } - - final double lagSeverity = lagPerPartition / lagThreshold; - - - // Logarithmic growth: ln(lagSeverity) is positive when lagSeverity > 1 - // First multoplier decreases with sqrt(currentTaskCount): aggressive when small, conservative when large - final double deltaTasks = (partitionCount / K_PARTITION_DIVISOR) / Math.sqrt(currentTaskCount) * Math.log(lagSeverity); - - final double targetTaskCount = Math.min(taskCountMax, (double) currentTaskCount + deltaTasks); - - // Compute precise PPT reduction to avoid early integer truncation artifacts - final double currentPPT = (double) partitionCount / currentTaskCount; - final double targetPPT = (double) partitionCount / targetTaskCount; - - return Math.max(0, (int) Math.floor(currentPPT - targetPPT)); - } -} diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/OptimalTaskCountBoundariesPlugin.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/OptimalTaskCountBoundariesPlugin.java deleted file mode 100644 index 7c0ea398aee4..000000000000 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/OptimalTaskCountBoundariesPlugin.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under 
one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins; - -/** - * - */ -public class OptimalTaskCountBoundariesPlugin -{ - public static final int MAX_INCREASE_IN_PARTITIONS_PER_TASK = 2; - public static final int MAX_DECREASE_IN_PARTITIONS_PER_TASK = MAX_INCREASE_IN_PARTITIONS_PER_TASK * 2; - - public OptimalTaskCountBoundariesPlugin() - { - } -} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerHighLagScalingTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerHighLagScalingTest.java new file mode 100644 index 000000000000..122a449c5f79 --- /dev/null +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerHighLagScalingTest.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.indexing.seekablestream.supervisor.autoscaler; + +import org.junit.Assert; +import org.junit.Test; + +/** + * Tests for {@link CostBasedAutoScaler#computeExtraPPTIncrease}. + *

+ * The burst scaling uses a logarithmic formula: + * {@code deltaTasks = K * ln(lagSeverity)} + * where {@code K = (partitionCount / 6.4) / sqrt(currentTaskCount)}. + */ +public class CostBasedAutoScalerHighLagScalingTest +{ + private static final int LAG_THRESHOLD = 50_000; + private static final int PARTITION_COUNT = 48; + private static final int TASK_COUNT_MAX = 48; + + /** + * Tests scaling behavior across different lag levels and task counts. + *

+ * Expected behavior for 48 partitions with threshold=50K: + *

+   * | Current | Lag/Part | PPT reduction | Notes                                   |
+   * |---------|----------|---------------|-----------------------------------------|
+   * | any     | <50K     | 0             | Below threshold                         |
+   * | any     | =50K     | 0             | ln(1) = 0                               |
+   * | 1       | 100K     | 40            | Significant boost for recovery          |
+   * | 1       | 200K     | 43            | Large boost                             |
+   * | 4       | 200K     | 6             | Moderate boost                          |
+   * | 12      | 200K     | 0             | Delta too small for PPT change          |
+   * | 24      | 200K     | 0             | Delta too small for PPT change          |
+   * 
+ */ + @Test + public void testComputeExtraPPTIncrease() + { + // Below threshold: no boost + Assert.assertEquals( + "Below threshold should not increase PPT", + 0, + CostBasedAutoScaler.computeExtraPPTIncrease( + LAG_THRESHOLD, + PARTITION_COUNT * 40_000L, + PARTITION_COUNT, + 4, + TASK_COUNT_MAX + ) + ); + + // At threshold (lagSeverity=1, ln(1)=0): no boost + Assert.assertEquals( + "At threshold (ln(1)=0) should not increase PPT", + 0, + CostBasedAutoScaler.computeExtraPPTIncrease( + LAG_THRESHOLD, + PARTITION_COUNT * 50_000L, + PARTITION_COUNT, + 4, + TASK_COUNT_MAX + ) + ); + + // C=1, 100K lag (2x threshold): significant boost for emergency recovery + int boost1_100k = CostBasedAutoScaler.computeExtraPPTIncrease( + LAG_THRESHOLD, + PARTITION_COUNT * 100_000L, + PARTITION_COUNT, + 1, + TASK_COUNT_MAX + ); + Assert.assertEquals("C=1, 100K lag boost", 40, boost1_100k); + + // C=1, 200K lag (4x threshold): large boost + int boost1_200k = CostBasedAutoScaler.computeExtraPPTIncrease( + LAG_THRESHOLD, + PARTITION_COUNT * 200_000L, + PARTITION_COUNT, + 1, + TASK_COUNT_MAX + ); + Assert.assertEquals("C=1, 200K lag boost", 43, boost1_200k); + + // C=4, 200K lag: moderate boost (K decreases with sqrt(C)) + int boost4_200k = CostBasedAutoScaler.computeExtraPPTIncrease( + LAG_THRESHOLD, + PARTITION_COUNT * 200_000L, + PARTITION_COUNT, + 4, + TASK_COUNT_MAX + ); + Assert.assertEquals("C=4, 200K lag should yield a modest PPT increase", 6, boost4_200k); + + // C=12, 200K lag: delta too small to change PPT + int boost12_200k = CostBasedAutoScaler.computeExtraPPTIncrease( + LAG_THRESHOLD, + PARTITION_COUNT * 200_000L, + PARTITION_COUNT, + 12, + TASK_COUNT_MAX + ); + Assert.assertEquals("C=12, 200K lag should not change PPT", 0, boost12_200k); + + // C=24, 200K lag: delta too small to change PPT + int boost24_200k = CostBasedAutoScaler.computeExtraPPTIncrease( + LAG_THRESHOLD, + PARTITION_COUNT * 200_000L, + PARTITION_COUNT, + 24, + TASK_COUNT_MAX + ); + 
Assert.assertEquals("C=24, 200K lag should not change PPT", 0, boost24_200k); + } + + @Test + public void testComputeExtraPPTIncreaseInvalidInputs() + { + Assert.assertEquals( + 0, + CostBasedAutoScaler.computeExtraPPTIncrease(LAG_THRESHOLD, 1_000_000, 0, 4, 48) + ); + Assert.assertEquals( + 0, + CostBasedAutoScaler.computeExtraPPTIncrease(LAG_THRESHOLD, 1_000_000, 48, 0, 48) + ); + Assert.assertEquals( + 0, + CostBasedAutoScaler.computeExtraPPTIncrease(LAG_THRESHOLD, 1_000_000, 48, 4, 0) + ); + Assert.assertEquals( + 0, + CostBasedAutoScaler.computeExtraPPTIncrease(LAG_THRESHOLD, 1_000_000, -1, 4, 48) + ); + } +} diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java index 2329328610d9..bb6eca691a2d 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerTest.java @@ -23,8 +23,6 @@ import org.apache.druid.indexing.seekablestream.SeekableStreamIndexTaskRunner; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisorIOConfig; -import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins.BurstScaleUpOnHighLagPlugin; -import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins.OptimalTaskCountBoundariesPlugin; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.segment.incremental.RowIngestionMeters; import org.joda.time.Duration; @@ -75,59 +73,91 @@ public void setUp() @Test public void testComputeValidTaskCounts() { - OptimalTaskCountBoundariesPlugin boundariesPlugin = new 
OptimalTaskCountBoundariesPlugin(); - BurstScaleUpOnHighLagPlugin highLagPlugin = new BurstScaleUpOnHighLagPlugin(50_000); + boolean useTaskCountBoundaries = true; + int highLagThreshold = 50_000; // For 100 partitions at 25 tasks (4 partitions/task), valid counts include 25 and 34 - int[] validTaskCounts = computeValidTaskCounts(100, 25, 0L, 1, 100, boundariesPlugin, highLagPlugin); - Assert.assertTrue("Should contain the current task count", contains(validTaskCounts, 25)); - Assert.assertTrue("Should contain the next scale-up option", contains(validTaskCounts, 34)); + int[] validTaskCounts = computeValidTaskCounts( + 100, + 25, + 0L, + 1, + 100, + useTaskCountBoundaries, + highLagThreshold + ); + Assert.assertTrue("Expected current task count to be included", contains(validTaskCounts, 25)); + Assert.assertTrue("Expected next scale-up option (34) to be included", contains(validTaskCounts, 34)); // Single partition - int[] singlePartition = computeValidTaskCounts(1, 1, 0L, 1, 100, boundariesPlugin, highLagPlugin); - Assert.assertTrue("Single partition should have at least one valid count", singlePartition.length > 0); - Assert.assertTrue("Single partition should contain 1", contains(singlePartition, 1)); + int[] singlePartition = computeValidTaskCounts( + 1, + 1, + 0L, + 1, + 100, + useTaskCountBoundaries, + highLagThreshold + ); + Assert.assertTrue("Single partition should yield at least one valid count", singlePartition.length > 0); + Assert.assertTrue("Single partition should include task count 1", contains(singlePartition, 1)); // Current exceeds partitions - should still yield valid, deduplicated options - int[] exceedsPartitions = computeValidTaskCounts(2, 5, 0L, 1, 100, boundariesPlugin, highLagPlugin); + int[] exceedsPartitions = computeValidTaskCounts( + 2, + 5, + 0L, + 1, + 100, + useTaskCountBoundaries, + highLagThreshold + ); Assert.assertEquals(2, exceedsPartitions.length); Assert.assertTrue(contains(exceedsPartitions, 1)); 
Assert.assertTrue(contains(exceedsPartitions, 2)); // Lag expansion: low lag should not include max, high lag should allow aggressive scaling - int[] lowLagCounts = computeValidTaskCounts(30, 3, 0L, 1, 30, boundariesPlugin, highLagPlugin); + int[] lowLagCounts = computeValidTaskCounts(30, 3, 0L, 1, 30, useTaskCountBoundaries, highLagThreshold); Assert.assertFalse("Low lag should not include max task count", contains(lowLagCounts, 30)); - Assert.assertTrue("Low lag should cap scale up around 4 tasks", contains(lowLagCounts, 4)); + Assert.assertTrue("Low lag should cap scale-up around 4 tasks", contains(lowLagCounts, 4)); // High lag uses logarithmic formula: K * ln(lagSeverity) where K = P/(6.4*sqrt(C)) // For P=30, C=3, lagPerPartition=500K, threshold=50K: lagSeverity=10, K=2.7, delta=6.2 // This allows controlled scaling to ~10-15 tasks (not all the way to max) long highAggregateLag = 30L * 500_000L; - int[] highLagCounts = computeValidTaskCounts(30, 3, highAggregateLag, 1, 30, boundariesPlugin, highLagPlugin); + int[] highLagCounts = computeValidTaskCounts( + 30, + 3, + highAggregateLag, + 1, + 30, + useTaskCountBoundaries, + highLagThreshold + ); Assert.assertTrue("High lag should allow scaling to 10 tasks", contains(highLagCounts, 10)); Assert.assertTrue("High lag should allow scaling to 15 tasks", contains(highLagCounts, 15)); - Assert.assertFalse("Should not jump straight to 30 from 3", contains(highLagCounts, 30)); + Assert.assertFalse("High lag should not jump straight to max (30) from 3", contains(highLagCounts, 30)); // Respects taskCountMax - int[] cappedCounts = computeValidTaskCounts(30, 4, highAggregateLag, 1, 3, boundariesPlugin, highLagPlugin); - Assert.assertTrue("Should include taskCountMax when doable", contains(cappedCounts, 3)); + int[] cappedCounts = computeValidTaskCounts(30, 4, highAggregateLag, 1, 3, useTaskCountBoundaries, highLagThreshold); + Assert.assertTrue("Should include taskCountMax when within bounds", contains(cappedCounts, 3)); 
Assert.assertFalse("Should not exceed taskCountMax", contains(cappedCounts, 4)); // Respects taskCountMin - filters out values below the minimum // With partitionCount=100, currentTaskCount=10, the computed range includes values like 8, 9, 10, 12, 13 - int[] minCappedCounts = computeValidTaskCounts(100, 10, 0L, 10, 100, boundariesPlugin, highLagPlugin); - Assert.assertFalse("Should not go below taskCountMin", contains(minCappedCounts, 8)); - Assert.assertFalse("Should not go below taskCountMin", contains(minCappedCounts, 9)); - Assert.assertTrue("Should include values at taskCountMin", contains(minCappedCounts, 10)); - Assert.assertTrue("Should include values above taskCountMin", contains(minCappedCounts, 12)); + int[] minCappedCounts = computeValidTaskCounts(100, 10, 0L, 10, 100, useTaskCountBoundaries, highLagThreshold); + Assert.assertFalse("Should not include values below taskCountMin (8)", contains(minCappedCounts, 8)); + Assert.assertFalse("Should not include values below taskCountMin (9)", contains(minCappedCounts, 9)); + Assert.assertTrue("Should include values at taskCountMin (10)", contains(minCappedCounts, 10)); + Assert.assertTrue("Should include values above taskCountMin (12)", contains(minCappedCounts, 12)); // Both bounds applied together - int[] bothBounds = computeValidTaskCounts(100, 10, 0L, 10, 12, boundariesPlugin, highLagPlugin); - Assert.assertFalse("Should not go below taskCountMin", contains(bothBounds, 8)); - Assert.assertFalse("Should not go below taskCountMin", contains(bothBounds, 9)); - Assert.assertFalse("Should not exceed taskCountMax", contains(bothBounds, 13)); - Assert.assertTrue("Should include values at taskCountMin", contains(bothBounds, 10)); - Assert.assertTrue("Should include values at taskCountMax", contains(bothBounds, 12)); + int[] bothBounds = computeValidTaskCounts(100, 10, 0L, 10, 12, useTaskCountBoundaries, highLagThreshold); + Assert.assertFalse("Should not include values below taskCountMin (8)", contains(bothBounds, 
8)); + Assert.assertFalse("Should not include values below taskCountMin (9)", contains(bothBounds, 9)); + Assert.assertFalse("Should not include values above taskCountMax (13)", contains(bothBounds, 13)); + Assert.assertTrue("Should include values at taskCountMin (10)", contains(bothBounds, 10)); + Assert.assertTrue("Should include values at taskCountMax (12)", contains(bothBounds, 12)); } @Test @@ -141,28 +171,38 @@ public void testComputeOptimalTaskCount() // High idle (underutilized) - should scale down int scaleDownResult = autoScaler.computeOptimalTaskCount(createMetrics(100.0, 25, 100, 0.8)); - Assert.assertTrue("Should scale down when idle > 0.6", scaleDownResult < 25); + Assert.assertTrue("Expected scale-down when idle ratio is high (>0.6)", scaleDownResult < 25); // Very high idle with high task count - should scale down int highIdleResult = autoScaler.computeOptimalTaskCount(createMetrics(10.0, 50, 100, 0.9)); - Assert.assertTrue("Scale down scenario should return optimal <= current", highIdleResult <= 50); + Assert.assertTrue("High idle should not suggest scale-up", highIdleResult <= 50); // With low idle and balanced weights, the algorithm should not scale up aggressively int lowIdleResult = autoScaler.computeOptimalTaskCount(createMetrics(1000.0, 25, 100, 0.1)); - Assert.assertTrue("With low idle and balanced weights, should not scale up aggressively", lowIdleResult <= 25); + Assert.assertTrue("With low idle and balanced weights, avoid aggressive scale-up", lowIdleResult <= 25); } @Test public void testExtractPollIdleRatio() { // Null and empty return 0 - Assert.assertEquals(0., CostBasedAutoScaler.extractPollIdleRatio(null), 0.0001); - Assert.assertEquals(0., CostBasedAutoScaler.extractPollIdleRatio(Collections.emptyMap()), 0.0001); + Assert.assertEquals("Null stats should yield 0 idle ratio", 0., CostBasedAutoScaler.extractPollIdleRatio(null), 0.0001); + Assert.assertEquals( + "Empty stats should yield 0 idle ratio", + 0., + 
CostBasedAutoScaler.extractPollIdleRatio(Collections.emptyMap()), + 0.0001 + ); // Missing metrics return 0 Map> missingMetrics = new HashMap<>(); missingMetrics.put("0", Collections.singletonMap("task-0", new HashMap<>())); - Assert.assertEquals(0., CostBasedAutoScaler.extractPollIdleRatio(missingMetrics), 0.0001); + Assert.assertEquals( + "Missing autoscaler metrics should yield 0 idle ratio", + 0., + CostBasedAutoScaler.extractPollIdleRatio(missingMetrics), + 0.0001 + ); // Valid stats return average Map> validStats = new HashMap<>(); @@ -170,26 +210,46 @@ public void testExtractPollIdleRatio() group.put("task-0", buildTaskStatsWithPollIdle(0.3)); group.put("task-1", buildTaskStatsWithPollIdle(0.5)); validStats.put("0", group); - Assert.assertEquals(0.4, CostBasedAutoScaler.extractPollIdleRatio(validStats), 0.0001); + Assert.assertEquals( + "Average poll idle ratio should be computed across tasks", + 0.4, + CostBasedAutoScaler.extractPollIdleRatio(validStats), + 0.0001 + ); // Invalid types: non-map task metric Map> nonMapTask = new HashMap<>(); nonMapTask.put("0", Collections.singletonMap("task-0", "not-a-map")); - Assert.assertEquals(0., CostBasedAutoScaler.extractPollIdleRatio(nonMapTask), 0.0001); + Assert.assertEquals( + "Non-map task stats should be ignored", + 0., + CostBasedAutoScaler.extractPollIdleRatio(nonMapTask), + 0.0001 + ); // Invalid types: empty autoscaler metrics Map> emptyAutoscaler = new HashMap<>(); Map taskStats1 = new HashMap<>(); taskStats1.put(SeekableStreamIndexTaskRunner.AUTOSCALER_METRICS_KEY, new HashMap<>()); emptyAutoscaler.put("0", Collections.singletonMap("task-0", taskStats1)); - Assert.assertEquals(0., CostBasedAutoScaler.extractPollIdleRatio(emptyAutoscaler), 0.0001); + Assert.assertEquals( + "Empty autoscaler metrics should yield 0 idle ratio", + 0., + CostBasedAutoScaler.extractPollIdleRatio(emptyAutoscaler), + 0.0001 + ); // Invalid types: non-map autoscaler metrics Map> nonMapAutoscaler = new HashMap<>(); Map taskStats2 = 
new HashMap<>(); taskStats2.put(SeekableStreamIndexTaskRunner.AUTOSCALER_METRICS_KEY, "not-a-map"); nonMapAutoscaler.put("0", Collections.singletonMap("task-0", taskStats2)); - Assert.assertEquals(0., CostBasedAutoScaler.extractPollIdleRatio(nonMapAutoscaler), 0.0001); + Assert.assertEquals( + "Non-map autoscaler metrics should be ignored", + 0., + CostBasedAutoScaler.extractPollIdleRatio(nonMapAutoscaler), + 0.0001 + ); // Invalid types: non-number poll idle ratio Map> nonNumberRatio = new HashMap<>(); @@ -198,20 +258,35 @@ public void testExtractPollIdleRatio() autoscalerMetrics.put(SeekableStreamIndexTaskRunner.POLL_IDLE_RATIO_KEY, "not-a-number"); taskStats3.put(SeekableStreamIndexTaskRunner.AUTOSCALER_METRICS_KEY, autoscalerMetrics); nonNumberRatio.put("0", Collections.singletonMap("task-0", taskStats3)); - Assert.assertEquals(0., CostBasedAutoScaler.extractPollIdleRatio(nonNumberRatio), 0.0001); + Assert.assertEquals( + "Non-numeric poll idle ratio should be ignored", + 0., + CostBasedAutoScaler.extractPollIdleRatio(nonNumberRatio), + 0.0001 + ); } @Test public void testExtractMovingAverage() { // Null and empty return -1 - Assert.assertEquals(-1., CostBasedAutoScaler.extractMovingAverage(null), 0.0001); - Assert.assertEquals(-1., CostBasedAutoScaler.extractMovingAverage(Collections.emptyMap()), 0.0001); + Assert.assertEquals("Null stats should yield -1 moving average", -1., CostBasedAutoScaler.extractMovingAverage(null), 0.0001); + Assert.assertEquals( + "Empty stats should yield -1 moving average", + -1., + CostBasedAutoScaler.extractMovingAverage(Collections.emptyMap()), + 0.0001 + ); // Missing metrics return -1 Map> missingMetrics = new HashMap<>(); missingMetrics.put("0", Collections.singletonMap("task-0", new HashMap<>())); - Assert.assertEquals(-1., CostBasedAutoScaler.extractMovingAverage(missingMetrics), 0.0001); + Assert.assertEquals( + "Missing moving averages should yield -1", + -1., + CostBasedAutoScaler.extractMovingAverage(missingMetrics), + 
0.0001 + ); // Valid stats return average (using 5-minute) Map> validStats = new HashMap<>(); @@ -219,7 +294,12 @@ public void testExtractMovingAverage() group.put("task-0", buildTaskStatsWithMovingAverage(1000.0)); group.put("task-1", buildTaskStatsWithMovingAverage(2000.0)); validStats.put("0", group); - Assert.assertEquals(1500.0, CostBasedAutoScaler.extractMovingAverage(validStats), 0.0001); + Assert.assertEquals( + "Average 5-minute processing rate should be computed across tasks", + 1500.0, + CostBasedAutoScaler.extractMovingAverage(validStats), + 0.0001 + ); // Interval fallback: 15-minute preferred, then 5-minute, then 1-minute Map> fifteenMin = new HashMap<>(); @@ -230,7 +310,12 @@ public void testExtractMovingAverage() buildTaskStatsWithMovingAverageForInterval(FIFTEEN_MINUTE_NAME, 1500.0) ) ); - Assert.assertEquals(1500.0, CostBasedAutoScaler.extractMovingAverage(fifteenMin), 0.0001); + Assert.assertEquals( + "15-minute interval should be preferred when available", + 1500.0, + CostBasedAutoScaler.extractMovingAverage(fifteenMin), + 0.0001 + ); // 1-minute as a final fallback Map> oneMin = new HashMap<>(); @@ -238,7 +323,12 @@ public void testExtractMovingAverage() "0", Collections.singletonMap("task-0", buildTaskStatsWithMovingAverageForInterval(ONE_MINUTE_NAME, 500.0)) ); - Assert.assertEquals(500.0, CostBasedAutoScaler.extractMovingAverage(oneMin), 0.0001); + Assert.assertEquals( + "1-minute interval should be used as a final fallback", + 500.0, + CostBasedAutoScaler.extractMovingAverage(oneMin), + 0.0001 + ); // 15-minute preferred over 5-minute when both available Map> allIntervals = new HashMap<>(); @@ -246,7 +336,12 @@ public void testExtractMovingAverage() "0", Collections.singletonMap("task-0", buildTaskStatsWithMultipleMovingAverages(1500.0, 1000.0, 500.0)) ); - Assert.assertEquals(1500.0, CostBasedAutoScaler.extractMovingAverage(allIntervals), 0.0001); + Assert.assertEquals( + "15-minute interval should win when multiple intervals are present", 
+ 1500.0, + CostBasedAutoScaler.extractMovingAverage(allIntervals), + 0.0001 + ); // Falls back to 5-minute when 15-minute is null Map> nullFifteen = new HashMap<>(); @@ -257,12 +352,22 @@ public void testExtractMovingAverage() buildTaskStatsWithNullInterval(FIFTEEN_MINUTE_NAME, FIVE_MINUTE_NAME, 750.0) ) ); - Assert.assertEquals(750.0, CostBasedAutoScaler.extractMovingAverage(nullFifteen), 0.0001); + Assert.assertEquals( + "Should fall back to 5-minute when 15-minute is null", + 750.0, + CostBasedAutoScaler.extractMovingAverage(nullFifteen), + 0.0001 + ); // Falls back to 1-minute when both 15 and 5 are null Map> bothNull = new HashMap<>(); bothNull.put("0", Collections.singletonMap("task-0", buildTaskStatsWithTwoNullIntervals(250.0))); - Assert.assertEquals(250.0, CostBasedAutoScaler.extractMovingAverage(bothNull), 0.0001); + Assert.assertEquals( + "Should fall back to 1-minute when 15 and 5 are null", + 250.0, + CostBasedAutoScaler.extractMovingAverage(bothNull), + 0.0001 + ); } @Test @@ -271,19 +376,34 @@ public void testExtractMovingAverageInvalidTypes() // Non-map task metric Map> nonMapTask = new HashMap<>(); nonMapTask.put("0", Collections.singletonMap("task-0", "not-a-map")); - Assert.assertEquals(-1., CostBasedAutoScaler.extractMovingAverage(nonMapTask), 0.0001); + Assert.assertEquals( + "Non-map task stats should be ignored", + -1., + CostBasedAutoScaler.extractMovingAverage(nonMapTask), + 0.0001 + ); Map> missingBuild = new HashMap<>(); Map taskStats1 = new HashMap<>(); taskStats1.put("movingAverages", new HashMap<>()); missingBuild.put("0", Collections.singletonMap("task-0", taskStats1)); - Assert.assertEquals(-1., CostBasedAutoScaler.extractMovingAverage(missingBuild), 0.0001); + Assert.assertEquals( + "Missing buildSegments moving average should yield -1", + -1., + CostBasedAutoScaler.extractMovingAverage(missingBuild), + 0.0001 + ); Map> nonMapMA = new HashMap<>(); Map taskStats2 = new HashMap<>(); taskStats2.put("movingAverages", "not-a-map"); 
nonMapMA.put("0", Collections.singletonMap("task-0", taskStats2)); - Assert.assertEquals(-1., CostBasedAutoScaler.extractMovingAverage(nonMapMA), 0.0001); + Assert.assertEquals( + "Non-map movingAverages should be ignored", + -1., + CostBasedAutoScaler.extractMovingAverage(nonMapMA), + 0.0001 + ); } @Test diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java index 7a50ff2cb0f7..c5b2b867e669 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/WeightedCostFunctionTest.java @@ -19,7 +19,6 @@ package org.apache.druid.indexing.seekablestream.supervisor.autoscaler; -import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins.BurstScaleUpOnHighLagPlugin; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -48,13 +47,13 @@ public void testComputeCostInvalidInputs() { CostMetrics validMetrics = createMetrics(100000.0, 10, 100, 0.3); - Assert.assertEquals(Double.POSITIVE_INFINITY, costFunction.computeCost(null, 10, config, null).totalCost(), 0.0); - Assert.assertEquals(Double.POSITIVE_INFINITY, costFunction.computeCost(validMetrics, 10, null, null).totalCost(), 0.0); - Assert.assertEquals(Double.POSITIVE_INFINITY, costFunction.computeCost(validMetrics, 0, config, null).totalCost(), 0.0); - Assert.assertEquals(Double.POSITIVE_INFINITY, costFunction.computeCost(validMetrics, -5, config, null).totalCost(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, costFunction.computeCost(null, 10, config).totalCost(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, costFunction.computeCost(validMetrics, 10, null).totalCost(), 0.0); + 
Assert.assertEquals(Double.POSITIVE_INFINITY, costFunction.computeCost(validMetrics, 0, config).totalCost(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, costFunction.computeCost(validMetrics, -5, config).totalCost(), 0.0); Assert.assertEquals( Double.POSITIVE_INFINITY, - costFunction.computeCost(createMetrics(0.0, 10, 0, 0.3), 10, config, null).totalCost(), + costFunction.computeCost(createMetrics(0.0, 10, 0, 0.3), 10, config).totalCost(), 0.0 ); } @@ -73,8 +72,8 @@ public void testScaleDownHasHigherLagCostThanCurrent() CostMetrics metrics = createMetrics(200000.0, 10, 200, 0.3); - double costCurrent = costFunction.computeCost(metrics, 10, lagOnlyConfig, null).totalCost(); - double costScaleDown = costFunction.computeCost(metrics, 5, lagOnlyConfig, null).totalCost(); + double costCurrent = costFunction.computeCost(metrics, 10, lagOnlyConfig).totalCost(); + double costScaleDown = costFunction.computeCost(metrics, 5, lagOnlyConfig).totalCost(); // Scale down uses absolute model: lag / (5 * rate) = higher recovery time // Current uses absolute model: lag / (10 * rate) = lower recovery time @@ -102,15 +101,15 @@ public void testLagCostWithMarginalModel() CostMetrics metrics = createMetrics(100000.0, 10, 100, 0.3); // Current (10 tasks): uses absolute model = 10M / (10 * 1000) = 1000s - double costCurrent = costFunction.computeCost(metrics, 10, lagOnlyConfig, null).totalCost(); - Assert.assertEquals("Cost at current tasks", 1000., costCurrent, 0.1); + double costCurrent = costFunction.computeCost(metrics, 10, lagOnlyConfig).totalCost(); + Assert.assertEquals("Cost of current tasks", 1000., costCurrent, 0.1); // Scale up by 5 (to 15): marginal model = 10M / (15 * 1000) = 666 - double costUp5 = costFunction.computeCost(metrics, 15, lagOnlyConfig, null).totalCost(); + double costUp5 = costFunction.computeCost(metrics, 15, lagOnlyConfig).totalCost(); Assert.assertEquals("Cost when scaling up by 5", 666.7, costUp5, 0.1); // Scale up by 10 (to 20): marginal model = 
10M / (20 * 1000) = 500s - double costUp10 = costFunction.computeCost(metrics, 20, lagOnlyConfig, null).totalCost(); + double costUp10 = costFunction.computeCost(metrics, 20, lagOnlyConfig).totalCost(); Assert.assertEquals("Cost when scaling up by 10", 500.0, costUp10, 0.01); // Adding more tasks reduces lag recovery time @@ -122,8 +121,8 @@ public void testBalancedWeightsFavorStabilityOverScaleUpOnSmallLag() { // Validate idle ratio estimation and ensure balanced weights still favor stability. CostMetrics metrics = createMetrics(100.0, 10, 100, 0.3); - double costCurrent = costFunction.computeCost(metrics, 10, config, null).totalCost(); - double costScaleUp = costFunction.computeCost(metrics, 20, config, null).totalCost(); + double costCurrent = costFunction.computeCost(metrics, 10, config).totalCost(); + double costScaleUp = costFunction.computeCost(metrics, 20, config).totalCost(); Assert.assertTrue( "With balanced weights, staying at current count is cheaper than scale-up", @@ -154,8 +153,8 @@ public void testWeightsAffectCost() CostMetrics metrics = createMetrics(100000.0, 10, 100, 0.1); - double costLag = costFunction.computeCost(metrics, 10, lagOnly, null).totalCost(); - double costIdle = costFunction.computeCost(metrics, 10, idleOnly, null).totalCost(); + double costLag = costFunction.computeCost(metrics, 10, lagOnly).totalCost(); + double costIdle = costFunction.computeCost(metrics, 10, idleOnly).totalCost(); Assert.assertNotEquals("Different weights should produce different costs", costLag, costIdle, 0.0001); Assert.assertTrue("Lag-only cost should be positive", costLag > 0.0); @@ -170,9 +169,9 @@ public void testNoProcessingRateFavorsCurrentTaskCount() int currentTaskCount = 10; CostMetrics metricsNoRate = createMetricsWithRate(50000.0, currentTaskCount, 100, 0.3, 0.0); - double costAtCurrent = costFunction.computeCost(metricsNoRate, currentTaskCount, config, null).totalCost(); - double costScaleUp = costFunction.computeCost(metricsNoRate, 
currentTaskCount + 5, config, null).totalCost(); - double costScaleDown = costFunction.computeCost(metricsNoRate, currentTaskCount - 5, config, null).totalCost(); + double costAtCurrent = costFunction.computeCost(metricsNoRate, currentTaskCount, config).totalCost(); + double costScaleUp = costFunction.computeCost(metricsNoRate, currentTaskCount + 5, config).totalCost(); + double costScaleDown = costFunction.computeCost(metricsNoRate, currentTaskCount - 5, config).totalCost(); Assert.assertTrue( "Cost at current should be less than cost for scale up", @@ -201,8 +200,8 @@ public void testNoProcessingRateDeviationPenaltyIsSymmetric() .defaultProcessingRate(1000.0) .build(); - double costUp5 = costFunction.computeCost(metricsNoRate, currentTaskCount + 5, lagOnlyConfig, null).totalCost(); - double costDown5 = costFunction.computeCost(metricsNoRate, currentTaskCount - 5, lagOnlyConfig, null).totalCost(); + double costUp5 = costFunction.computeCost(metricsNoRate, currentTaskCount + 5, lagOnlyConfig).totalCost(); + double costDown5 = costFunction.computeCost(metricsNoRate, currentTaskCount - 5, lagOnlyConfig).totalCost(); Assert.assertEquals( "Lag cost for +5 and -5 deviation should be equal", @@ -229,10 +228,10 @@ public void testIdleCostMonotonicWithTaskCount() // Current: 10 tasks with 40% idle (60% busy) CostMetrics metrics = createMetrics(0.0, 10, 100, 0.4); - double costAt5 = costFunction.computeCost(metrics, 5, idleOnlyConfig, null).totalCost(); - double costAt10 = costFunction.computeCost(metrics, 10, idleOnlyConfig, null).totalCost(); - double costAt15 = costFunction.computeCost(metrics, 15, idleOnlyConfig, null).totalCost(); - double costAt20 = costFunction.computeCost(metrics, 20, idleOnlyConfig, null).totalCost(); + double costAt5 = costFunction.computeCost(metrics, 5, idleOnlyConfig).totalCost(); + double costAt10 = costFunction.computeCost(metrics, 10, idleOnlyConfig).totalCost(); + double costAt15 = costFunction.computeCost(metrics, 15, 
idleOnlyConfig).totalCost(); + double costAt20 = costFunction.computeCost(metrics, 20, idleOnlyConfig).totalCost(); // Monotonically increasing idle cost as tasks increase Assert.assertTrue("cost(5) < cost(10)", costAt5 < costAt10); @@ -256,7 +255,7 @@ public void testIdleRatioClampingAtBoundaries() // busyFraction = 0.6, taskRatio = 0.2 // predictedIdle = 1 - 0.6/0.2 = 1 - 3 = -2 → clamped to 0 CostMetrics metrics = createMetrics(0.0, 10, 100, 0.4); - double costAt2 = costFunction.computeCost(metrics, 2, idleOnlyConfig, null).totalCost(); + double costAt2 = costFunction.computeCost(metrics, 2, idleOnlyConfig).totalCost(); // idlenessCost = taskCount * taskDuration * 0.0 (clamped) = 0 Assert.assertEquals("Idle cost should be 0 when predicted idle is clamped to 0", 0.0, costAt2, 0.0001); @@ -266,7 +265,7 @@ public void testIdleRatioClampingAtBoundaries() // busyFraction = 0.9, taskRatio = 10 // predictedIdle = 1 - 0.9/10 = 1 - 0.09 = 0.91 (within bounds) CostMetrics lowIdle = createMetrics(0.0, 10, 100, 0.1); - double costAt100 = costFunction.computeCost(lowIdle, 100, idleOnlyConfig, null).totalCost(); + double costAt100 = costFunction.computeCost(lowIdle, 100, idleOnlyConfig).totalCost(); // idlenessCost = 100 * 3600 * 0.91 = 327600 Assert.assertTrue("Cost should be finite and positive", Double.isFinite(costAt100) && costAt100 > 0); } @@ -286,8 +285,8 @@ public void testIdleRatioWithMissingData() // Negative idle ratio indicates missing data → should default to 0.5 CostMetrics missingIdleData = createMetrics(0.0, 10, 100, -1.0); - double cost10 = costFunction.computeCost(missingIdleData, 10, idleOnlyConfig, null).totalCost(); - double cost20 = costFunction.computeCost(missingIdleData, 20, idleOnlyConfig, null).totalCost(); + double cost10 = costFunction.computeCost(missingIdleData, 10, idleOnlyConfig).totalCost(); + double cost20 = costFunction.computeCost(missingIdleData, 20, idleOnlyConfig).totalCost(); // With missing data, predicted idle = 0.5 for all task 
counts // idlenessCost at 10 = 10 * 3600 * 0.5 = 18000 @@ -299,11 +298,12 @@ public void testIdleRatioWithMissingData() @Test public void testLagAmplificationReducesIdleUnderHighLag() { - CostBasedAutoScalerConfig idleOnlyConfig = CostBasedAutoScalerConfig.builder() + CostBasedAutoScalerConfig configWithThreshold = CostBasedAutoScalerConfig.builder() .taskCountMax(100) .taskCountMin(1) .enableTaskAutoScaler(true) .defaultProcessingRate(1000.0) + .highLagThreshold(10_000) .build(); int currentTaskCount = 3; @@ -311,14 +311,12 @@ public void testLagAmplificationReducesIdleUnderHighLag() int partitionCount = 30; double pollIdleRatio = 0.1; - // Plugin with threshold between lowLag (5000) and highLag (500000) - BurstScaleUpOnHighLagPlugin highLagPlugin = new BurstScaleUpOnHighLagPlugin(10_000); - + // lowLag (5000) is below threshold, highLag (500000) is well above CostMetrics lowLag = createMetrics(5_000.0, currentTaskCount, partitionCount, pollIdleRatio); CostMetrics highLag = createMetrics(500_000.0, currentTaskCount, partitionCount, pollIdleRatio); - double lowLagCost = costFunction.computeCost(lowLag, proposedTaskCount, idleOnlyConfig, highLagPlugin).totalCost(); - double highLagCost = costFunction.computeCost(highLag, proposedTaskCount, idleOnlyConfig, highLagPlugin).totalCost(); + double lowLagCost = costFunction.computeCost(lowLag, proposedTaskCount, configWithThreshold).totalCost(); + double highLagCost = costFunction.computeCost(highLag, proposedTaskCount, configWithThreshold).totalCost(); Assert.assertTrue( "Higher lag should reduce predicted idle more aggressively", lowLagCost > highLagCost @@ -334,9 +332,10 @@ public void testCustomLagThresholdsAffectCostCalculation() int partitionCount = 30; double pollIdleRatio = 0.1; - // Use lag that exceeds sensitive threshold (10000) but not default threshold (50000) + // Use lag that exceeds sensitive threshold (10000) but not default (-1, disabled) CostMetrics metrics = createMetrics(15_000.0, currentTaskCount, 
partitionCount, pollIdleRatio); + // Default config: highLagThreshold=-1 (disabled), no lag amplification CostBasedAutoScalerConfig defaultConfig = CostBasedAutoScalerConfig.builder() .taskCountMax(100) .taskCountMin(1) @@ -344,6 +343,7 @@ public void testCustomLagThresholdsAffectCostCalculation() .defaultProcessingRate(1000.0) .build(); + // Sensitive config: threshold 10000, lag 15000 > 10000, amplification happens CostBasedAutoScalerConfig sensitiveConfig = CostBasedAutoScalerConfig.builder() .taskCountMax(100) .taskCountMin(1) @@ -352,13 +352,8 @@ public void testCustomLagThresholdsAffectCostCalculation() .highLagThreshold(10000) .build(); - // Default plugin: threshold 50000, lag 15000 < 50000, no amplification - BurstScaleUpOnHighLagPlugin defaultPlugin = new BurstScaleUpOnHighLagPlugin(50000); - // Sensitive plugin: threshold 10000, lag 15000 > 10000, amplification happens - BurstScaleUpOnHighLagPlugin sensitivePlugin = new BurstScaleUpOnHighLagPlugin(10000); - - double defaultCost = costFunction.computeCost(metrics, proposedTaskCount, defaultConfig, defaultPlugin).totalCost(); - double sensitiveCost = costFunction.computeCost(metrics, proposedTaskCount, sensitiveConfig, sensitivePlugin).totalCost(); + double defaultCost = costFunction.computeCost(metrics, proposedTaskCount, defaultConfig).totalCost(); + double sensitiveCost = costFunction.computeCost(metrics, proposedTaskCount, sensitiveConfig).totalCost(); // With lower thresholds, the same lag triggers more aggressive scaling behavior // (higher lagBusyFactor), which results in lower predicted idle and thus lower idle cost @@ -369,19 +364,17 @@ public void testCustomLagThresholdsAffectCostCalculation() } @Test - public void testRampDenominatorCalculation() + public void testLnSeverityScalesWithLag() { - // Test that ramp denominator is calculated correctly from config values - // by verifying behavior at boundary conditions + // Test that ln_severity lagBusyFactor increases with lag severity, + // 
producing lower idle cost at higher lag. + // lagSeverity = lagPerPartition / threshold + // lagBusyFactor = min(1.0, ln(lagSeverity) / ln(5)) int currentTaskCount = 3; int proposedTaskCount = 8; int partitionCount = 30; double pollIdleRatio = 0.1; - // Custom config with specific thresholds for predictable ramp calculation - // extra=10000, aggressive=20000 - // lagAmplificationMaxLagPerPartition = 20000 * 5 = 100000 - // rampDenominator = 100000 - 10000 = 90000 CostBasedAutoScalerConfig customConfig = CostBasedAutoScalerConfig.builder() .taskCountMax(100) .taskCountMin(1) @@ -390,24 +383,21 @@ public void testRampDenominatorCalculation() .highLagThreshold(10000) .build(); - // Plugin with threshold 10000 - BurstScaleUpOnHighLagPlugin highLagPlugin = new BurstScaleUpOnHighLagPlugin(10000); - - // Lag exactly at extraThreshold (lagPerPartition = 10000) - // ramp = (10000 - 10000) / 90000 = 0 - CostMetrics atExtraThreshold = createMetrics(10_000.0, currentTaskCount, partitionCount, pollIdleRatio); + // Lag exactly at threshold (lagPerPartition = 10000, severity=1.0) + // lagBusyFactor = ln(1) / ln(5) = 0 + CostMetrics atThreshold = createMetrics(10_000.0, currentTaskCount, partitionCount, pollIdleRatio); - // Lag at maximum (lagPerPartition = 100000) - // ramp = (100000 - 10000) / 90000 = 1.0 - CostMetrics atMaxLag = createMetrics(100_000.0, currentTaskCount, partitionCount, pollIdleRatio); + // Lag at 5x threshold (lagPerPartition = 50000, severity=5.0) + // lagBusyFactor = ln(5) / ln(5) = 1.0 + CostMetrics atMaxSeverity = createMetrics(50_000.0, currentTaskCount, partitionCount, pollIdleRatio); - double costAtExtra = costFunction.computeCost(atExtraThreshold, proposedTaskCount, customConfig, highLagPlugin).totalCost(); - double costAtMax = costFunction.computeCost(atMaxLag, proposedTaskCount, customConfig, highLagPlugin).totalCost(); + double costAtThreshold = costFunction.computeCost(atThreshold, proposedTaskCount, customConfig).totalCost(); + double costAtMax = 
costFunction.computeCost(atMaxSeverity, proposedTaskCount, customConfig).totalCost(); - // At max lag, ramp=1.0 leads to maximum amplification, reducing idle cost more + // At max severity, lagBusyFactor=1.0, idle is fully suppressed → lower cost Assert.assertTrue( - "Cost at max lag should be lower due to maximum lag amplification", - costAtMax < costAtExtra + "Cost at max severity should be lower due to full idle suppression", + costAtMax < costAtThreshold ); } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPluginTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPluginTest.java deleted file mode 100644 index ffe1b21ffd55..000000000000 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/plugins/BurstScaleUpOnHighLagPluginTest.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.druid.indexing.seekablestream.supervisor.autoscaler.plugins; - -import org.junit.Assert; -import org.junit.Test; - -/** - * Tests for {@link BurstScaleUpOnHighLagPlugin}. - *

- * The plugin uses a logarithmic formula for burst scaling: - * {@code deltaTasks = K * ln(lagSeverity)} - * where {@code K = (partitionCount / 6.4) / sqrt(currentTaskCount)} - */ -public class BurstScaleUpOnHighLagPluginTest -{ - private static final int LAG_THRESHOLD = 50_000; - private static final int PARTITION_COUNT = 48; - private static final int TASK_COUNT_MAX = 48; - - /** - * Tests scaling behavior across different lag levels and task counts. - *

- * Expected behavior for 48 partitions with threshold=50K: - *

-   * | Current | Lag/Part | Boost | Notes                                      |
-   * |---------|----------|-------|---------------------------------------------|
-   * | any     | <50K     | 0     | Below threshold                             |
-   * | any     | =50K     | 0     | ln(1) = 0                                   |
-   * | 1       | 100K     | 40    | Significant boost for emergency recovery    |
-   * | 1       | 200K     | 43    | Large boost                                 |
-   * | 4       | 200K     | 6     | Moderate boost (K decreases with sqrt(C))   |
-   * | 12      | 200K     | 0     | Delta too small for PPT change              |
-   * | 24      | 200K     | 0     | Delta too small for PPT change              |
-   * 
- * At high task counts (C=12, C=24), the delta tasks from the formula is small - * (due to K decreasing with sqrt(C)), resulting in no PPT reduction. - */ - @Test - public void testComputeScaleUpBoost() - { - BurstScaleUpOnHighLagPlugin plugin = new BurstScaleUpOnHighLagPlugin(LAG_THRESHOLD); - - // Below threshold: no boost - Assert.assertEquals( - "Below threshold should return 0", - 0, - plugin.computeScaleUpBoost(PARTITION_COUNT * 40_000L, PARTITION_COUNT, 4, TASK_COUNT_MAX) - ); - - // At threshold (lagSeverity=1, ln(1)=0): no boost - Assert.assertEquals( - "At threshold should return 0", - 0, - plugin.computeScaleUpBoost(PARTITION_COUNT * 50_000L, PARTITION_COUNT, 4, TASK_COUNT_MAX) - ); - - // C=1, 100K lag (2x threshold): significant boost for emergency recovery - int boost1_100k = plugin.computeScaleUpBoost(PARTITION_COUNT * 100_000L, PARTITION_COUNT, 1, TASK_COUNT_MAX); - Assert.assertEquals("C=1, 100K lag boost", 40, boost1_100k); - - // C=1, 200K lag (4x threshold): large boost - int boost1_200k = plugin.computeScaleUpBoost(PARTITION_COUNT * 200_000L, PARTITION_COUNT, 1, TASK_COUNT_MAX); - Assert.assertEquals("C=1, 200K lag boost", 43, boost1_200k); - - // C=4, 200K lag: moderate boost (K decreases with sqrt(C)) - int boost4_200k = plugin.computeScaleUpBoost(PARTITION_COUNT * 200_000L, PARTITION_COUNT, 4, TASK_COUNT_MAX); - Assert.assertEquals("C=4, 200K lag boost", 6, boost4_200k); - - // C=12, 200K lag: delta too small to change PPT - int boost12_200k = plugin.computeScaleUpBoost(PARTITION_COUNT * 200_000L, PARTITION_COUNT, 12, TASK_COUNT_MAX); - Assert.assertEquals("C=12, 200K lag boost", 0, boost12_200k); - - // C=24, 200K lag: delta too small to change PPT - int boost24_200k = plugin.computeScaleUpBoost(PARTITION_COUNT * 200_000L, PARTITION_COUNT, 24, TASK_COUNT_MAX); - Assert.assertEquals("C=24, 200K lag boost", 0, boost24_200k); - } - - @Test - public void testComputeScaleUpBoostInvalidInputs() - { - BurstScaleUpOnHighLagPlugin plugin = new 
BurstScaleUpOnHighLagPlugin(LAG_THRESHOLD); - - Assert.assertEquals(0, plugin.computeScaleUpBoost(1_000_000, 0, 4, 48)); - Assert.assertEquals(0, plugin.computeScaleUpBoost(1_000_000, 48, 0, 48)); - Assert.assertEquals(0, plugin.computeScaleUpBoost(1_000_000, 48, 4, 0)); - Assert.assertEquals(0, plugin.computeScaleUpBoost(1_000_000, -1, 4, 48)); - } - - @Test - public void testLagThreshold() - { - int customThreshold = 100_000; - BurstScaleUpOnHighLagPlugin plugin = new BurstScaleUpOnHighLagPlugin(customThreshold); - Assert.assertEquals(customThreshold, plugin.lagThreshold()); - } -} From 02721804bd05b1c76ed0e4e50fd478c57074bd94 Mon Sep 17 00:00:00 2001 From: Sasha Syrotenko Date: Thu, 5 Feb 2026 11:00:51 +0200 Subject: [PATCH 12/14] Update docs/ingestion/supervisor.md Co-authored-by: Kashif Faraz --- docs/ingestion/supervisor.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ingestion/supervisor.md b/docs/ingestion/supervisor.md index e71f102ffdc7..b704f3f873ef 100644 --- a/docs/ingestion/supervisor.md +++ b/docs/ingestion/supervisor.md @@ -202,8 +202,8 @@ Note: Kinesis is not supported yet, support is in progress. The following table outlines the configuration properties related to the `costBased` autoscaler strategy: -| Property|Description|Required| Default | -|---------|---------------------------------------------------|---|--| +| Property|Description|Required|Default| +|---------|-----------|--------|-------| |`scaleActionPeriodMillis`|The frequency in milliseconds to check if a scale action is triggered. | No | 60000 | |`lagWeight`|The weight of extracted lag value in cost function.| No| 0.25 | |`idleWeight`|The weight of extracted poll idle value in cost function. 
| No | 0.75 | From aeb7bec62a2e0da9ebefee34739122db97e9bb0e Mon Sep 17 00:00:00 2001 From: Sasha Syrotenko Date: Thu, 5 Feb 2026 11:04:52 +0200 Subject: [PATCH 13/14] Update docs --- docs/ingestion/supervisor.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/ingestion/supervisor.md b/docs/ingestion/supervisor.md index b704f3f873ef..41832a39a7d3 100644 --- a/docs/ingestion/supervisor.md +++ b/docs/ingestion/supervisor.md @@ -204,14 +204,14 @@ The following table outlines the configuration properties related to the `costBa | Property|Description|Required|Default| |---------|-----------|--------|-------| -|`scaleActionPeriodMillis`|The frequency in milliseconds to check if a scale action is triggered. | No | 60000 | +|`scaleActionPeriodMillis`|The frequency in milliseconds to check if a scale action is triggered. | No | 600000 | |`lagWeight`|The weight of extracted lag value in cost function.| No| 0.25 | |`idleWeight`|The weight of extracted poll idle value in cost function. | No | 0.75 | |`defaultProcessingRate`|A planned processing rate per task, required for first cost estimations. | No | 1000 | |`useTaskCountBoundaries`|Enables the bounded partitions-per-task window when selecting task counts.|No| `false` | -|`highLagThreshold`|Per-partition lag threshold that triggers burst scale-up when set to `0` or higher. Set to a negative value to disable burst scale-up.|No|-1| -|`minScaleDownDelay`| Minimum duration between successful scale actions, specified as an ISO-8601 duration string. | No | `PT30M` | -|`scaleDownDuringTaskRolloverOnly`| Indicates whether task scaling down is limited to periods during task rollovers only. | No | False | +|`highLagThreshold`|Per-partition lag threshold that triggers burst scale-up when set to a value greater than `0`. 
Set to a negative value to disable burst scale-up.|No|-1| +|`minScaleDownDelay`|Minimum duration between successful scale actions, specified as an ISO-8601 duration string.|No|`PT30M`| +|`scaleDownDuringTaskRolloverOnly`|Indicates whether task scaling down is limited to periods during task rollovers only.|No|`false`| The following example shows a supervisor spec with `lagBased` autoscaler: From 83a0f239030cb9b63baa97f9162af250fbf4d78c Mon Sep 17 00:00:00 2001 From: Sasha Syrotenko Date: Thu, 5 Feb 2026 16:53:47 +0200 Subject: [PATCH 14/14] Final review cleanup --- .../autoscaler/CostBasedAutoScaler.java | 22 +++++++------ .../autoscaler/CostBasedAutoScalerConfig.java | 31 +++++++++---------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java index 49d85e601636..8bf2e8ab7872 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScaler.java @@ -219,20 +219,24 @@ int computeOptimalTaskCount(CostMetrics metrics) int optimalTaskCount = -1; CostResult optimalCost = new CostResult(); + log.info( + "Current metrics: avgPartitionLag[%.1f], pollIdleRatio[%.1f], lagWeight[%.1f], idleWeight[%.1f]", + metrics.getAggregateLag(), + metrics.getPollIdleRatio(), + config.getLagWeight(), + config.getIdleWeight() + ); + for (int taskCount : validTaskCounts) { CostResult costResult = costFunction.computeCost(metrics, taskCount, config); double cost = costResult.totalCost(); + log.info( - "Proposed task count[%d] has total cost[%.4f] = lagCost[%.4f] + idleCost[%.4f]." 
- + " Stats: avgPartitionLag[%.1f], pollIdleRatio[%.1f], lagWeight[%.1f], idleWeight[%.1f]", + "Proposed task count[%d] has total cost[%.4f] = lagCost[%.4f] + idleCost[%.4f].", taskCount, cost, costResult.lagCost(), - costResult.idleCost(), - metrics.getAggregateLag(), - metrics.getPollIdleRatio(), - config.getLagWeight(), - config.getIdleWeight() + costResult.idleCost() ); if (cost < optimalCost.totalCost()) { optimalTaskCount = taskCount; @@ -287,8 +291,8 @@ static int[] computeValidTaskCounts( final int currentPartitionsPerTask = partitionCount / currentTaskCount; // Minimum partitions per task correspond to the maximum number of tasks (scale up) and vice versa. - int minPartitionsPerTask = partitionCount / taskCountMax; - int maxPartitionsPerTask = partitionCount / taskCountMin; + int minPartitionsPerTask = Math.max(1, partitionCount / taskCountMax); + int maxPartitionsPerTask = Math.max(1, partitionCount / taskCountMin); if (isTaskCountBoundariesEnabled) { maxPartitionsPerTask = Math.min( diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java index df397d5d59e1..cb791abefb30 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/autoscaler/CostBasedAutoScalerConfig.java @@ -62,25 +62,9 @@ public class CostBasedAutoScalerConfig implements AutoScalerConfig private final double lagWeight; private final double idleWeight; private final double defaultProcessingRate; - /** - * Enables or disables {@code OptimalTaskCountBoundariesPlugin} which allows - * considering only task counts within a certain PPT-based window around the current PPT. 
- */ private final boolean useTaskCountBoundaries; - /** - * Per-partition lag threshold allowing to activate a burst scaleup to eliminate high lag. - */ private final int highLagThreshold; - /** - * Represents the minimum duration between successful scale actions. - * A higher value implies a more conservative scaling behavior, ensuring that tasks - * are not scaled too frequently during workload fluctuations. - */ private final Duration minScaleDownDelay; - /** - * Indicates whether task scaling down is limited to periods during task rollovers only. - * If set to {@code false}, allows scaling down during normal task run time. - */ private final boolean scaleDownDuringTaskRolloverOnly; @JsonCreator @@ -225,24 +209,39 @@ public double getDefaultProcessingRate() return defaultProcessingRate; } + /** + * Enables or disables the use of task count boundaries derived from the current partitions-per-task (PPT) ratio. + */ @JsonProperty("useTaskCountBoundaries") public boolean shouldUseTaskCountBoundaries() { return useTaskCountBoundaries; } + /** + * Per-partition lag threshold that activates a burst scale-up to eliminate high lag. + */ @JsonProperty("highLagThreshold") public int getHighLagThreshold() { return highLagThreshold; } + /** + * Represents the minimum duration between successful scale actions. + * A higher value implies a more conservative scaling behavior, ensuring that tasks + * are not scaled too frequently during workload fluctuations. + */ @JsonProperty public Duration getMinScaleDownDelay() { return minScaleDownDelay; } + /** + * Indicates whether task scaling down is limited to periods during task rollovers only. + * If set to {@code false}, allows scaling down during normal task run time. + */ @JsonProperty("scaleDownDuringTaskRolloverOnly") public boolean isScaleDownOnTaskRolloverOnly() {