From 131f9a68fe6ea4db64a2c54c31425dd215851a36 Mon Sep 17 00:00:00 2001 From: capistrant Date: Tue, 30 Sep 2025 11:34:50 -0500 Subject: [PATCH 01/17] Revert "Create Kubernetes peon lifecycle task log persist timeout (#18444)" This reverts commit 90be6827f8d2be3accd8eb925d106fb316edada7. --- docs/development/extensions-core/k8s-jobs.md | 1 - .../k8s/overlord/KubernetesPeonLifecycle.java | 37 +--- .../KubernetesPeonLifecycleFactory.java | 8 +- .../overlord/KubernetesTaskRunnerConfig.java | 23 --- .../overlord/KubernetesTaskRunnerFactory.java | 7 +- .../overlord/KubernetesPeonLifecycleTest.java | 167 ++++-------------- .../k8s/overlord/KubernetesWorkItemTest.java | 21 +-- 7 files changed, 45 insertions(+), 219 deletions(-) diff --git a/docs/development/extensions-core/k8s-jobs.md b/docs/development/extensions-core/k8s-jobs.md index 4d75a65ba6ff..e16479406b08 100644 --- a/docs/development/extensions-core/k8s-jobs.md +++ b/docs/development/extensions-core/k8s-jobs.md @@ -792,7 +792,6 @@ Should you require the needed permissions for interacting across Kubernetes name | `druid.indexer.runner.graceTerminationPeriodSeconds` | `Long` | Number of seconds you want to wait after a sigterm for container lifecycle hooks to complete. Keep at a smaller value if you want tasks to hold locks for shorter periods. | `PT30S` (K8s default) | No | | `druid.indexer.runner.capacity` | `Integer` | Number of concurrent jobs that can be sent to Kubernetes. | `2147483647` | No | | `druid.indexer.runner.cpuCoreInMicro` | `Integer` | Number of CPU micro core for the task. | `1000` | No | -| `druid.indexer.runner.logSaveTimeout` | `Duration` | How long to wait for task logs to be saved before giving up. | `PT300S` | NO | ### Metrics added diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java index dae8890a089b..a0e618b43249 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java @@ -34,7 +34,6 @@ import org.apache.druid.indexer.TaskStatus; import org.apache.druid.indexing.common.task.Task; import org.apache.druid.java.util.common.StringUtils; -import org.apache.druid.java.util.common.concurrent.Execs; import org.apache.druid.java.util.emitter.EmittingLogger; import org.apache.druid.k8s.overlord.common.DruidK8sConstants; import org.apache.druid.k8s.overlord.common.JobResponse; @@ -50,11 +49,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicReference; /** @@ -97,7 +92,6 @@ protected enum State private final ObjectMapper mapper; private final TaskStateListener stateListener; private final SettableFuture taskStartedSuccessfullyFuture; - private final long logSaveTimeoutMs; @MonotonicNonNull private LogWatch logWatch; @@ -110,8 +104,7 @@ protected KubernetesPeonLifecycle( KubernetesPeonClient kubernetesClient, TaskLogs taskLogs, ObjectMapper mapper, - TaskStateListener stateListener, - long logSaveTimeoutMs + TaskStateListener stateListener ) { this.task = task; @@ -121,7 +114,6 @@ protected KubernetesPeonLifecycle( this.mapper = mapper; this.stateListener = stateListener; this.taskStartedSuccessfullyFuture = SettableFuture.create(); - this.logSaveTimeoutMs = logSaveTimeoutMs; } /** @@ -355,33 +347,6 @@ protected void startWatchingLogs() } protected void saveLogs() - { - ExecutorService executor = Executors.newSingleThreadExecutor(Execs.makeThreadFactory("k8s-tasklog-persist-%d")); - try { - Future future = executor.submit(this::doSaveLogs); - future.get(logSaveTimeoutMs, TimeUnit.MILLISECONDS); - } - catch (TimeoutException e) { - log.warn("Persisting task logs timed out after %d ms for task [%s]. This does not have any impact on the" - + " work done by the task, but the logs may be innaccessible. If this continues to happen, check" - + " Kubernetes server logs for potential errors.", logSaveTimeoutMs, taskId.getOriginalTaskId()); - } - catch (Exception e) { - log.error(e, "Persisting task logs failed for task [%s] This does not have any impact on the" - + " work done by the task, but the logs may be innaccessible. If this continues to happen, check" - + " Kubernetes server logs for potential errors.", taskId.getOriginalTaskId()); - } - finally { - executor.shutdownNow(); - // shutdownNow does not always allow finally blocks to run, so we make sure to close the logWatch here too if it - // wasn't closed in doSaveLogs - if (logWatch != null) { - logWatch.close(); - } - } - } - - private void doSaveLogs() { try { Path file = Files.createTempFile(taskId.getOriginalTaskId(), "log"); diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleFactory.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleFactory.java index 8bd7db2ebf3e..71c37619357f 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleFactory.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleFactory.java @@ -30,19 +30,16 @@ public class KubernetesPeonLifecycleFactory implements PeonLifecycleFactory private final KubernetesPeonClient client; private final TaskLogs taskLogs; private final ObjectMapper mapper; - private final long logSaveTimeoutMs; public KubernetesPeonLifecycleFactory( KubernetesPeonClient client, TaskLogs taskLogs, - ObjectMapper mapper, - long logSaveTimeoutMs + ObjectMapper mapper ) { this.client = client; this.taskLogs = taskLogs; this.mapper = mapper; - this.logSaveTimeoutMs = logSaveTimeoutMs; } @Override @@ -54,8 +51,7 @@ public KubernetesPeonLifecycle build(Task task, K8sTaskId k8sTaskId, KubernetesP client, taskLogs, mapper, - stateListener, - logSaveTimeoutMs + stateListener ); } } diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java index d810c9ee23a2..ef4f0740e056 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java @@ -105,11 +105,6 @@ public class KubernetesTaskRunnerConfig // how long to wait for the peon k8s job to launch private Period k8sjobLaunchTimeout = new Period("PT1H"); - @JsonProperty - @NotNull - // how long to wait for log saving operations to complete - private Period logSaveTimeout = new Period("PT300S"); - @JsonProperty // ForkingTaskRunner inherits the monitors from the MM, in k8s mode // the peon inherits the monitors from the overlord, so if someone specifies @@ -157,7 +152,6 @@ private KubernetesTaskRunnerConfig( Period taskCleanupDelay, Period taskCleanupInterval, Period k8sjobLaunchTimeout, - Period logSaveTimeout, List peonMonitors, List javaOptsArray, int cpuCoreInMicro, @@ -214,10 +208,6 @@ private KubernetesTaskRunnerConfig( taskJoinTimeout, this.taskJoinTimeout ); - this.logSaveTimeout = ObjectUtils.defaultIfNull( - logSaveTimeout, - this.logSaveTimeout - ); this.peonMonitors = ObjectUtils.defaultIfNull( peonMonitors, this.peonMonitors @@ -316,11 +306,6 @@ public Period getTaskLaunchTimeout() return k8sjobLaunchTimeout; } - public Period getLogSaveTimeout() - { - return logSaveTimeout; - } - public List getPeonMonitors() { return peonMonitors; @@ -378,7 +363,6 @@ public static class Builder private Map annotations; private Integer capacity; private Period taskJoinTimeout; - private Period logSaveTimeout; public Builder() { @@ -505,12 +489,6 @@ public Builder withTaskJoinTimeout(Period taskJoinTimeout) return this; } - public Builder withLogSaveTimeout(Period logSaveTimeout) - { - this.logSaveTimeout = logSaveTimeout; - return this; - } - public KubernetesTaskRunnerConfig build() { return new KubernetesTaskRunnerConfig( @@ -527,7 +505,6 @@ public KubernetesTaskRunnerConfig build() this.taskCleanupDelay, this.taskCleanupInterval, this.k8sjobLaunchTimeout, - this.logSaveTimeout, this.peonMonitors, this.javaOptsArray, this.cpuCoreInMicro, diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerFactory.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerFactory.java index dd8111ed49ed..aac79e8d665d 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerFactory.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerFactory.java @@ -93,12 +93,7 @@ public KubernetesTaskRunner build() kubernetesTaskRunnerConfig, peonClient, httpClient, - new KubernetesPeonLifecycleFactory( - peonClient, - taskLogs, - smileMapper, - kubernetesTaskRunnerConfig.getLogSaveTimeout().toStandardDuration().getMillis() - ), + new KubernetesPeonLifecycleFactory(peonClient, taskLogs, smileMapper), emitter ); return runner; diff --git a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java index 4484a1076189..669df02b5894 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java +++ b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java @@ -41,7 +41,6 @@ import org.easymock.EasyMockRunner; import org.easymock.EasyMockSupport; import org.easymock.Mock; -import org.joda.time.Period; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -60,8 +59,6 @@ public class KubernetesPeonLifecycleTest extends EasyMockSupport { private static final String ID = "id"; private static final TaskStatus SUCCESS = TaskStatus.success(ID); - private static final Period LOG_SAVE_TIMEOUT = new Period("PT300S"); - private static final Period SHORT_LOG_SAVE_TIMEOUT = new Period("PT1S"); @Mock KubernetesPeonClient kubernetesClient; @Mock TaskLogs taskLogs; @@ -91,8 +88,7 @@ public void test_run() throws IOException kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ) { @Override @@ -138,8 +134,7 @@ public void test_run_useTaskManager() throws IOException kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ) { @Override @@ -184,8 +179,7 @@ public void test_run_whenCalledMultipleTimes_raisesIllegalStateException() throw kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ) { @Override @@ -235,8 +229,7 @@ public void test_run_whenExceptionRaised_setsRunnerTaskStateToNone() kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ) { @Override @@ -281,8 +274,7 @@ public void test_run_whenExceptionRaised_setsStartStatusFutureToFalse() throws E kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ) { @Override @@ -330,8 +322,7 @@ public void test_join_withoutJob_returnsFailedTaskStatus() throws IOException kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); EasyMock.expect(kubernetesClient.waitForPeonJobCompletion( @@ -370,8 +361,7 @@ public void test_join() throws IOException kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); Assert.assertFalse(peonLifecycle.getTaskStartedSuccessfullyFuture().isDone()); @@ -402,7 +392,7 @@ public void test_join() throws IOException stateListener.stateChanged(KubernetesPeonLifecycle.State.STOPPED, ID); EasyMock.expectLastCall().once(); logWatch.close(); - EasyMock.expectLastCall().atLeastOnce(); + EasyMock.expectLastCall(); Assert.assertEquals(KubernetesPeonLifecycle.State.NOT_STARTED, peonLifecycle.getState()); @@ -425,8 +415,7 @@ public void test_join_whenCalledMultipleTimes_raisesIllegalStateException() thro kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); Job job = new JobBuilder() @@ -452,11 +441,13 @@ public void test_join_whenCalledMultipleTimes_raisesIllegalStateException() thro taskLogs.pushTaskLog(EasyMock.eq(ID), EasyMock.anyObject(File.class)); EasyMock.expectLastCall(); logWatch.close(); - EasyMock.expectLastCall().atLeastOnce(); + EasyMock.expectLastCall(); stateListener.stateChanged(KubernetesPeonLifecycle.State.RUNNING, ID); EasyMock.expectLastCall().once(); stateListener.stateChanged(KubernetesPeonLifecycle.State.STOPPED, ID); EasyMock.expectLastCall().once(); + logWatch.close(); + EasyMock.expectLastCall(); Assert.assertEquals(KubernetesPeonLifecycle.State.NOT_STARTED, peonLifecycle.getState()); @@ -485,8 +476,7 @@ public void test_join_withoutTaskStatus_returnsFailedTaskStatus() throws IOExcep kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); Job job = new JobBuilder() @@ -534,8 +524,7 @@ public void test_join_whenIOExceptionThrownWhileStreamingTaskStatus_returnsFaile kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); Job job = new JobBuilder() @@ -561,7 +550,7 @@ public void test_join_whenIOExceptionThrownWhileStreamingTaskStatus_returnsFaile stateListener.stateChanged(KubernetesPeonLifecycle.State.STOPPED, ID); EasyMock.expectLastCall().once(); logWatch.close(); - EasyMock.expectLastCall().atLeastOnce(); + EasyMock.expectLastCall(); Assert.assertEquals(KubernetesPeonLifecycle.State.NOT_STARTED, peonLifecycle.getState()); @@ -586,8 +575,7 @@ public void test_join_whenIOExceptionThrownWhileStreamingTaskLogs_isIgnored() th kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); Job job = new JobBuilder() @@ -615,7 +603,7 @@ public void test_join_whenIOExceptionThrownWhileStreamingTaskLogs_isIgnored() th stateListener.stateChanged(KubernetesPeonLifecycle.State.STOPPED, ID); EasyMock.expectLastCall().once(); logWatch.close(); - EasyMock.expectLastCall().atLeastOnce(); + EasyMock.expectLastCall(); Assert.assertEquals(KubernetesPeonLifecycle.State.NOT_STARTED, peonLifecycle.getState()); @@ -638,8 +626,7 @@ public void test_join_whenRuntimeExceptionThrownWhileWaitingForKubernetesJob_thr kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); EasyMock.expect(kubernetesClient.waitForPeonJobCompletion( @@ -657,7 +644,7 @@ public void test_join_whenRuntimeExceptionThrownWhileWaitingForKubernetesJob_thr stateListener.stateChanged(KubernetesPeonLifecycle.State.STOPPED, ID); EasyMock.expectLastCall().once(); logWatch.close(); - EasyMock.expectLastCall().atLeastOnce(); + EasyMock.expectLastCall(); Assert.assertEquals(KubernetesPeonLifecycle.State.NOT_STARTED, peonLifecycle.getState()); @@ -679,8 +666,7 @@ public void test_shutdown_withNotStartedTaskState() kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); peonLifecycle.shutdown(); } @@ -694,8 +680,7 @@ public void test_shutdown_withPendingTaskState() throws NoSuchFieldException, Il kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.PENDING); @@ -717,8 +702,7 @@ public void test_shutdown_withRunningTaskState() throws NoSuchFieldException, Il kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -740,8 +724,7 @@ public void test_shutdown_withStoppedTaskState() throws NoSuchFieldException, Il kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.STOPPED); @@ -757,8 +740,7 @@ public void test_streamLogs_withNotStartedTaskState() throws NoSuchFieldExceptio kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.NOT_STARTED); @@ -774,8 +756,7 @@ public void test_streamLogs_withPendingTaskState() throws NoSuchFieldException, kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.PENDING); @@ -791,8 +772,7 @@ public void test_streamLogs_withRunningTaskState() throws NoSuchFieldException, kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -816,8 +796,7 @@ public void test_streamLogs_withStoppedTaskState() throws NoSuchFieldException, kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.STOPPED); @@ -834,8 +813,7 @@ public void test_getTaskLocation_withNotStartedTaskState_returnsUnknown() kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.NOT_STARTED); @@ -852,8 +830,7 @@ public void test_getTaskLocation_withPendingTaskState_returnsUnknown() kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.PENDING); @@ -870,8 +847,7 @@ public void test_getTaskLocation_withRunningTaskState_withoutPeonPod_returnsUnkn kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -894,8 +870,7 @@ public void test_getTaskLocation_withRunningTaskState_withPeonPodWithoutStatus_r kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -924,8 +899,7 @@ public void test_getTaskLocation_withRunningTaskState_withPeonPodWithStatus_retu kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -962,8 +936,7 @@ public void test_getTaskLocation_saveTaskLocation() kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -1000,8 +973,7 @@ public void test_getTaskLocation_withRunningTaskState_withPeonPodWithStatusWithT kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -1039,8 +1011,7 @@ public void test_getTaskLocation_withStoppedTaskState_returnsUnknown() kubernetesClient, taskLogs, mapper, - stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + stateListener ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.STOPPED); EasyMock.expect(kubernetesClient.getPeonPod(k8sTaskId.getK8sJobName())).andReturn(Optional.absent()).once(); @@ -1050,74 +1021,6 @@ public void test_getTaskLocation_withStoppedTaskState_returnsUnknown() verifyAll(); } - @Test - public void test_join_saveLogsTimeout_handledGracefully() throws IOException - { - KubernetesPeonLifecycle peonLifecycle = new KubernetesPeonLifecycle( - task, - k8sTaskId, - kubernetesClient, - taskLogs, - mapper, - stateListener, - SHORT_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() // 1 second timeout for task log save - ); - - Job job = new JobBuilder() - .withNewMetadata() - .withName(ID) - .endMetadata() - .withNewStatus() - .withSucceeded(1) - .withStartTime("2022-09-19T23:31:50Z") - .withCompletionTime("2022-09-19T23:32:48Z") - .endStatus() - .build(); - - EasyMock.expect(kubernetesClient.waitForPeonJobCompletion( - EasyMock.eq(k8sTaskId), - EasyMock.anyLong(), - EasyMock.eq(TimeUnit.MILLISECONDS) - )).andReturn(new JobResponse(job, PeonPhase.SUCCEEDED)); - - EasyMock.expect(kubernetesClient.getPeonLogWatcher(k8sTaskId)).andReturn(Optional.of(logWatch)); - EasyMock.expect(taskLogs.streamTaskStatus(ID)).andReturn(Optional.of( - IOUtils.toInputStream(mapper.writeValueAsString(SUCCESS), StandardCharsets.UTF_8) - )); - - // Mock pushTaskLog to sleep longer than the timeout (2 seconds > 1 second timeout) - taskLogs.pushTaskLog(EasyMock.eq(ID), EasyMock.anyObject(File.class)); - EasyMock.expectLastCall().andAnswer(() -> { - Thread.sleep(2000); // Sleep for 2 seconds - return null; - }); - - stateListener.stateChanged(KubernetesPeonLifecycle.State.RUNNING, ID); - EasyMock.expectLastCall().once(); - stateListener.stateChanged(KubernetesPeonLifecycle.State.STOPPED, ID); - EasyMock.expectLastCall().once(); - logWatch.close(); - EasyMock.expectLastCall().atLeastOnce(); - - Assert.assertEquals(KubernetesPeonLifecycle.State.NOT_STARTED, peonLifecycle.getState()); - - replayAll(); - - // The test should complete without hanging, even though log save times out - long startTime = System.currentTimeMillis(); - TaskStatus taskStatus = peonLifecycle.join(0L); - long duration = System.currentTimeMillis() - startTime; - - // Should complete in ~1 second (timeout), not 5+ seconds (sleep duration) - Assert.assertTrue("Test should complete quickly due to timeout", duration < 1500); - - verifyAll(); - - // Task should still succeed - log timeout doesn't affect task result - Assert.assertEquals(SUCCESS.withDuration(58000), taskStatus); - Assert.assertEquals(KubernetesPeonLifecycle.State.STOPPED, peonLifecycle.getState()); - } - private void setPeonLifecycleState(KubernetesPeonLifecycle peonLifecycle, KubernetesPeonLifecycle.State state) throws NoSuchFieldException, IllegalAccessException { diff --git a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java index 23c4b444811a..bfda735011b1 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java +++ b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java @@ -28,7 +28,6 @@ import org.easymock.EasyMockRunner; import org.easymock.EasyMockSupport; import org.easymock.Mock; -import org.joda.time.Period; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -37,8 +36,6 @@ @RunWith(EasyMockRunner.class) public class KubernetesWorkItemTest extends EasyMockSupport { - private static final Period LOG_SAVE_TIMEOUT = new Period("PT300S"); - private KubernetesWorkItem workItem; private Task task; private K8sTaskId taskId; @@ -133,8 +130,7 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_returnsPending() null, null, null, - null, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + null ); workItem = new KubernetesWorkItem(task, null, peonLifecycle); @@ -150,8 +146,7 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_inPendingState_r null, null, null, - null, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + null ) { @Override protected State getState() @@ -174,8 +169,7 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_inRunningState_r null, null, null, - null, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + null ) { @Override protected State getState() @@ -198,8 +192,7 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_inStoppedState_r null, null, null, - null, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + null ) { @Override protected State getState() @@ -222,8 +215,7 @@ public void test_streamTaskLogs_withKubernetesPeonLifecycle() null, null, null, - null, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + null ); workItem = new KubernetesWorkItem(task, null, peonLifecycle); Assert.assertFalse(workItem.streamTaskLogs().isPresent()); @@ -238,8 +230,7 @@ public void test_getLocation_withKubernetesPeonLifecycle() null, null, null, - null, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + null ); workItem = new KubernetesWorkItem(task, null, peonLifecycle); From c77dbf8096d8eaf52ae43beb21d20c0a803b033d Mon Sep 17 00:00:00 2001 From: capistrant Date: Tue, 30 Sep 2025 12:32:30 -0500 Subject: [PATCH 02/17] Expand LogWatch druid side timeouts to cover more surface area --- docs/development/extensions-core/k8s-jobs.md | 3 + .../k8s/overlord/KubernetesPeonLifecycle.java | 68 +++++++++- .../KubernetesPeonLifecycleFactory.java | 12 +- .../overlord/KubernetesTaskRunnerConfig.java | 50 +++++++- .../overlord/KubernetesTaskRunnerFactory.java | 8 +- .../overlord/KubernetesPeonLifecycleTest.java | 117 +++++++++++++----- .../k8s/overlord/KubernetesWorkItemTest.java | 27 +++- 7 files changed, 241 insertions(+), 44 deletions(-) diff --git a/docs/development/extensions-core/k8s-jobs.md b/docs/development/extensions-core/k8s-jobs.md index e16479406b08..c5ba9edb7dba 100644 --- a/docs/development/extensions-core/k8s-jobs.md +++ b/docs/development/extensions-core/k8s-jobs.md @@ -792,6 +792,9 @@ Should you require the needed permissions for interacting across Kubernetes name | `druid.indexer.runner.graceTerminationPeriodSeconds` | `Long` | Number of seconds you want to wait after a sigterm for container lifecycle hooks to complete. Keep at a smaller value if you want tasks to hold locks for shorter periods. | `PT30S` (K8s default) | No | | `druid.indexer.runner.capacity` | `Integer` | Number of concurrent jobs that can be sent to Kubernetes. | `2147483647` | No | | `druid.indexer.runner.cpuCoreInMicro` | `Integer` | Number of CPU micro core for the task. | `1000` | No | +| `druid.indexer.runner.logSaveTimeout` | `Duration` | How long to wait for task logs to be saved before giving up. | `PT300S` | NO | +| `druid.indexer.runner.logWatchInitializationTimeout` | `Duration` | How long to wait for Initializing a LogWatch on k8s peon pod before giving up. | `PT30S` | NO | + ### Metrics added diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java index a0e618b43249..249d66e0a7fe 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java @@ -34,6 +34,7 @@ import org.apache.druid.indexer.TaskStatus; import org.apache.druid.indexing.common.task.Task; import org.apache.druid.java.util.common.StringUtils; +import org.apache.druid.java.util.common.concurrent.Execs; import org.apache.druid.java.util.emitter.EmittingLogger; import org.apache.druid.k8s.overlord.common.DruidK8sConstants; import org.apache.druid.k8s.overlord.common.JobResponse; @@ -49,7 +50,11 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicReference; /** @@ -92,6 +97,9 @@ protected enum State private final ObjectMapper mapper; private final TaskStateListener stateListener; private final SettableFuture taskStartedSuccessfullyFuture; + private final ExecutorService logWatchExecutor; + private final long logWatchInitializationTimeoutMs; + private final long logWatchCopyLogTimeoutMs; @MonotonicNonNull private LogWatch logWatch; @@ -104,7 +112,9 @@ protected KubernetesPeonLifecycle( KubernetesPeonClient kubernetesClient, TaskLogs taskLogs, ObjectMapper mapper, - TaskStateListener stateListener + TaskStateListener stateListener, + long logWatchInitializationTimeoutMs, + long logWatchCopyLogTimeoutMs ) { this.task = task; @@ -114,6 +124,11 @@ protected KubernetesPeonLifecycle( this.mapper = mapper; this.stateListener = stateListener; this.taskStartedSuccessfullyFuture = SettableFuture.create(); + this.logWatchInitializationTimeoutMs = logWatchInitializationTimeoutMs; + this.logWatchCopyLogTimeoutMs = logWatchCopyLogTimeoutMs; + this.logWatchExecutor = Executors.newSingleThreadExecutor( + Execs.makeThreadFactory("k8s-logwatch-" + taskId.getOriginalTaskId() + "-%d") + ); } /** @@ -334,15 +349,27 @@ protected void startWatchingLogs() if (logWatch != null) { log.debug("There is already a log watcher for %s", taskId.getOriginalTaskId()); return; + } else if (logWatchExecutor.isShutdown()) { + log.warn("Log watch executor is shutdown for %s. Cannot start log watcher.", taskId.getOriginalTaskId()); + return; } try { - Optional maybeLogWatch = kubernetesClient.getPeonLogWatcher(taskId); + Future> future = logWatchExecutor.submit(() -> kubernetesClient.getPeonLogWatcher(taskId)); + Optional maybeLogWatch = future.get(logWatchInitializationTimeoutMs, TimeUnit.MILLISECONDS); if (maybeLogWatch.isPresent()) { logWatch = maybeLogWatch.get(); } } + catch (TimeoutException e) { + log.warn("Initializing log watcher timed out after %d ms for task [%s]. LogWatch not initialized.", + logWatchInitializationTimeoutMs, taskId.getOriginalTaskId()); + } + catch (InterruptedException e) { + Thread.currentThread().interrupt(); + log.warn("Initializing log watcher was interrupted for task [%s]. LogWatch not initialized.", taskId.getOriginalTaskId()); + } catch (Exception e) { - log.error(e, "Error watching logs from task: %s", taskId); + log.error(e, "Error watching logs from task: %s. LogWatch not initialized.", taskId); } } @@ -352,8 +379,37 @@ protected void saveLogs() Path file = Files.createTempFile(taskId.getOriginalTaskId(), "log"); try { startWatchingLogs(); - if (logWatch != null) { - FileUtils.copyInputStreamToFile(logWatch.getOutput(), file.toFile()); + if (logWatch != null && !logWatchExecutor.isShutdown()) { + // Copy log output with timeout protection + Future copyFuture = logWatchExecutor.submit(() -> { + try { + FileUtils.copyInputStreamToFile(logWatch.getOutput(), file.toFile()); + } + catch (IOException e) { + throw new RuntimeException(e); + } + }); + + try { + copyFuture.get(logWatchCopyLogTimeoutMs, TimeUnit.MILLISECONDS); + } + catch (TimeoutException e) { + log.warn("Copying task logs timed out after %d ms for task [%s]. Logs may be incomplete. This does not have any impact on the" + + " work done by the task, but the logs may be innaccessible. If this continues to happen, check" + + " Kubernetes server logs for potential errors.", + logWatchCopyLogTimeoutMs, taskId.getOriginalTaskId()); + } + catch (InterruptedException e) { + Thread.currentThread().interrupt(); + log.warn("Copying task logs was interrupted for task [%s]. Logs may be incomplete. This does not have any impact on the" + + " work done by the task, but the logs may be innaccessible. If this continues to happen, check" + + " Kubernetes server logs for potential errors.", taskId.getOriginalTaskId()); + } + catch (Exception e) { + log.error(e, "Failed to copy task logs for task [%s]. This does not have any impact on the" + + " work done by the task, but the logs may be innaccessible. If this continues to happen, check" + + " Kubernetes server logs for potential errors.", taskId.getOriginalTaskId()); + } } else { log.debug("Log stream not found for %s", taskId.getOriginalTaskId()); FileUtils.writeStringToFile( @@ -388,6 +444,8 @@ private void stopTask() if (!State.STOPPED.equals(state.get())) { updateState(new State[]{State.NOT_STARTED, State.PENDING, State.RUNNING}, State.STOPPED); } + // Shutdown the executor to release resources and interrupt any hanging log operations + logWatchExecutor.shutdownNow(); } private void updateState(State[] acceptedStates, State targetState) diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleFactory.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleFactory.java index 71c37619357f..1fc2a21e0beb 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleFactory.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleFactory.java @@ -30,16 +30,22 @@ public class KubernetesPeonLifecycleFactory implements PeonLifecycleFactory private final KubernetesPeonClient client; private final TaskLogs taskLogs; private final ObjectMapper mapper; + private final long logSaveTimeoutMs; + private final long logWatchInitTimeoutMs; public KubernetesPeonLifecycleFactory( KubernetesPeonClient client, TaskLogs taskLogs, - ObjectMapper mapper + ObjectMapper mapper, + long logSaveTimeoutMs, + long logWatchInitTimeoutMs ) { this.client = client; this.taskLogs = taskLogs; this.mapper = mapper; + this.logSaveTimeoutMs = logSaveTimeoutMs; + this.logWatchInitTimeoutMs = logWatchInitTimeoutMs; } @Override @@ -51,7 +57,9 @@ public KubernetesPeonLifecycle build(Task task, K8sTaskId k8sTaskId, KubernetesP client, taskLogs, mapper, - stateListener + stateListener, + logSaveTimeoutMs, + logWatchInitTimeoutMs ); } } diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java index ef4f0740e056..c46a8ec2a079 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java @@ -105,6 +105,16 @@ public class KubernetesTaskRunnerConfig // how long to wait for the peon k8s job to launch private Period k8sjobLaunchTimeout = new Period("PT1H"); + @JsonProperty + @NotNull + // how long to wait for log saving operations to complete + private Period logSaveTimeout = new Period("PT300S"); + + @JsonProperty + @NotNull + // how long to wait for log saving operations to complete + private Period logWatchInitializationTimeout = new Period("PT30S"); + @JsonProperty // ForkingTaskRunner inherits the monitors from the MM, in k8s mode // the peon inherits the monitors from the overlord, so if someone specifies @@ -158,7 +168,9 @@ private KubernetesTaskRunnerConfig( Map labels, Map annotations, Integer capacity, - Period taskJoinTimeout + Period taskJoinTimeout, + Period logSaveTimeout, + Period logWatchInitTimeout ) { this.namespace = namespace; @@ -232,6 +244,14 @@ private KubernetesTaskRunnerConfig( capacity, this.capacity ); + this.logSaveTimeout = ObjectUtils.defaultIfNull( + logSaveTimeout, + this.logSaveTimeout + ); + this.logWatchInitializationTimeout = ObjectUtils.defaultIfNull( + logWatchInitTimeout, + this.logWatchInitializationTimeout + ); } public String getNamespace() @@ -336,6 +356,16 @@ public Integer getCapacity() return capacity; } + public Period getLogSaveTimeout() + { + return logSaveTimeout; + } + + public Period getLogWatchInitializationTimeout() + { + return logWatchInitializationTimeout; + } + public static Builder builder() { return new Builder(); @@ -363,6 +393,8 @@ public static class Builder private Map annotations; private Integer capacity; private Period taskJoinTimeout; + private Period logSaveTimeout; + private Period logWatchInitializationTimeout; public Builder() { @@ -489,6 +521,18 @@ public Builder withTaskJoinTimeout(Period taskJoinTimeout) return this; } + public Builder withLogSaveTimeout(Period logSaveTimeout) + { + this.logSaveTimeout = logSaveTimeout; + return this; + } + + public Builder withLogWatchInitializationTimeout(Period logWatchInitTimeout) + { + this.logWatchInitializationTimeout = logWatchInitTimeout; + return this; + } + public KubernetesTaskRunnerConfig build() { return new KubernetesTaskRunnerConfig( @@ -511,7 +555,9 @@ public KubernetesTaskRunnerConfig build() this.labels, this.annotations, this.capacity, - this.taskJoinTimeout + this.taskJoinTimeout, + this.logSaveTimeout, + this.logWatchInitializationTimeout ); } } diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerFactory.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerFactory.java index aac79e8d665d..ce21a8814907 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerFactory.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerFactory.java @@ -93,7 +93,13 @@ public KubernetesTaskRunner build() kubernetesTaskRunnerConfig, peonClient, httpClient, - new KubernetesPeonLifecycleFactory(peonClient, taskLogs, smileMapper), + new KubernetesPeonLifecycleFactory( + peonClient, + taskLogs, + smileMapper, + kubernetesTaskRunnerConfig.getLogSaveTimeout().toStandardDuration().getMillis(), + kubernetesTaskRunnerConfig.getLogWatchInitializationTimeout().toStandardDuration().getMillis() + ), emitter ); return runner; diff --git a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java index 669df02b5894..379a464dccbb 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java +++ b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java @@ -41,6 +41,7 @@ import org.easymock.EasyMockRunner; import org.easymock.EasyMockSupport; import org.easymock.Mock; +import org.joda.time.Period; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -59,6 +60,10 @@ public class KubernetesPeonLifecycleTest extends EasyMockSupport { private static final String ID = "id"; private static final TaskStatus SUCCESS = TaskStatus.success(ID); + private static final Period LOG_SAVE_TIMEOUT = new Period("PT300S"); + private static final Period SHORT_LOG_SAVE_TIMEOUT = new Period("PT1S"); + private static final Period LOGWATCH_INIT_TIMEOUT = new Period("PT300S"); + private static final Period SHORT_LOGWATCH_INIT_TIMEOUT = new Period("PT1S"); @Mock KubernetesPeonClient kubernetesClient; @Mock TaskLogs taskLogs; @@ -88,7 +93,9 @@ public void test_run() throws IOException kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -134,7 +141,9 @@ public void test_run_useTaskManager() throws IOException kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -179,7 +188,9 @@ public void test_run_whenCalledMultipleTimes_raisesIllegalStateException() throw kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -229,7 +240,9 @@ public void test_run_whenExceptionRaised_setsRunnerTaskStateToNone() kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -274,7 +287,9 @@ public void test_run_whenExceptionRaised_setsStartStatusFutureToFalse() throws E kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -322,7 +337,9 @@ public void test_join_withoutJob_returnsFailedTaskStatus() throws IOException kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); EasyMock.expect(kubernetesClient.waitForPeonJobCompletion( @@ -361,7 +378,9 @@ public void test_join() throws IOException kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); Assert.assertFalse(peonLifecycle.getTaskStartedSuccessfullyFuture().isDone()); @@ -415,7 +434,9 @@ public void test_join_whenCalledMultipleTimes_raisesIllegalStateException() thro kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); Job job = new JobBuilder() @@ -476,7 +497,9 @@ public void test_join_withoutTaskStatus_returnsFailedTaskStatus() throws IOExcep kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); Job job = new JobBuilder() @@ -524,7 +547,9 @@ public void test_join_whenIOExceptionThrownWhileStreamingTaskStatus_returnsFaile kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); Job job = new JobBuilder() @@ -575,7 +600,9 @@ public void test_join_whenIOExceptionThrownWhileStreamingTaskLogs_isIgnored() th kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); Job job = new JobBuilder() @@ -626,7 +653,9 @@ public void test_join_whenRuntimeExceptionThrownWhileWaitingForKubernetesJob_thr kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); EasyMock.expect(kubernetesClient.waitForPeonJobCompletion( @@ -666,7 +695,9 @@ public void test_shutdown_withNotStartedTaskState() kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); peonLifecycle.shutdown(); } @@ -680,7 +711,9 @@ public void test_shutdown_withPendingTaskState() throws NoSuchFieldException, Il kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.PENDING); @@ -702,7 +735,9 @@ public void test_shutdown_withRunningTaskState() throws NoSuchFieldException, Il kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -724,7 +759,9 @@ public void test_shutdown_withStoppedTaskState() throws NoSuchFieldException, Il kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.STOPPED); @@ -740,7 +777,9 @@ public void test_streamLogs_withNotStartedTaskState() throws NoSuchFieldExceptio kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.NOT_STARTED); @@ -756,7 +795,9 @@ public void test_streamLogs_withPendingTaskState() throws NoSuchFieldException, kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.PENDING); @@ -772,7 +813,9 @@ public void test_streamLogs_withRunningTaskState() throws NoSuchFieldException, kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -796,7 +839,9 @@ public void test_streamLogs_withStoppedTaskState() throws NoSuchFieldException, kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.STOPPED); @@ -813,7 +858,9 @@ public void test_getTaskLocation_withNotStartedTaskState_returnsUnknown() kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.NOT_STARTED); @@ -830,7 +877,9 @@ public void test_getTaskLocation_withPendingTaskState_returnsUnknown() kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.PENDING); @@ -847,7 +896,9 @@ public void test_getTaskLocation_withRunningTaskState_withoutPeonPod_returnsUnkn kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -870,7 +921,9 @@ public void test_getTaskLocation_withRunningTaskState_withPeonPodWithoutStatus_r kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -899,7 +952,9 @@ public void test_getTaskLocation_withRunningTaskState_withPeonPodWithStatus_retu kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -936,7 +991,9 @@ public void test_getTaskLocation_saveTaskLocation() kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -973,7 +1030,9 @@ public void test_getTaskLocation_withRunningTaskState_withPeonPodWithStatusWithT kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -1011,7 +1070,9 @@ public void test_getTaskLocation_withStoppedTaskState_returnsUnknown() kubernetesClient, taskLogs, mapper, - stateListener + stateListener, + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), + LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.STOPPED); EasyMock.expect(kubernetesClient.getPeonPod(k8sTaskId.getK8sJobName())).andReturn(Optional.absent()).once(); diff --git a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java index bfda735011b1..ed1fedb85c77 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java +++ b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java @@ -28,6 +28,7 @@ import org.easymock.EasyMockRunner; import org.easymock.EasyMockSupport; import org.easymock.Mock; +import org.joda.time.Period; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -39,6 +40,8 @@ public class KubernetesWorkItemTest extends EasyMockSupport private KubernetesWorkItem workItem; private Task task; private K8sTaskId taskId; + private static final Period LOG_SAVE_TIMEOUT = new Period("PT300S"); + private static final Period LOG_WATCH_INIT_TIMEOUT = new Period("PT30S"); @Mock KubernetesPeonLifecycle kubernetesPeonLifecycle; @@ -130,7 +133,9 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_returnsPending() null, null, null, - null + null, + LOG_SAVE_TIMEOUT.getMillis(), + LOG_WATCH_INIT_TIMEOUT.getMillis() ); workItem = new KubernetesWorkItem(task, null, peonLifecycle); @@ -146,7 +151,9 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_inPendingState_r null, null, null, - null + null, + LOG_SAVE_TIMEOUT.getMillis(), + LOG_WATCH_INIT_TIMEOUT.getMillis() ) { @Override protected State getState() @@ -169,7 +176,9 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_inRunningState_r null, null, null, - null + null, + LOG_SAVE_TIMEOUT.getMillis(), + LOG_WATCH_INIT_TIMEOUT.getMillis() ) { @Override protected State getState() @@ -192,7 +201,9 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_inStoppedState_r null, null, null, - null + null, + LOG_SAVE_TIMEOUT.getMillis(), + LOG_WATCH_INIT_TIMEOUT.getMillis() ) { @Override protected State getState() @@ -215,7 +226,9 @@ public void test_streamTaskLogs_withKubernetesPeonLifecycle() null, null, null, - null + null, + LOG_SAVE_TIMEOUT.getMillis(), + LOG_WATCH_INIT_TIMEOUT.getMillis() ); workItem = new KubernetesWorkItem(task, null, peonLifecycle); Assert.assertFalse(workItem.streamTaskLogs().isPresent()); @@ -230,7 +243,9 @@ public void test_getLocation_withKubernetesPeonLifecycle() null, null, null, - null + null, + LOG_SAVE_TIMEOUT.getMillis(), + LOG_WATCH_INIT_TIMEOUT.getMillis() ); workItem = new KubernetesWorkItem(task, null, peonLifecycle); From dfcd7f26ef6ef2d0e3fe2fe0c62b0f59864183fc Mon Sep 17 00:00:00 2001 From: capistrant Date: Tue, 30 Sep 2025 12:43:46 -0500 Subject: [PATCH 03/17] update doc --- docs/development/extensions-core/k8s-jobs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/development/extensions-core/k8s-jobs.md b/docs/development/extensions-core/k8s-jobs.md index c5ba9edb7dba..c0c0ebb769a3 100644 --- a/docs/development/extensions-core/k8s-jobs.md +++ b/docs/development/extensions-core/k8s-jobs.md @@ -793,7 +793,7 @@ Should you require the needed permissions for interacting across Kubernetes name | `druid.indexer.runner.capacity` | `Integer` | Number of concurrent jobs that can be sent to Kubernetes. | `2147483647` | No | | `druid.indexer.runner.cpuCoreInMicro` | `Integer` | Number of CPU micro core for the task. | `1000` | No | | `druid.indexer.runner.logSaveTimeout` | `Duration` | How long to wait for task logs to be saved before giving up. | `PT300S` | NO | -| `druid.indexer.runner.logWatchInitializationTimeout` | `Duration` | How long to wait for Initializing a LogWatch on k8s peon pod before giving up. | `PT30S` | NO | +| `druid.indexer.runner.logWatchInitializationTimeout` | `Duration` | How long to wait when initializing a log watch for a peon pod before giving up. | `PT30S` | NO | ### Metrics added From bc0f1b41c05193871d018e715d13a586324728cd Mon Sep 17 00:00:00 2001 From: capistrant Date: Wed, 1 Oct 2025 11:31:59 -0500 Subject: [PATCH 04/17] Reduce memory hit on overlord for the per pod Executors by creating them on demand The idea is that these will only exist for short time per task, so creating them eagerly can be wasteful and cause memory pressure --- docs/development/extensions-core/k8s-jobs.md | 3 +- .../k8s/overlord/KubernetesPeonLifecycle.java | 34 +++---- .../KubernetesPeonLifecycleFactory.java | 12 +-- .../overlord/KubernetesTaskRunnerConfig.java | 45 +++------- .../overlord/KubernetesTaskRunnerFactory.java | 3 +- .../overlord/KubernetesPeonLifecycleTest.java | 89 ++++++------------- .../k8s/overlord/KubernetesWorkItemTest.java | 21 ++--- 7 files changed, 65 insertions(+), 142 deletions(-) diff --git a/docs/development/extensions-core/k8s-jobs.md b/docs/development/extensions-core/k8s-jobs.md index c0c0ebb769a3..119eca150ee3 100644 --- a/docs/development/extensions-core/k8s-jobs.md +++ b/docs/development/extensions-core/k8s-jobs.md @@ -792,8 +792,7 @@ Should you require the needed permissions for interacting across Kubernetes name | `druid.indexer.runner.graceTerminationPeriodSeconds` | `Long` | Number of seconds you want to wait after a sigterm for container lifecycle hooks to complete. Keep at a smaller value if you want tasks to hold locks for shorter periods. | `PT30S` (K8s default) | No | | `druid.indexer.runner.capacity` | `Integer` | Number of concurrent jobs that can be sent to Kubernetes. | `2147483647` | No | | `druid.indexer.runner.cpuCoreInMicro` | `Integer` | Number of CPU micro core for the task. | `1000` | No | -| `druid.indexer.runner.logSaveTimeout` | `Duration` | How long to wait for task logs to be saved before giving up. | `PT300S` | NO | -| `druid.indexer.runner.logWatchInitializationTimeout` | `Duration` | How long to wait when initializing a log watch for a peon pod before giving up. | `PT30S` | NO | +| `druid.indexer.runner.podLogOperationTimeout` | `Duration` | Timeout for async operations that interact with `k8s` pod logs | `PT300S` | NO | ### Metrics added diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java index 249d66e0a7fe..5b5be8f9cfb0 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java @@ -97,9 +97,7 @@ protected enum State private final ObjectMapper mapper; private final TaskStateListener stateListener; private final SettableFuture taskStartedSuccessfullyFuture; - private final ExecutorService logWatchExecutor; - private final long logWatchInitializationTimeoutMs; - private final long logWatchCopyLogTimeoutMs; + private final long logWatchOperationTimeoutMs; @MonotonicNonNull private LogWatch logWatch; @@ -113,8 +111,7 @@ protected KubernetesPeonLifecycle( TaskLogs taskLogs, ObjectMapper mapper, TaskStateListener stateListener, - long logWatchInitializationTimeoutMs, - long logWatchCopyLogTimeoutMs + long logWatchOperationTimeoutMs ) { this.task = task; @@ -124,11 +121,7 @@ protected KubernetesPeonLifecycle( this.mapper = mapper; this.stateListener = stateListener; this.taskStartedSuccessfullyFuture = SettableFuture.create(); - this.logWatchInitializationTimeoutMs = logWatchInitializationTimeoutMs; - this.logWatchCopyLogTimeoutMs = logWatchCopyLogTimeoutMs; - this.logWatchExecutor = Executors.newSingleThreadExecutor( - Execs.makeThreadFactory("k8s-logwatch-" + taskId.getOriginalTaskId() + "-%d") - ); + this.logWatchOperationTimeoutMs = logWatchOperationTimeoutMs; } /** @@ -349,20 +342,18 @@ protected void startWatchingLogs() if (logWatch != null) { log.debug("There is already a log watcher for %s", taskId.getOriginalTaskId()); return; - } else if (logWatchExecutor.isShutdown()) { - log.warn("Log watch executor is shutdown for %s. Cannot start log watcher.", taskId.getOriginalTaskId()); - return; } try { - Future> future = logWatchExecutor.submit(() -> kubernetesClient.getPeonLogWatcher(taskId)); - Optional maybeLogWatch = future.get(logWatchInitializationTimeoutMs, TimeUnit.MILLISECONDS); + ExecutorService executor = Executors.newSingleThreadExecutor(Execs.makeThreadFactory("k8s-tasklog-init-watch-%d")); + Future> future = executor.submit(() -> kubernetesClient.getPeonLogWatcher(taskId)); + Optional maybeLogWatch = future.get(logWatchOperationTimeoutMs, TimeUnit.MILLISECONDS); if (maybeLogWatch.isPresent()) { logWatch = maybeLogWatch.get(); } } catch (TimeoutException e) { log.warn("Initializing log watcher timed out after %d ms for task [%s]. LogWatch not initialized.", - logWatchInitializationTimeoutMs, taskId.getOriginalTaskId()); + logWatchOperationTimeoutMs, taskId.getOriginalTaskId()); } catch (InterruptedException e) { Thread.currentThread().interrupt(); @@ -379,9 +370,10 @@ protected void saveLogs() Path file = Files.createTempFile(taskId.getOriginalTaskId(), "log"); try { startWatchingLogs(); - if (logWatch != null && !logWatchExecutor.isShutdown()) { + if (logWatch != null) { // Copy log output with timeout protection - Future copyFuture = logWatchExecutor.submit(() -> { + ExecutorService executor = Executors.newSingleThreadExecutor(Execs.makeThreadFactory("k8s-tasklog-persist-%d")); + Future copyFuture = executor.submit(() -> { try { FileUtils.copyInputStreamToFile(logWatch.getOutput(), file.toFile()); } @@ -391,13 +383,13 @@ protected void saveLogs() }); try { - copyFuture.get(logWatchCopyLogTimeoutMs, TimeUnit.MILLISECONDS); + copyFuture.get(logWatchOperationTimeoutMs, TimeUnit.MILLISECONDS); } catch (TimeoutException e) { log.warn("Copying task logs timed out after %d ms for task [%s]. Logs may be incomplete. This does not have any impact on the" + " work done by the task, but the logs may be innaccessible. If this continues to happen, check" + " Kubernetes server logs for potential errors.", - logWatchCopyLogTimeoutMs, taskId.getOriginalTaskId()); + logWatchOperationTimeoutMs, taskId.getOriginalTaskId()); } catch (InterruptedException e) { Thread.currentThread().interrupt(); @@ -444,8 +436,6 @@ private void stopTask() if (!State.STOPPED.equals(state.get())) { updateState(new State[]{State.NOT_STARTED, State.PENDING, State.RUNNING}, State.STOPPED); } - // Shutdown the executor to release resources and interrupt any hanging log operations - logWatchExecutor.shutdownNow(); } private void updateState(State[] acceptedStates, State targetState) diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleFactory.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleFactory.java index 1fc2a21e0beb..10ab6ab9ae22 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleFactory.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleFactory.java @@ -30,22 +30,19 @@ public class KubernetesPeonLifecycleFactory implements PeonLifecycleFactory private final KubernetesPeonClient client; private final TaskLogs taskLogs; private final ObjectMapper mapper; - private final long logSaveTimeoutMs; - private final long logWatchInitTimeoutMs; + private final long podLogOperationTimeout; public KubernetesPeonLifecycleFactory( KubernetesPeonClient client, TaskLogs taskLogs, ObjectMapper mapper, - long logSaveTimeoutMs, - long logWatchInitTimeoutMs + long podLogOperationTimeout ) { this.client = client; this.taskLogs = taskLogs; this.mapper = mapper; - this.logSaveTimeoutMs = logSaveTimeoutMs; - this.logWatchInitTimeoutMs = logWatchInitTimeoutMs; + this.podLogOperationTimeout = podLogOperationTimeout; } @Override @@ -58,8 +55,7 @@ public KubernetesPeonLifecycle build(Task task, K8sTaskId k8sTaskId, KubernetesP taskLogs, mapper, stateListener, - logSaveTimeoutMs, - logWatchInitTimeoutMs + podLogOperationTimeout ); } } diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java index c46a8ec2a079..0ed37a6916bd 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java @@ -108,12 +108,7 @@ public class KubernetesTaskRunnerConfig @JsonProperty @NotNull // how long to wait for log saving operations to complete - private Period logSaveTimeout = new Period("PT300S"); - - @JsonProperty - @NotNull - // how long to wait for log saving operations to complete - private Period logWatchInitializationTimeout = new Period("PT30S"); + private Period podLogOperationTimeout = new Period("PT300S"); @JsonProperty // ForkingTaskRunner inherits the monitors from the MM, in k8s mode @@ -169,8 +164,7 @@ private KubernetesTaskRunnerConfig( Map annotations, Integer capacity, Period taskJoinTimeout, - Period logSaveTimeout, - Period logWatchInitTimeout + Period podLogOperationTimeout ) { this.namespace = namespace; @@ -244,13 +238,9 @@ private KubernetesTaskRunnerConfig( capacity, this.capacity ); - this.logSaveTimeout = ObjectUtils.defaultIfNull( - logSaveTimeout, - this.logSaveTimeout - ); - this.logWatchInitializationTimeout = ObjectUtils.defaultIfNull( - logWatchInitTimeout, - this.logWatchInitializationTimeout + this.podLogOperationTimeout = ObjectUtils.defaultIfNull( + podLogOperationTimeout, + this.podLogOperationTimeout ); } @@ -356,14 +346,9 @@ public Integer getCapacity() return capacity; } - public Period getLogSaveTimeout() - { - return logSaveTimeout; - } - - public Period getLogWatchInitializationTimeout() + public Period getPodLogOperationTimeout() { - return logWatchInitializationTimeout; + return podLogOperationTimeout; } public static Builder builder() @@ -393,8 +378,7 @@ public static class Builder private Map annotations; private Integer capacity; private Period taskJoinTimeout; - private Period logSaveTimeout; - private Period logWatchInitializationTimeout; + private Period podLogOperationTimeout; public Builder() { @@ -521,15 +505,9 @@ public Builder withTaskJoinTimeout(Period taskJoinTimeout) return this; } - public Builder withLogSaveTimeout(Period logSaveTimeout) - { - this.logSaveTimeout = logSaveTimeout; - return this; - } - - public Builder withLogWatchInitializationTimeout(Period logWatchInitTimeout) + public Builder withPodLogOperationTimeout(Period podLogOperationTimeout) { - this.logWatchInitializationTimeout = logWatchInitTimeout; + this.podLogOperationTimeout = podLogOperationTimeout; return this; } @@ -556,8 +534,7 @@ public KubernetesTaskRunnerConfig build() this.annotations, this.capacity, this.taskJoinTimeout, - this.logSaveTimeout, - this.logWatchInitializationTimeout + this.podLogOperationTimeout ); } } diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerFactory.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerFactory.java index ce21a8814907..3d0e77965f5f 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerFactory.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerFactory.java @@ -97,8 +97,7 @@ public KubernetesTaskRunner build() peonClient, taskLogs, smileMapper, - kubernetesTaskRunnerConfig.getLogSaveTimeout().toStandardDuration().getMillis(), - kubernetesTaskRunnerConfig.getLogWatchInitializationTimeout().toStandardDuration().getMillis() + kubernetesTaskRunnerConfig.getPodLogOperationTimeout().toStandardDuration().getMillis() ), emitter ); diff --git a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java index 379a464dccbb..fca4fa0a33ba 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java +++ b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java @@ -60,10 +60,7 @@ public class KubernetesPeonLifecycleTest extends EasyMockSupport { private static final String ID = "id"; private static final TaskStatus SUCCESS = TaskStatus.success(ID); - private static final Period LOG_SAVE_TIMEOUT = new Period("PT300S"); - private static final Period SHORT_LOG_SAVE_TIMEOUT = new Period("PT1S"); - private static final Period LOGWATCH_INIT_TIMEOUT = new Period("PT300S"); - private static final Period SHORT_LOGWATCH_INIT_TIMEOUT = new Period("PT1S"); + private static final Period POD_LOG_OPERATION_TIMEOUT = new Period("PT300S"); @Mock KubernetesPeonClient kubernetesClient; @Mock TaskLogs taskLogs; @@ -94,8 +91,7 @@ public void test_run() throws IOException taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -142,8 +138,7 @@ public void test_run_useTaskManager() throws IOException taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -189,8 +184,7 @@ public void test_run_whenCalledMultipleTimes_raisesIllegalStateException() throw taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -241,8 +235,7 @@ public void test_run_whenExceptionRaised_setsRunnerTaskStateToNone() taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -288,8 +281,7 @@ public void test_run_whenExceptionRaised_setsStartStatusFutureToFalse() throws E taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -338,8 +330,7 @@ public void test_join_withoutJob_returnsFailedTaskStatus() throws IOException taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); EasyMock.expect(kubernetesClient.waitForPeonJobCompletion( @@ -379,8 +370,7 @@ public void test_join() throws IOException taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); Assert.assertFalse(peonLifecycle.getTaskStartedSuccessfullyFuture().isDone()); @@ -435,8 +425,7 @@ public void test_join_whenCalledMultipleTimes_raisesIllegalStateException() thro taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); Job job = new JobBuilder() @@ -498,8 +487,7 @@ public void test_join_withoutTaskStatus_returnsFailedTaskStatus() throws IOExcep taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); Job job = new JobBuilder() @@ -548,8 +536,7 @@ public void test_join_whenIOExceptionThrownWhileStreamingTaskStatus_returnsFaile taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); Job job = new JobBuilder() @@ -601,8 +588,7 @@ public void test_join_whenIOExceptionThrownWhileStreamingTaskLogs_isIgnored() th taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); Job job = new JobBuilder() @@ -654,8 +640,7 @@ public void test_join_whenRuntimeExceptionThrownWhileWaitingForKubernetesJob_thr taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); EasyMock.expect(kubernetesClient.waitForPeonJobCompletion( @@ -696,8 +681,7 @@ public void test_shutdown_withNotStartedTaskState() taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); peonLifecycle.shutdown(); } @@ -712,8 +696,7 @@ public void test_shutdown_withPendingTaskState() throws NoSuchFieldException, Il taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.PENDING); @@ -736,8 +719,7 @@ public void test_shutdown_withRunningTaskState() throws NoSuchFieldException, Il taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -760,8 +742,7 @@ public void test_shutdown_withStoppedTaskState() throws NoSuchFieldException, Il taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.STOPPED); @@ -778,8 +759,7 @@ public void test_streamLogs_withNotStartedTaskState() throws NoSuchFieldExceptio taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.NOT_STARTED); @@ -796,8 +776,7 @@ public void test_streamLogs_withPendingTaskState() throws NoSuchFieldException, taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.PENDING); @@ -814,8 +793,7 @@ public void test_streamLogs_withRunningTaskState() throws NoSuchFieldException, taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -840,8 +818,7 @@ public void test_streamLogs_withStoppedTaskState() throws NoSuchFieldException, taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.STOPPED); @@ -859,8 +836,7 @@ public void test_getTaskLocation_withNotStartedTaskState_returnsUnknown() taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.NOT_STARTED); @@ -878,8 +854,7 @@ public void test_getTaskLocation_withPendingTaskState_returnsUnknown() taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.PENDING); @@ -897,8 +872,7 @@ public void test_getTaskLocation_withRunningTaskState_withoutPeonPod_returnsUnkn taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -922,8 +896,7 @@ public void test_getTaskLocation_withRunningTaskState_withPeonPodWithoutStatus_r taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -953,8 +926,7 @@ public void test_getTaskLocation_withRunningTaskState_withPeonPodWithStatus_retu taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -992,8 +964,7 @@ public void test_getTaskLocation_saveTaskLocation() taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -1031,8 +1002,7 @@ public void test_getTaskLocation_withRunningTaskState_withPeonPodWithStatusWithT taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -1071,8 +1041,7 @@ public void test_getTaskLocation_withStoppedTaskState_returnsUnknown() taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT.toStandardDuration().getMillis(), - LOGWATCH_INIT_TIMEOUT.toStandardDuration().getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.STOPPED); EasyMock.expect(kubernetesClient.getPeonPod(k8sTaskId.getK8sJobName())).andReturn(Optional.absent()).once(); diff --git a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java index ed1fedb85c77..cbeb2820dfbf 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java +++ b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java @@ -40,8 +40,7 @@ public class KubernetesWorkItemTest extends EasyMockSupport private KubernetesWorkItem workItem; private Task task; private K8sTaskId taskId; - private static final Period LOG_SAVE_TIMEOUT = new Period("PT300S"); - private static final Period LOG_WATCH_INIT_TIMEOUT = new Period("PT30S"); + private static final Period POD_LOG_OPERATION_TIMEOUT = new Period("PT30S"); @Mock KubernetesPeonLifecycle kubernetesPeonLifecycle; @@ -134,8 +133,7 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_returnsPending() null, null, null, - LOG_SAVE_TIMEOUT.getMillis(), - LOG_WATCH_INIT_TIMEOUT.getMillis() + POD_LOG_OPERATION_TIMEOUT.getMillis() ); workItem = new KubernetesWorkItem(task, null, peonLifecycle); @@ -152,8 +150,7 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_inPendingState_r null, null, null, - LOG_SAVE_TIMEOUT.getMillis(), - LOG_WATCH_INIT_TIMEOUT.getMillis() + POD_LOG_OPERATION_TIMEOUT.getMillis() ) { @Override protected State getState() @@ -177,8 +174,7 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_inRunningState_r null, null, null, - LOG_SAVE_TIMEOUT.getMillis(), - LOG_WATCH_INIT_TIMEOUT.getMillis() + POD_LOG_OPERATION_TIMEOUT.getMillis() ) { @Override protected State getState() @@ -202,8 +198,7 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_inStoppedState_r null, null, null, - LOG_SAVE_TIMEOUT.getMillis(), - LOG_WATCH_INIT_TIMEOUT.getMillis() + POD_LOG_OPERATION_TIMEOUT.getMillis() ) { @Override protected State getState() @@ -227,8 +222,7 @@ public void test_streamTaskLogs_withKubernetesPeonLifecycle() null, null, null, - LOG_SAVE_TIMEOUT.getMillis(), - LOG_WATCH_INIT_TIMEOUT.getMillis() + POD_LOG_OPERATION_TIMEOUT.getMillis() ); workItem = new KubernetesWorkItem(task, null, peonLifecycle); Assert.assertFalse(workItem.streamTaskLogs().isPresent()); @@ -244,8 +238,7 @@ public void test_getLocation_withKubernetesPeonLifecycle() null, null, null, - LOG_SAVE_TIMEOUT.getMillis(), - LOG_WATCH_INIT_TIMEOUT.getMillis() + POD_LOG_OPERATION_TIMEOUT.getMillis() ); workItem = new KubernetesWorkItem(task, null, peonLifecycle); From 633475df4532ffc11c01775ba7cfb483117e2836 Mon Sep 17 00:00:00 2001 From: capistrant Date: Wed, 1 Oct 2025 18:04:26 -0500 Subject: [PATCH 05/17] Test new fabric8 logwatch timeouts and shut down executors --- .../k8s/overlord/KubernetesPeonLifecycle.java | 8 ++- .../overlord/KubernetesPeonLifecycleTest.java | 72 +++++++++++++++++++ .../k8s/overlord/KubernetesWorkItemTest.java | 12 ++-- 3 files changed, 85 insertions(+), 7 deletions(-) diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java index 5b5be8f9cfb0..0c9f96056464 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java @@ -343,8 +343,8 @@ protected void startWatchingLogs() log.debug("There is already a log watcher for %s", taskId.getOriginalTaskId()); return; } + ExecutorService executor = Executors.newSingleThreadExecutor(Execs.makeThreadFactory("k8s-tasklog-init-watch-%d")); try { - ExecutorService executor = Executors.newSingleThreadExecutor(Execs.makeThreadFactory("k8s-tasklog-init-watch-%d")); Future> future = executor.submit(() -> kubernetesClient.getPeonLogWatcher(taskId)); Optional maybeLogWatch = future.get(logWatchOperationTimeoutMs, TimeUnit.MILLISECONDS); if (maybeLogWatch.isPresent()) { @@ -362,6 +362,9 @@ protected void startWatchingLogs() catch (Exception e) { log.error(e, "Error watching logs from task: %s. LogWatch not initialized.", taskId); } + finally { + executor.shutdownNow(); + } } protected void saveLogs() @@ -402,6 +405,9 @@ protected void saveLogs() + " work done by the task, but the logs may be innaccessible. If this continues to happen, check" + " Kubernetes server logs for potential errors.", taskId.getOriginalTaskId()); } + finally { + executor.shutdownNow(); + } } else { log.debug("Log stream not found for %s", taskId.getOriginalTaskId()); FileUtils.writeStringToFile( diff --git a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java index fca4fa0a33ba..92333472c219 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java +++ b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java @@ -49,6 +49,7 @@ import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.lang.reflect.Field; import java.nio.charset.StandardCharsets; import java.util.concurrent.ExecutionException; @@ -61,6 +62,7 @@ public class KubernetesPeonLifecycleTest extends EasyMockSupport private static final String ID = "id"; private static final TaskStatus SUCCESS = TaskStatus.success(ID); private static final Period POD_LOG_OPERATION_TIMEOUT = new Period("PT300S"); + private static final Period POD_LOG_OPERATION_TIMEOUT_SHORT = new Period("PT1S"); @Mock KubernetesPeonClient kubernetesClient; @Mock TaskLogs taskLogs; @@ -1051,6 +1053,76 @@ public void test_getTaskLocation_withStoppedTaskState_returnsUnknown() verifyAll(); } + @Test + public void test_startWatchingLogs_logWatchInitialize_timeout() + { + KubernetesPeonLifecycle peonLifecycle = new KubernetesPeonLifecycle( + task, + k8sTaskId, + kubernetesClient, + taskLogs, + mapper, + stateListener, + POD_LOG_OPERATION_TIMEOUT_SHORT.toStandardDuration().getMillis() + ); + EasyMock.expect(kubernetesClient.getPeonLogWatcher(k8sTaskId)) + .andAnswer(() -> { + Thread.sleep(5000); // Exceeds 1 second timeout + return Optional.of(logWatch); + }); + + replayAll(); + long startTime = System.currentTimeMillis(); + peonLifecycle.startWatchingLogs(); + long duration = System.currentTimeMillis() - startTime; + // Anything less than 5 seconds means the Executor timeed out correctly, because the mock sleeps for 5 seconds + Assert.assertTrue("Test should complete quickly due to timeout", duration < 2500); + verifyAll(); + } + + @Test + public void test_saveLogs_streamLogs_timeout() throws IOException + { + EasyMock.reset(logWatch); + + InputStream slowInputStream = new InputStream() { + @Override + public int read() throws IOException { + try { + Thread.sleep(5000); + } catch (InterruptedException e) { + throw new IOException(e); + } + return -1; + } + }; + + KubernetesPeonLifecycle peonLifecycle = new KubernetesPeonLifecycle( + task, + k8sTaskId, + kubernetesClient, + taskLogs, + mapper, + stateListener, + POD_LOG_OPERATION_TIMEOUT_SHORT.toStandardDuration().getMillis() + ); + EasyMock.expect(kubernetesClient.getPeonLogWatcher(k8sTaskId)).andReturn(Optional.of(logWatch)).once(); + EasyMock.expect(logWatch.getOutput()).andReturn(slowInputStream); + logWatch.close(); + EasyMock.expectLastCall(); + taskLogs.pushTaskLog(EasyMock.eq(ID), EasyMock.anyObject(File.class)); + EasyMock.expectLastCall(); + + replayAll(); + long startTime = System.currentTimeMillis(); + peonLifecycle.saveLogs(); + long duration = System.currentTimeMillis() - startTime; + // Anything less than 5 seconds means the Executor timeed out correctly, because the mock sleeps for 5 seconds + Assert.assertTrue("Test should complete quickly due to timeout", duration < 2500); + verifyAll(); + + } + private void setPeonLifecycleState(KubernetesPeonLifecycle peonLifecycle, KubernetesPeonLifecycle.State state) throws NoSuchFieldException, IllegalAccessException { diff --git a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java index cbeb2820dfbf..5bebb672f508 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java +++ b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java @@ -133,7 +133,7 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_returnsPending() null, null, null, - POD_LOG_OPERATION_TIMEOUT.getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); workItem = new KubernetesWorkItem(task, null, peonLifecycle); @@ -150,7 +150,7 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_inPendingState_r null, null, null, - POD_LOG_OPERATION_TIMEOUT.getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ) { @Override protected State getState() @@ -174,7 +174,7 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_inRunningState_r null, null, null, - POD_LOG_OPERATION_TIMEOUT.getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ) { @Override protected State getState() @@ -198,7 +198,7 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_inStoppedState_r null, null, null, - POD_LOG_OPERATION_TIMEOUT.getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ) { @Override protected State getState() @@ -222,7 +222,7 @@ public void test_streamTaskLogs_withKubernetesPeonLifecycle() null, null, null, - POD_LOG_OPERATION_TIMEOUT.getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); workItem = new KubernetesWorkItem(task, null, peonLifecycle); Assert.assertFalse(workItem.streamTaskLogs().isPresent()); @@ -238,7 +238,7 @@ public void test_getLocation_withKubernetesPeonLifecycle() null, null, null, - POD_LOG_OPERATION_TIMEOUT.getMillis() + POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() ); workItem = new KubernetesWorkItem(task, null, peonLifecycle); From 391b8f74096c8a760ddf5c83ef8018545f986575 Mon Sep 17 00:00:00 2001 From: capistrant Date: Wed, 1 Oct 2025 18:05:12 -0500 Subject: [PATCH 06/17] fix formatting --- .../druid/k8s/overlord/KubernetesPeonLifecycleTest.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java index 92333472c219..5ae1576b4a8f 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java +++ b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java @@ -1087,10 +1087,12 @@ public void test_saveLogs_streamLogs_timeout() throws IOException InputStream slowInputStream = new InputStream() { @Override - public int read() throws IOException { + public int read() throws IOException + { try { Thread.sleep(5000); - } catch (InterruptedException e) { + } + catch (InterruptedException e) { throw new IOException(e); } return -1; From a8f48bd8cab092a5ce1e0956ee99cc37eb930b4f Mon Sep 17 00:00:00 2001 From: capistrant Date: Thu, 2 Oct 2025 09:08:08 -0500 Subject: [PATCH 07/17] Extract the repetitive try with timeout code into a reusable method --- .../k8s/overlord/KubernetesPeonLifecycle.java | 120 ++++++++++-------- 1 file changed, 66 insertions(+), 54 deletions(-) diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java index 0c9f96056464..0a50de15ddd1 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java @@ -42,6 +42,7 @@ import org.apache.druid.k8s.overlord.common.KubernetesPeonClient; import org.apache.druid.tasklogs.TaskLogs; import org.checkerframework.checker.nullness.qual.MonotonicNonNull; +import org.jetbrains.annotations.Nullable; import java.io.IOException; import java.io.InputStream; @@ -50,6 +51,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; +import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; @@ -337,36 +339,37 @@ private TaskStatus getTaskStatus(long duration) return taskStatus.withDuration(duration); } + /** + * Attempts to initialize a logWatch for the peon pod if one does not already exist. + *

+ * It is not guaranteed that a logWatch will be successfully initialized with this call. + *

+ */ protected void startWatchingLogs() { if (logWatch != null) { log.debug("There is already a log watcher for %s", taskId.getOriginalTaskId()); return; } - ExecutorService executor = Executors.newSingleThreadExecutor(Execs.makeThreadFactory("k8s-tasklog-init-watch-%d")); - try { - Future> future = executor.submit(() -> kubernetesClient.getPeonLogWatcher(taskId)); - Optional maybeLogWatch = future.get(logWatchOperationTimeoutMs, TimeUnit.MILLISECONDS); - if (maybeLogWatch.isPresent()) { - logWatch = maybeLogWatch.get(); - } - } - catch (TimeoutException e) { - log.warn("Initializing log watcher timed out after %d ms for task [%s]. LogWatch not initialized.", - logWatchOperationTimeoutMs, taskId.getOriginalTaskId()); - } - catch (InterruptedException e) { - Thread.currentThread().interrupt(); - log.warn("Initializing log watcher was interrupted for task [%s]. LogWatch not initialized.", taskId.getOriginalTaskId()); - } - catch (Exception e) { - log.error(e, "Error watching logs from task: %s. LogWatch not initialized.", taskId); - } - finally { - executor.shutdownNow(); + Optional maybeLogWatch = executeWithTimeout( + () -> kubernetesClient.getPeonLogWatcher(taskId), + logWatchOperationTimeoutMs, + "initializing K8s LogWatch", + "LogWatch failed to initialize. Peon may not be able to stream and persist task logs." + + " If this continues to happen, check Kubernetes server logs for potential errors." + ); + if (maybeLogWatch != null && maybeLogWatch.isPresent()) { + logWatch = maybeLogWatch.get(); } } + /** + * Saves logs from the peon pod to deep storage via the TaskLogs interface. + *

+ * This method does not gaurantee that logs will be successfully saved. It makes a best-effort attempt to copy + * the logs from the Peon and push them to deep storage. + *

+ */ protected void saveLogs() { try { @@ -375,39 +378,17 @@ protected void saveLogs() startWatchingLogs(); if (logWatch != null) { // Copy log output with timeout protection - ExecutorService executor = Executors.newSingleThreadExecutor(Execs.makeThreadFactory("k8s-tasklog-persist-%d")); - Future copyFuture = executor.submit(() -> { - try { - FileUtils.copyInputStreamToFile(logWatch.getOutput(), file.toFile()); - } - catch (IOException e) { - throw new RuntimeException(e); - } - }); - - try { - copyFuture.get(logWatchOperationTimeoutMs, TimeUnit.MILLISECONDS); - } - catch (TimeoutException e) { - log.warn("Copying task logs timed out after %d ms for task [%s]. Logs may be incomplete. This does not have any impact on the" - + " work done by the task, but the logs may be innaccessible. If this continues to happen, check" - + " Kubernetes server logs for potential errors.", - logWatchOperationTimeoutMs, taskId.getOriginalTaskId()); - } - catch (InterruptedException e) { - Thread.currentThread().interrupt(); - log.warn("Copying task logs was interrupted for task [%s]. Logs may be incomplete. This does not have any impact on the" - + " work done by the task, but the logs may be innaccessible. If this continues to happen, check" - + " Kubernetes server logs for potential errors.", taskId.getOriginalTaskId()); - } - catch (Exception e) { - log.error(e, "Failed to copy task logs for task [%s]. This does not have any impact on the" - + " work done by the task, but the logs may be innaccessible. If this continues to happen, check" - + " Kubernetes server logs for potential errors.", taskId.getOriginalTaskId()); - } - finally { - executor.shutdownNow(); - } + executeWithTimeout( + () -> { + FileUtils.copyInputStreamToFile(logWatch.getOutput(), file.toFile()); + return null; + }, + logWatchOperationTimeoutMs, + "coyping and persisting task logs", + "This failure does not have any impact on the" + + " ingestion work done by the task, but the logs may be partial or innaccessible. If " + + " this continues to happen, check Kubernetes server logs for potential errors." + ); } else { log.debug("Log stream not found for %s", taskId.getOriginalTaskId()); FileUtils.writeStringToFile( @@ -468,4 +449,35 @@ protected ListenableFuture getTaskStartedSuccessfullyFuture() { return taskStartedSuccessfullyFuture; } + + /** + * Executes a callable with a timeout. + *

+ * If the callable does not complete within the specified timeout or another exception occurs, the error will be + * logged and null will be returned. + *

+ */ + private @Nullable T executeWithTimeout(Callable runnable, long timeoutMillis, String operationName, String errorMessage) + { + ExecutorService executor = Executors.newSingleThreadExecutor( + Execs.makeThreadFactory("k8s-peon-lifecycle-util-" + taskId.getOriginalTaskId() + "-%d")); + try { + Future future = executor.submit(runnable); + return future.get(timeoutMillis, TimeUnit.MILLISECONDS); + } + catch (TimeoutException e) { + log.warn("Operation [%s] timed out after %d ms for task [%s]. %s", operationName, timeoutMillis, taskId.getOriginalTaskId(), errorMessage); + } + catch (InterruptedException e) { + Thread.currentThread().interrupt(); + log.warn("Operation [%s] was interrupted for task [%s]. %s", operationName, taskId.getOriginalTaskId(), errorMessage); + } + catch (Exception e) { + log.error(e, "Error during operation [%s] for task: %s. %s", operationName, taskId, errorMessage); + } + finally { + executor.shutdownNow(); + } + return null; + } } From 26b0972fe8b06f96eeccf6cb27cfc1edd0422760 Mon Sep 17 00:00:00 2001 From: capistrant Date: Thu, 2 Oct 2025 09:25:49 -0500 Subject: [PATCH 08/17] refactor config back to original name --- docs/development/extensions-core/k8s-jobs.md | 2 +- .../k8s/overlord/KubernetesPeonLifecycle.java | 10 +-- .../overlord/KubernetesTaskRunnerConfig.java | 22 +++---- .../overlord/KubernetesTaskRunnerFactory.java | 2 +- .../overlord/KubernetesPeonLifecycleTest.java | 64 +++++++++---------- 5 files changed, 50 insertions(+), 50 deletions(-) diff --git a/docs/development/extensions-core/k8s-jobs.md b/docs/development/extensions-core/k8s-jobs.md index 119eca150ee3..cb71fe95edef 100644 --- a/docs/development/extensions-core/k8s-jobs.md +++ b/docs/development/extensions-core/k8s-jobs.md @@ -792,7 +792,7 @@ Should you require the needed permissions for interacting across Kubernetes name | `druid.indexer.runner.graceTerminationPeriodSeconds` | `Long` | Number of seconds you want to wait after a sigterm for container lifecycle hooks to complete. Keep at a smaller value if you want tasks to hold locks for shorter periods. | `PT30S` (K8s default) | No | | `druid.indexer.runner.capacity` | `Integer` | Number of concurrent jobs that can be sent to Kubernetes. | `2147483647` | No | | `druid.indexer.runner.cpuCoreInMicro` | `Integer` | Number of CPU micro core for the task. | `1000` | No | -| `druid.indexer.runner.podLogOperationTimeout` | `Duration` | Timeout for async operations that interact with `k8s` pod logs | `PT300S` | NO | +| `druid.indexer.runner.logSaveTimeout` | `Duration` | The peon executing the ingestion task makes a best effort to persist the pod logs from `k8s` to persistent task log storage. A timeout for this log persist is set to avoid `k8s` connection issues causing the peon to hang indefinitely if issues occur during the persisting of said logs. | `PT300S` | NO | ### Metrics added diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java index 0a50de15ddd1..95e3d4ebc490 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java @@ -99,7 +99,7 @@ protected enum State private final ObjectMapper mapper; private final TaskStateListener stateListener; private final SettableFuture taskStartedSuccessfullyFuture; - private final long logWatchOperationTimeoutMs; + private final long logSaveTimeout; @MonotonicNonNull private LogWatch logWatch; @@ -113,7 +113,7 @@ protected KubernetesPeonLifecycle( TaskLogs taskLogs, ObjectMapper mapper, TaskStateListener stateListener, - long logWatchOperationTimeoutMs + long logSaveTimeout ) { this.task = task; @@ -123,7 +123,7 @@ protected KubernetesPeonLifecycle( this.mapper = mapper; this.stateListener = stateListener; this.taskStartedSuccessfullyFuture = SettableFuture.create(); - this.logWatchOperationTimeoutMs = logWatchOperationTimeoutMs; + this.logSaveTimeout = logSaveTimeout; } /** @@ -353,7 +353,7 @@ protected void startWatchingLogs() } Optional maybeLogWatch = executeWithTimeout( () -> kubernetesClient.getPeonLogWatcher(taskId), - logWatchOperationTimeoutMs, + logSaveTimeout, "initializing K8s LogWatch", "LogWatch failed to initialize. Peon may not be able to stream and persist task logs." + " If this continues to happen, check Kubernetes server logs for potential errors." @@ -383,7 +383,7 @@ protected void saveLogs() FileUtils.copyInputStreamToFile(logWatch.getOutput(), file.toFile()); return null; }, - logWatchOperationTimeoutMs, + logSaveTimeout, "coyping and persisting task logs", "This failure does not have any impact on the" + " ingestion work done by the task, but the logs may be partial or innaccessible. If " diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java index 0ed37a6916bd..6b1827c17ae2 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java @@ -108,7 +108,7 @@ public class KubernetesTaskRunnerConfig @JsonProperty @NotNull // how long to wait for log saving operations to complete - private Period podLogOperationTimeout = new Period("PT300S"); + private Period logSaveTimeout = new Period("PT300S"); @JsonProperty // ForkingTaskRunner inherits the monitors from the MM, in k8s mode @@ -164,7 +164,7 @@ private KubernetesTaskRunnerConfig( Map annotations, Integer capacity, Period taskJoinTimeout, - Period podLogOperationTimeout + Period logSaveTimeout ) { this.namespace = namespace; @@ -238,9 +238,9 @@ private KubernetesTaskRunnerConfig( capacity, this.capacity ); - this.podLogOperationTimeout = ObjectUtils.defaultIfNull( - podLogOperationTimeout, - this.podLogOperationTimeout + this.logSaveTimeout = ObjectUtils.defaultIfNull( + logSaveTimeout, + this.logSaveTimeout ); } @@ -346,9 +346,9 @@ public Integer getCapacity() return capacity; } - public Period getPodLogOperationTimeout() + public Period getLogSaveTimeout() { - return podLogOperationTimeout; + return logSaveTimeout; } public static Builder builder() @@ -378,7 +378,7 @@ public static class Builder private Map annotations; private Integer capacity; private Period taskJoinTimeout; - private Period podLogOperationTimeout; + private Period logSaveTimeout; public Builder() { @@ -505,9 +505,9 @@ public Builder withTaskJoinTimeout(Period taskJoinTimeout) return this; } - public Builder withPodLogOperationTimeout(Period podLogOperationTimeout) + public Builder withLogSaveTimeout(Period logSaveTimeout) { - this.podLogOperationTimeout = podLogOperationTimeout; + this.logSaveTimeout = logSaveTimeout; return this; } @@ -534,7 +534,7 @@ public KubernetesTaskRunnerConfig build() this.annotations, this.capacity, this.taskJoinTimeout, - this.podLogOperationTimeout + this.logSaveTimeout ); } } diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerFactory.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerFactory.java index 3d0e77965f5f..dd8111ed49ed 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerFactory.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerFactory.java @@ -97,7 +97,7 @@ public KubernetesTaskRunner build() peonClient, taskLogs, smileMapper, - kubernetesTaskRunnerConfig.getPodLogOperationTimeout().toStandardDuration().getMillis() + kubernetesTaskRunnerConfig.getLogSaveTimeout().toStandardDuration().getMillis() ), emitter ); diff --git a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java index 5ae1576b4a8f..1e311800a24f 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java +++ b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java @@ -61,8 +61,8 @@ public class KubernetesPeonLifecycleTest extends EasyMockSupport { private static final String ID = "id"; private static final TaskStatus SUCCESS = TaskStatus.success(ID); - private static final Period POD_LOG_OPERATION_TIMEOUT = new Period("PT300S"); - private static final Period POD_LOG_OPERATION_TIMEOUT_SHORT = new Period("PT1S"); + private static final Period PEON_LOG_SAVE_TIMEOUT = new Period("PT300S"); + private static final Period PEON_LOG_SAVE_TIMEOUT_SHORT = new Period("PT1S"); @Mock KubernetesPeonClient kubernetesClient; @Mock TaskLogs taskLogs; @@ -93,7 +93,7 @@ public void test_run() throws IOException taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -140,7 +140,7 @@ public void test_run_useTaskManager() throws IOException taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -186,7 +186,7 @@ public void test_run_whenCalledMultipleTimes_raisesIllegalStateException() throw taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -237,7 +237,7 @@ public void test_run_whenExceptionRaised_setsRunnerTaskStateToNone() taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -283,7 +283,7 @@ public void test_run_whenExceptionRaised_setsStartStatusFutureToFalse() throws E taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -332,7 +332,7 @@ public void test_join_withoutJob_returnsFailedTaskStatus() throws IOException taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); EasyMock.expect(kubernetesClient.waitForPeonJobCompletion( @@ -372,7 +372,7 @@ public void test_join() throws IOException taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); Assert.assertFalse(peonLifecycle.getTaskStartedSuccessfullyFuture().isDone()); @@ -427,7 +427,7 @@ public void test_join_whenCalledMultipleTimes_raisesIllegalStateException() thro taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); Job job = new JobBuilder() @@ -489,7 +489,7 @@ public void test_join_withoutTaskStatus_returnsFailedTaskStatus() throws IOExcep taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); Job job = new JobBuilder() @@ -538,7 +538,7 @@ public void test_join_whenIOExceptionThrownWhileStreamingTaskStatus_returnsFaile taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); Job job = new JobBuilder() @@ -590,7 +590,7 @@ public void test_join_whenIOExceptionThrownWhileStreamingTaskLogs_isIgnored() th taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); Job job = new JobBuilder() @@ -642,7 +642,7 @@ public void test_join_whenRuntimeExceptionThrownWhileWaitingForKubernetesJob_thr taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); EasyMock.expect(kubernetesClient.waitForPeonJobCompletion( @@ -683,7 +683,7 @@ public void test_shutdown_withNotStartedTaskState() taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); peonLifecycle.shutdown(); } @@ -698,7 +698,7 @@ public void test_shutdown_withPendingTaskState() throws NoSuchFieldException, Il taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.PENDING); @@ -721,7 +721,7 @@ public void test_shutdown_withRunningTaskState() throws NoSuchFieldException, Il taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -744,7 +744,7 @@ public void test_shutdown_withStoppedTaskState() throws NoSuchFieldException, Il taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.STOPPED); @@ -761,7 +761,7 @@ public void test_streamLogs_withNotStartedTaskState() throws NoSuchFieldExceptio taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.NOT_STARTED); @@ -778,7 +778,7 @@ public void test_streamLogs_withPendingTaskState() throws NoSuchFieldException, taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.PENDING); @@ -795,7 +795,7 @@ public void test_streamLogs_withRunningTaskState() throws NoSuchFieldException, taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -820,7 +820,7 @@ public void test_streamLogs_withStoppedTaskState() throws NoSuchFieldException, taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.STOPPED); @@ -838,7 +838,7 @@ public void test_getTaskLocation_withNotStartedTaskState_returnsUnknown() taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.NOT_STARTED); @@ -856,7 +856,7 @@ public void test_getTaskLocation_withPendingTaskState_returnsUnknown() taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.PENDING); @@ -874,7 +874,7 @@ public void test_getTaskLocation_withRunningTaskState_withoutPeonPod_returnsUnkn taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -898,7 +898,7 @@ public void test_getTaskLocation_withRunningTaskState_withPeonPodWithoutStatus_r taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -928,7 +928,7 @@ public void test_getTaskLocation_withRunningTaskState_withPeonPodWithStatus_retu taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -966,7 +966,7 @@ public void test_getTaskLocation_saveTaskLocation() taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -1004,7 +1004,7 @@ public void test_getTaskLocation_withRunningTaskState_withPeonPodWithStatusWithT taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -1043,7 +1043,7 @@ public void test_getTaskLocation_withStoppedTaskState_returnsUnknown() taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.STOPPED); EasyMock.expect(kubernetesClient.getPeonPod(k8sTaskId.getK8sJobName())).andReturn(Optional.absent()).once(); @@ -1063,7 +1063,7 @@ public void test_startWatchingLogs_logWatchInitialize_timeout() taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT_SHORT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT_SHORT.toStandardDuration().getMillis() ); EasyMock.expect(kubernetesClient.getPeonLogWatcher(k8sTaskId)) .andAnswer(() -> { @@ -1106,7 +1106,7 @@ public int read() throws IOException taskLogs, mapper, stateListener, - POD_LOG_OPERATION_TIMEOUT_SHORT.toStandardDuration().getMillis() + PEON_LOG_SAVE_TIMEOUT_SHORT.toStandardDuration().getMillis() ); EasyMock.expect(kubernetesClient.getPeonLogWatcher(k8sTaskId)).andReturn(Optional.of(logWatch)).once(); EasyMock.expect(logWatch.getOutput()).andReturn(slowInputStream); From 52605abc1f7a3ae66dcbee7868ae71999308c5e8 Mon Sep 17 00:00:00 2001 From: capistrant Date: Fri, 3 Oct 2025 13:33:22 -0500 Subject: [PATCH 09/17] fix bad import --- .../org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java index 95e3d4ebc490..51550a14d292 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java @@ -42,8 +42,8 @@ import org.apache.druid.k8s.overlord.common.KubernetesPeonClient; import org.apache.druid.tasklogs.TaskLogs; import org.checkerframework.checker.nullness.qual.MonotonicNonNull; -import org.jetbrains.annotations.Nullable; +import javax.annotation.Nullable; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; From 30f8c5c066d84fd8abf5fa3593e51d71f8c5c215 Mon Sep 17 00:00:00 2001 From: capistrant Date: Mon, 6 Oct 2025 14:05:06 -0500 Subject: [PATCH 10/17] Remove setting thread as interrupted in k8speonlifcecyle log watch code --- .../org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java | 1 - 1 file changed, 1 deletion(-) diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java index 51550a14d292..9619b32664ac 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java @@ -469,7 +469,6 @@ protected ListenableFuture getTaskStartedSuccessfullyFuture() log.warn("Operation [%s] timed out after %d ms for task [%s]. %s", operationName, timeoutMillis, taskId.getOriginalTaskId(), errorMessage); } catch (InterruptedException e) { - Thread.currentThread().interrupt(); log.warn("Operation [%s] was interrupted for task [%s]. %s", operationName, taskId.getOriginalTaskId(), errorMessage); } catch (Exception e) { From c2d05cecec7bb4b239235689b5a3e4db3388c126 Mon Sep 17 00:00:00 2001 From: capistrant Date: Mon, 6 Oct 2025 14:11:30 -0500 Subject: [PATCH 11/17] revert some non-functional changes to clean up diff --- .../overlord/KubernetesTaskRunnerConfig.java | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java index 6b1827c17ae2..d810c9ee23a2 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesTaskRunnerConfig.java @@ -157,14 +157,14 @@ private KubernetesTaskRunnerConfig( Period taskCleanupDelay, Period taskCleanupInterval, Period k8sjobLaunchTimeout, + Period logSaveTimeout, List peonMonitors, List javaOptsArray, int cpuCoreInMicro, Map labels, Map annotations, Integer capacity, - Period taskJoinTimeout, - Period logSaveTimeout + Period taskJoinTimeout ) { this.namespace = namespace; @@ -214,6 +214,10 @@ private KubernetesTaskRunnerConfig( taskJoinTimeout, this.taskJoinTimeout ); + this.logSaveTimeout = ObjectUtils.defaultIfNull( + logSaveTimeout, + this.logSaveTimeout + ); this.peonMonitors = ObjectUtils.defaultIfNull( peonMonitors, this.peonMonitors @@ -238,10 +242,6 @@ private KubernetesTaskRunnerConfig( capacity, this.capacity ); - this.logSaveTimeout = ObjectUtils.defaultIfNull( - logSaveTimeout, - this.logSaveTimeout - ); } public String getNamespace() @@ -316,6 +316,11 @@ public Period getTaskLaunchTimeout() return k8sjobLaunchTimeout; } + public Period getLogSaveTimeout() + { + return logSaveTimeout; + } + public List getPeonMonitors() { return peonMonitors; @@ -346,11 +351,6 @@ public Integer getCapacity() return capacity; } - public Period getLogSaveTimeout() - { - return logSaveTimeout; - } - public static Builder builder() { return new Builder(); @@ -527,14 +527,14 @@ public KubernetesTaskRunnerConfig build() this.taskCleanupDelay, this.taskCleanupInterval, this.k8sjobLaunchTimeout, + this.logSaveTimeout, this.peonMonitors, this.javaOptsArray, this.cpuCoreInMicro, this.labels, this.annotations, this.capacity, - this.taskJoinTimeout, - this.logSaveTimeout + this.taskJoinTimeout ); } } From 98f124748066fff588101adf379fee840c0ac002 Mon Sep 17 00:00:00 2001 From: capistrant Date: Mon, 6 Oct 2025 14:16:12 -0500 Subject: [PATCH 12/17] revert change that removed unit info from logSaveTimeoutMs variable. Adds back explicit context --- .../druid/k8s/overlord/KubernetesPeonLifecycle.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java index 9619b32664ac..71429110e832 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java @@ -99,7 +99,7 @@ protected enum State private final ObjectMapper mapper; private final TaskStateListener stateListener; private final SettableFuture taskStartedSuccessfullyFuture; - private final long logSaveTimeout; + private final long logSaveTimeoutMs; @MonotonicNonNull private LogWatch logWatch; @@ -113,7 +113,7 @@ protected KubernetesPeonLifecycle( TaskLogs taskLogs, ObjectMapper mapper, TaskStateListener stateListener, - long logSaveTimeout + long logSaveTimeoutMs ) { this.task = task; @@ -123,7 +123,7 @@ protected KubernetesPeonLifecycle( this.mapper = mapper; this.stateListener = stateListener; this.taskStartedSuccessfullyFuture = SettableFuture.create(); - this.logSaveTimeout = logSaveTimeout; + this.logSaveTimeoutMs = logSaveTimeoutMs; } /** @@ -353,7 +353,7 @@ protected void startWatchingLogs() } Optional maybeLogWatch = executeWithTimeout( () -> kubernetesClient.getPeonLogWatcher(taskId), - logSaveTimeout, + logSaveTimeoutMs, "initializing K8s LogWatch", "LogWatch failed to initialize. Peon may not be able to stream and persist task logs." + " If this continues to happen, check Kubernetes server logs for potential errors." @@ -383,7 +383,7 @@ protected void saveLogs() FileUtils.copyInputStreamToFile(logWatch.getOutput(), file.toFile()); return null; }, - logSaveTimeout, + logSaveTimeoutMs, "coyping and persisting task logs", "This failure does not have any impact on the" + " ingestion work done by the task, but the logs may be partial or innaccessible. If " From da51d5f300e75b909d1fc98621025f32173cbf23 Mon Sep 17 00:00:00 2001 From: Lucas Capistrant Date: Mon, 6 Oct 2025 14:17:58 -0500 Subject: [PATCH 13/17] Apply log message and doc cleanup suggestions from code review Co-authored-by: Kashif Faraz --- docs/development/extensions-core/k8s-jobs.md | 2 +- .../druid/k8s/overlord/KubernetesPeonLifecycle.java | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/development/extensions-core/k8s-jobs.md b/docs/development/extensions-core/k8s-jobs.md index cb71fe95edef..00b6024bb873 100644 --- a/docs/development/extensions-core/k8s-jobs.md +++ b/docs/development/extensions-core/k8s-jobs.md @@ -792,7 +792,7 @@ Should you require the needed permissions for interacting across Kubernetes name | `druid.indexer.runner.graceTerminationPeriodSeconds` | `Long` | Number of seconds you want to wait after a sigterm for container lifecycle hooks to complete. Keep at a smaller value if you want tasks to hold locks for shorter periods. | `PT30S` (K8s default) | No | | `druid.indexer.runner.capacity` | `Integer` | Number of concurrent jobs that can be sent to Kubernetes. | `2147483647` | No | | `druid.indexer.runner.cpuCoreInMicro` | `Integer` | Number of CPU micro core for the task. | `1000` | No | -| `druid.indexer.runner.logSaveTimeout` | `Duration` | The peon executing the ingestion task makes a best effort to persist the pod logs from `k8s` to persistent task log storage. A timeout for this log persist is set to avoid `k8s` connection issues causing the peon to hang indefinitely if issues occur during the persisting of said logs. | `PT300S` | NO | +| `druid.indexer.runner.logSaveTimeout` | `Duration` | The peon executing the ingestion task makes a best effort to persist the pod logs from `k8s` to persistent task log storage. The timeout ensures that `k8s` connection issues do not cause the pod to hang indefinitely thereby blocking Overlord operations. If the timeout occurs before the logs are saved, those logs will not be available in Druid. | `PT300S` | NO | ### Metrics added diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java index 71429110e832..e74b3d4c33e9 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java @@ -457,7 +457,7 @@ protected ListenableFuture getTaskStartedSuccessfullyFuture() * logged and null will be returned. *

*/ - private @Nullable T executeWithTimeout(Callable runnable, long timeoutMillis, String operationName, String errorMessage) + private @Nullable T executeWithTimeout(Callable callable, long timeoutMillis, String operationName, String errorMessage) { ExecutorService executor = Executors.newSingleThreadExecutor( Execs.makeThreadFactory("k8s-peon-lifecycle-util-" + taskId.getOriginalTaskId() + "-%d")); @@ -466,13 +466,13 @@ protected ListenableFuture getTaskStartedSuccessfullyFuture() return future.get(timeoutMillis, TimeUnit.MILLISECONDS); } catch (TimeoutException e) { - log.warn("Operation [%s] timed out after %d ms for task [%s]. %s", operationName, timeoutMillis, taskId.getOriginalTaskId(), errorMessage); + log.warn("Operation[%s] for task[%s] timed out after [%d] ms with error[%s].", operationName, taskId.getOriginalTaskId(), timeoutMillis, errorMessage); } catch (InterruptedException e) { - log.warn("Operation [%s] was interrupted for task [%s]. %s", operationName, taskId.getOriginalTaskId(), errorMessage); + log.warn("Operation[%s] for task[%s] was interrupted with error[%s].", operationName, taskId.getOriginalTaskId(), errorMessage); } catch (Exception e) { - log.error(e, "Error during operation [%s] for task: %s. %s", operationName, taskId, errorMessage); + log.error(e, "Error during operation[%s] for task[%s]: %s", operationName, taskId, errorMessage); } finally { executor.shutdownNow(); From cd71a0db1930a6ca8081df05d7b9910550fbb8bc Mon Sep 17 00:00:00 2001 From: capistrant Date: Mon, 6 Oct 2025 14:22:49 -0500 Subject: [PATCH 14/17] Fix compilation error caused by me accepting a refactor request suggestion without realizing it wouldn't actually refactor the code magically --- .../org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java index e74b3d4c33e9..d15fa3963237 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycle.java @@ -462,7 +462,7 @@ protected ListenableFuture getTaskStartedSuccessfullyFuture() ExecutorService executor = Executors.newSingleThreadExecutor( Execs.makeThreadFactory("k8s-peon-lifecycle-util-" + taskId.getOriginalTaskId() + "-%d")); try { - Future future = executor.submit(runnable); + Future future = executor.submit(callable); return future.get(timeoutMillis, TimeUnit.MILLISECONDS); } catch (TimeoutException e) { From 80412da68cc97ca4e52472a51649632a256be72f Mon Sep 17 00:00:00 2001 From: capistrant Date: Mon, 6 Oct 2025 18:35:16 -0500 Subject: [PATCH 15/17] revert unnecessary diff in a test file --- .../k8s/overlord/KubernetesWorkItemTest.java | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java index 5bebb672f508..5e87db8c3254 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java +++ b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java @@ -37,10 +37,11 @@ @RunWith(EasyMockRunner.class) public class KubernetesWorkItemTest extends EasyMockSupport { + private static final Period LOG_SAVE_TIMEOUT = new Period("PT30S"); + private KubernetesWorkItem workItem; private Task task; private K8sTaskId taskId; - private static final Period POD_LOG_OPERATION_TIMEOUT = new Period("PT30S"); @Mock KubernetesPeonLifecycle kubernetesPeonLifecycle; @@ -133,7 +134,7 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_returnsPending() null, null, null, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); workItem = new KubernetesWorkItem(task, null, peonLifecycle); @@ -150,7 +151,7 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_inPendingState_r null, null, null, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ) { @Override protected State getState() @@ -174,7 +175,7 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_inRunningState_r null, null, null, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ) { @Override protected State getState() @@ -198,7 +199,7 @@ public void test_getRunnerTaskState_withKubernetesPeonLifecycle_inStoppedState_r null, null, null, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ) { @Override protected State getState() @@ -222,7 +223,7 @@ public void test_streamTaskLogs_withKubernetesPeonLifecycle() null, null, null, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); workItem = new KubernetesWorkItem(task, null, peonLifecycle); Assert.assertFalse(workItem.streamTaskLogs().isPresent()); @@ -238,7 +239,7 @@ public void test_getLocation_withKubernetesPeonLifecycle() null, null, null, - POD_LOG_OPERATION_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); workItem = new KubernetesWorkItem(task, null, peonLifecycle); From 291b7f66fecd33b52bf2bcb70221ef6d8ef5cc00 Mon Sep 17 00:00:00 2001 From: capistrant Date: Mon, 6 Oct 2025 18:36:19 -0500 Subject: [PATCH 16/17] remove unnecesarry diff changes in another test file --- .../overlord/KubernetesPeonLifecycleTest.java | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java index 1e311800a24f..8dbf695ebc27 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java +++ b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java @@ -61,8 +61,8 @@ public class KubernetesPeonLifecycleTest extends EasyMockSupport { private static final String ID = "id"; private static final TaskStatus SUCCESS = TaskStatus.success(ID); - private static final Period PEON_LOG_SAVE_TIMEOUT = new Period("PT300S"); - private static final Period PEON_LOG_SAVE_TIMEOUT_SHORT = new Period("PT1S"); + private static final Period LOG_SAVE_TIMEOUT = new Period("PT300S"); + private static final Period LOG_SAVE_TIMEOUT_SHORT = new Period("PT1S"); @Mock KubernetesPeonClient kubernetesClient; @Mock TaskLogs taskLogs; @@ -93,7 +93,7 @@ public void test_run() throws IOException taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -140,7 +140,7 @@ public void test_run_useTaskManager() throws IOException taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -186,7 +186,7 @@ public void test_run_whenCalledMultipleTimes_raisesIllegalStateException() throw taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -237,7 +237,7 @@ public void test_run_whenExceptionRaised_setsRunnerTaskStateToNone() taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -283,7 +283,7 @@ public void test_run_whenExceptionRaised_setsStartStatusFutureToFalse() throws E taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ) { @Override @@ -332,7 +332,7 @@ public void test_join_withoutJob_returnsFailedTaskStatus() throws IOException taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); EasyMock.expect(kubernetesClient.waitForPeonJobCompletion( @@ -372,7 +372,7 @@ public void test_join() throws IOException taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); Assert.assertFalse(peonLifecycle.getTaskStartedSuccessfullyFuture().isDone()); @@ -427,7 +427,7 @@ public void test_join_whenCalledMultipleTimes_raisesIllegalStateException() thro taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); Job job = new JobBuilder() @@ -489,7 +489,7 @@ public void test_join_withoutTaskStatus_returnsFailedTaskStatus() throws IOExcep taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); Job job = new JobBuilder() @@ -538,7 +538,7 @@ public void test_join_whenIOExceptionThrownWhileStreamingTaskStatus_returnsFaile taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); Job job = new JobBuilder() @@ -590,7 +590,7 @@ public void test_join_whenIOExceptionThrownWhileStreamingTaskLogs_isIgnored() th taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); Job job = new JobBuilder() @@ -642,7 +642,7 @@ public void test_join_whenRuntimeExceptionThrownWhileWaitingForKubernetesJob_thr taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); EasyMock.expect(kubernetesClient.waitForPeonJobCompletion( @@ -683,7 +683,7 @@ public void test_shutdown_withNotStartedTaskState() taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); peonLifecycle.shutdown(); } @@ -698,7 +698,7 @@ public void test_shutdown_withPendingTaskState() throws NoSuchFieldException, Il taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.PENDING); @@ -721,7 +721,7 @@ public void test_shutdown_withRunningTaskState() throws NoSuchFieldException, Il taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -744,7 +744,7 @@ public void test_shutdown_withStoppedTaskState() throws NoSuchFieldException, Il taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.STOPPED); @@ -761,7 +761,7 @@ public void test_streamLogs_withNotStartedTaskState() throws NoSuchFieldExceptio taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.NOT_STARTED); @@ -778,7 +778,7 @@ public void test_streamLogs_withPendingTaskState() throws NoSuchFieldException, taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.PENDING); @@ -795,7 +795,7 @@ public void test_streamLogs_withRunningTaskState() throws NoSuchFieldException, taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -820,7 +820,7 @@ public void test_streamLogs_withStoppedTaskState() throws NoSuchFieldException, taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.STOPPED); @@ -838,7 +838,7 @@ public void test_getTaskLocation_withNotStartedTaskState_returnsUnknown() taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.NOT_STARTED); @@ -856,7 +856,7 @@ public void test_getTaskLocation_withPendingTaskState_returnsUnknown() taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.PENDING); @@ -874,7 +874,7 @@ public void test_getTaskLocation_withRunningTaskState_withoutPeonPod_returnsUnkn taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -898,7 +898,7 @@ public void test_getTaskLocation_withRunningTaskState_withPeonPodWithoutStatus_r taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -928,7 +928,7 @@ public void test_getTaskLocation_withRunningTaskState_withPeonPodWithStatus_retu taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -966,7 +966,7 @@ public void test_getTaskLocation_saveTaskLocation() taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -1004,7 +1004,7 @@ public void test_getTaskLocation_withRunningTaskState_withPeonPodWithStatusWithT taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.RUNNING); @@ -1043,7 +1043,7 @@ public void test_getTaskLocation_withStoppedTaskState_returnsUnknown() taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); setPeonLifecycleState(peonLifecycle, KubernetesPeonLifecycle.State.STOPPED); EasyMock.expect(kubernetesClient.getPeonPod(k8sTaskId.getK8sJobName())).andReturn(Optional.absent()).once(); @@ -1063,7 +1063,7 @@ public void test_startWatchingLogs_logWatchInitialize_timeout() taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT_SHORT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT_SHORT.toStandardDuration().getMillis() ); EasyMock.expect(kubernetesClient.getPeonLogWatcher(k8sTaskId)) .andAnswer(() -> { @@ -1106,7 +1106,7 @@ public int read() throws IOException taskLogs, mapper, stateListener, - PEON_LOG_SAVE_TIMEOUT_SHORT.toStandardDuration().getMillis() + LOG_SAVE_TIMEOUT_SHORT.toStandardDuration().getMillis() ); EasyMock.expect(kubernetesClient.getPeonLogWatcher(k8sTaskId)).andReturn(Optional.of(logWatch)).once(); EasyMock.expect(logWatch.getOutput()).andReturn(slowInputStream); From 94ff77478bf3bdbbf085131d63d61c8b76e8621f Mon Sep 17 00:00:00 2001 From: capistrant Date: Mon, 6 Oct 2025 18:39:35 -0500 Subject: [PATCH 17/17] further diff cleanup --- .../k8s/overlord/KubernetesPeonLifecycleFactory.java | 8 ++++---- .../druid/k8s/overlord/KubernetesPeonLifecycleTest.java | 6 +++--- .../apache/druid/k8s/overlord/KubernetesWorkItemTest.java | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleFactory.java b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleFactory.java index 10ab6ab9ae22..8bd7db2ebf3e 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleFactory.java +++ b/extensions-core/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleFactory.java @@ -30,19 +30,19 @@ public class KubernetesPeonLifecycleFactory implements PeonLifecycleFactory private final KubernetesPeonClient client; private final TaskLogs taskLogs; private final ObjectMapper mapper; - private final long podLogOperationTimeout; + private final long logSaveTimeoutMs; public KubernetesPeonLifecycleFactory( KubernetesPeonClient client, TaskLogs taskLogs, ObjectMapper mapper, - long podLogOperationTimeout + long logSaveTimeoutMs ) { this.client = client; this.taskLogs = taskLogs; this.mapper = mapper; - this.podLogOperationTimeout = podLogOperationTimeout; + this.logSaveTimeoutMs = logSaveTimeoutMs; } @Override @@ -55,7 +55,7 @@ public KubernetesPeonLifecycle build(Task task, K8sTaskId k8sTaskId, KubernetesP taskLogs, mapper, stateListener, - podLogOperationTimeout + logSaveTimeoutMs ); } } diff --git a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java index 8dbf695ebc27..df6f81532f7d 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java +++ b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesPeonLifecycleTest.java @@ -62,7 +62,7 @@ public class KubernetesPeonLifecycleTest extends EasyMockSupport private static final String ID = "id"; private static final TaskStatus SUCCESS = TaskStatus.success(ID); private static final Period LOG_SAVE_TIMEOUT = new Period("PT300S"); - private static final Period LOG_SAVE_TIMEOUT_SHORT = new Period("PT1S"); + private static final Period SHORT_LOG_SAVE_TIMEOUT = new Period("PT1S"); @Mock KubernetesPeonClient kubernetesClient; @Mock TaskLogs taskLogs; @@ -1063,7 +1063,7 @@ public void test_startWatchingLogs_logWatchInitialize_timeout() taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT_SHORT.toStandardDuration().getMillis() + SHORT_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); EasyMock.expect(kubernetesClient.getPeonLogWatcher(k8sTaskId)) .andAnswer(() -> { @@ -1106,7 +1106,7 @@ public int read() throws IOException taskLogs, mapper, stateListener, - LOG_SAVE_TIMEOUT_SHORT.toStandardDuration().getMillis() + SHORT_LOG_SAVE_TIMEOUT.toStandardDuration().getMillis() ); EasyMock.expect(kubernetesClient.getPeonLogWatcher(k8sTaskId)).andReturn(Optional.of(logWatch)).once(); EasyMock.expect(logWatch.getOutput()).andReturn(slowInputStream); diff --git a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java index 5e87db8c3254..23c4b444811a 100644 --- a/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java +++ b/extensions-core/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/KubernetesWorkItemTest.java @@ -37,7 +37,7 @@ @RunWith(EasyMockRunner.class) public class KubernetesWorkItemTest extends EasyMockSupport { - private static final Period LOG_SAVE_TIMEOUT = new Period("PT30S"); + private static final Period LOG_SAVE_TIMEOUT = new Period("PT300S"); private KubernetesWorkItem workItem; private Task task;