From c8a598b08a12af05c3e0bf1453cebe8ba7712375 Mon Sep 17 00:00:00 2001 From: laihui <1353307710@qq.com> Date: Thu, 21 Mar 2024 11:52:40 +0800 Subject: [PATCH 1/3] self-adaption backoff timeout --- .../doris/common/InternalErrorCode.java | 4 ++- .../load/routineload/RoutineLoadJob.java | 13 ++++++++ .../load/routineload/RoutineLoadTaskInfo.java | 30 +++++++++++++++++++ 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/InternalErrorCode.java b/fe/fe-core/src/main/java/org/apache/doris/common/InternalErrorCode.java index 2bbd5c58efa02b..214f74a38f475e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/InternalErrorCode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/InternalErrorCode.java @@ -34,7 +34,9 @@ public enum InternalErrorCode { MANUAL_STOP_ERR(101), TOO_MANY_FAILURE_ROWS_ERR(102), CREATE_TASKS_ERR(103), - TASKS_ABORT_ERR(104); + TASKS_ABORT_ERR(104), + CANNOT_RESUME_ERR(105), + TIMEOUT_TOO_MUCH(106); private long errCode; diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadJob.java b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadJob.java index 0d9ae516351597..7dbffb27f23b5b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadJob.java @@ -709,6 +709,18 @@ public void processTimeoutTasks() { // and after renew, the previous task is removed from routineLoadTaskInfoList, // so task can no longer be committed successfully. // the already committed task will not be handled here. 
+ int timeoutBackOffCount = routineLoadTaskInfo.getTimeoutBackOffCount(); + if (timeoutBackOffCount > RoutineLoadTaskInfo.MAX_TIMEOUT_BACK_OFF_COUNT) { + try { + updateState(JobState.PAUSED, new ErrorReason(InternalErrorCode.TIMEOUT_TOO_MUCH, + "task " + routineLoadTaskInfo.getId() + " timeout too much"), false); + } catch (UserException e) { + LOG.warn("update job state to pause failed", e); + } + return; + } + routineLoadTaskInfo.setTimeoutBackOffCount(timeoutBackOffCount + 1); + routineLoadTaskInfo.setTimeoutMs((routineLoadTaskInfo.getTimeoutMs() << 1)); RoutineLoadTaskInfo newTask = unprotectRenewTask(routineLoadTaskInfo); Env.getCurrentEnv().getRoutineLoadTaskScheduler().addTaskInQueue(newTask); } @@ -1212,6 +1224,7 @@ private void executeTaskOnTxnStatusChanged(RoutineLoadTaskInfo routineLoadTaskIn } else if (checkCommitInfo(rlTaskTxnCommitAttachment, txnState, txnStatusChangeReason)) { // step2: update job progress updateProgress(rlTaskTxnCommitAttachment); + routineLoadTaskInfo.selfAdaptTimeout(rlTaskTxnCommitAttachment); } if (rlTaskTxnCommitAttachment != null && !Strings.isNullOrEmpty(rlTaskTxnCommitAttachment.getErrorLogUrl())) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java index 7a5312b2c8f760..500f1e27258713 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java @@ -73,6 +73,9 @@ public abstract class RoutineLoadTaskInfo { protected boolean isMultiTable = false; + protected static final int MAX_TIMEOUT_BACK_OFF_COUNT = 3; + protected int timeoutBackOffCount = 0; + // this status will be set when corresponding transaction's status is changed. // so that user or other logic can know the status of the corresponding txn. 
protected TransactionStatus txnStatus = TransactionStatus.UNKNOWN; @@ -136,6 +139,10 @@ public void setLastScheduledTime(long lastScheduledTime) { this.lastScheduledTime = lastScheduledTime; } + public void setTimeoutMs(long timeoutMs) { + this.timeoutMs = timeoutMs; + } + public long getTimeoutMs() { return timeoutMs; } @@ -148,6 +155,14 @@ public TransactionStatus getTxnStatus() { return txnStatus; } + public void setTimeoutBackOffCount(int timeoutBackOffCount) { + this.timeoutBackOffCount = timeoutBackOffCount; + } + + public int getTimeoutBackOffCount() { + return timeoutBackOffCount; + } + public boolean isTimeout() { if (txnStatus == TransactionStatus.COMMITTED || txnStatus == TransactionStatus.VISIBLE) { // the corresponding txn is already finished, this task can not be treated as timeout. @@ -162,6 +177,21 @@ public boolean isTimeout() { return false; } + public void selfAdaptTimeout(RLTaskTxnCommitAttachment rlTaskTxnCommitAttachment) { + long taskExecutionTime = rlTaskTxnCommitAttachment.getTaskExecutionTimeMs(); + long timeoutMs = this.timeoutMs; + + while (this.timeoutBackOffCount > 0) { + timeoutMs = timeoutMs >> 1; + if (timeoutMs <= taskExecutionTime) { + this.timeoutMs = timeoutMs << 1; + return; + } + this.timeoutBackOffCount--; + } + this.timeoutMs = timeoutMs; + } + abstract TRoutineLoadTask createRoutineLoadTask() throws UserException; // begin the txn of this task From 6c8139da008e42bb055e52b8f2cad0f2778b4ca2 Mon Sep 17 00:00:00 2001 From: laihui <1353307710@qq.com> Date: Thu, 21 Mar 2024 20:12:01 +0800 Subject: [PATCH 2/3] fix timeout backoff can not work --- .../load/routineload/KafkaRoutineLoadJob.java | 2 +- .../doris/load/routineload/KafkaTaskInfo.java | 18 +++++++++++++++--- .../load/routineload/RoutineLoadTaskInfo.java | 11 +++++++---- .../routineload/KafkaRoutineLoadJobTest.java | 2 +- .../RoutineLoadTaskSchedulerTest.java | 2 +- .../transaction/GlobalTransactionMgrTest.java | 4 ++-- 6 files changed, 27 insertions(+), 12 deletions(-) 
diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaRoutineLoadJob.java b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaRoutineLoadJob.java index 6e6dba068e995a..bc1f1428a919b5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaRoutineLoadJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaRoutineLoadJob.java @@ -227,7 +227,7 @@ public void divideRoutineLoadJob(int currentConcurrentTaskNum) throws UserExcept ((KafkaProgress) progress).getOffsetByPartition(kafkaPartition)); } KafkaTaskInfo kafkaTaskInfo = new KafkaTaskInfo(UUID.randomUUID(), id, clusterName, - maxBatchIntervalS * 2 * 1000, taskKafkaProgress, isMultiTable()); + maxBatchIntervalS * 2 * 1000, 0, taskKafkaProgress, isMultiTable()); routineLoadTaskInfoList.add(kafkaTaskInfo); result.add(kafkaTaskInfo); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaTaskInfo.java b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaTaskInfo.java index d6f0a28705732b..20df8e8439934e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaTaskInfo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaTaskInfo.java @@ -47,14 +47,16 @@ public class KafkaTaskInfo extends RoutineLoadTaskInfo { private Map partitionIdToOffset; public KafkaTaskInfo(UUID id, long jobId, String clusterName, - long timeoutMs, Map partitionIdToOffset, boolean isMultiTable) { - super(id, jobId, clusterName, timeoutMs, isMultiTable); + long timeoutMs, int timeoutBackOffCount, + Map partitionIdToOffset, boolean isMultiTable) { + super(id, jobId, timeoutMs, timeoutBackOffCount, isMultiTable); this.partitionIdToOffset = partitionIdToOffset; } public KafkaTaskInfo(KafkaTaskInfo kafkaTaskInfo, Map partitionIdToOffset, boolean isMultiTable) { super(UUID.randomUUID(), kafkaTaskInfo.getJobId(), kafkaTaskInfo.getClusterName(), - kafkaTaskInfo.getTimeoutMs(), 
kafkaTaskInfo.getBeId(), isMultiTable); + kafkaTaskInfo.getTimeoutMs(), kafkaTaskInfo.getTimeoutBackOffCount(), + kafkaTaskInfo.getBeId(), isMultiTable); this.partitionIdToOffset = partitionIdToOffset; } @@ -129,6 +131,11 @@ private TExecPlanFragmentParams rePlan(RoutineLoadJob routineLoadJob) throws Use TExecPlanFragmentParams tExecPlanFragmentParams = routineLoadJob.plan(loadId, txnId); TPlanFragment tPlanFragment = tExecPlanFragmentParams.getFragment(); tPlanFragment.getOutputSink().getOlapTableSink().setTxnId(txnId); + // the timeout must be updated here so that the task timeout backoff takes effect + long timeoutS = this.getTimeoutMs() / 1000; + tPlanFragment.getOutputSink().getOlapTableSink().setLoadChannelTimeoutS(timeoutS); + tExecPlanFragmentParams.getQueryOptions().setQueryTimeout((int) timeoutS); + tExecPlanFragmentParams.getQueryOptions().setExecutionTimeout((int) timeoutS); return tExecPlanFragmentParams; } @@ -138,6 +145,11 @@ private TPipelineFragmentParams rePlanForPipeline(RoutineLoadJob routineLoadJob) TPipelineFragmentParams tExecPlanFragmentParams = routineLoadJob.planForPipeline(loadId, txnId); TPlanFragment tPlanFragment = tExecPlanFragmentParams.getFragment(); tPlanFragment.getOutputSink().getOlapTableSink().setTxnId(txnId); + // the timeout must be updated here so that the task timeout backoff takes effect + long timeoutS = this.getTimeoutMs() / 1000; + tPlanFragment.getOutputSink().getOlapTableSink().setLoadChannelTimeoutS(timeoutS); + tExecPlanFragmentParams.getQueryOptions().setQueryTimeout((int) timeoutS); + tExecPlanFragmentParams.getQueryOptions().setExecutionTimeout((int) timeoutS); return tExecPlanFragmentParams; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java index 500f1e27258713..10d57e66d67334 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java @@ -80,18 +80,21 @@ public abstract class RoutineLoadTaskInfo { // so that user or other logic can know the status of the corresponding txn. protected TransactionStatus txnStatus = TransactionStatus.UNKNOWN; - public RoutineLoadTaskInfo(UUID id, long jobId, String clusterName, long timeoutMs, boolean isMultiTable) { + + public RoutineLoadTaskInfo(UUID id, long jobId, String clusterName, long timeoutMs, + int timeoutBackOffCount, boolean isMultiTable) { this.id = id; this.jobId = jobId; this.clusterName = clusterName; this.createTimeMs = System.currentTimeMillis(); this.timeoutMs = timeoutMs; + this.timeoutBackOffCount = timeoutBackOffCount; this.isMultiTable = isMultiTable; } - public RoutineLoadTaskInfo(UUID id, long jobId, String clusterName, long timeoutMs, long previousBeId, - boolean isMultiTable) { - this(id, jobId, clusterName, timeoutMs, isMultiTable); + public RoutineLoadTaskInfo(UUID id, long jobId, String clusterName, long timeoutMs, + int timeoutBackOffCount, long previousBeId, boolean isMultiTable) { + this(id, jobId, clusterName, timeoutMs, timeoutBackOffCount, isMultiTable); this.previousBeId = previousBeId; } diff --git a/fe/fe-core/src/test/java/org/apache/doris/load/routineload/KafkaRoutineLoadJobTest.java b/fe/fe-core/src/test/java/org/apache/doris/load/routineload/KafkaRoutineLoadJobTest.java index 57ded401bd9bfd..73213fc7ffca8b 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/load/routineload/KafkaRoutineLoadJobTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/load/routineload/KafkaRoutineLoadJobTest.java @@ -224,7 +224,7 @@ public void testProcessTimeOutTasks(@Injectable GlobalTransactionMgr globalTrans Map partitionIdsToOffset = Maps.newHashMap(); partitionIdsToOffset.put(100, 0L); KafkaTaskInfo kafkaTaskInfo = new KafkaTaskInfo(new UUID(1, 1), 1L, "default_cluster", - maxBatchIntervalS * 2 * 1000, partitionIdsToOffset, false); + 
maxBatchIntervalS * 2 * 1000, 0, partitionIdsToOffset, false); kafkaTaskInfo.setExecuteStartTimeMs(System.currentTimeMillis() - maxBatchIntervalS * 2 * 1000 - 1); routineLoadTaskInfoList.add(kafkaTaskInfo); diff --git a/fe/fe-core/src/test/java/org/apache/doris/load/routineload/RoutineLoadTaskSchedulerTest.java b/fe/fe-core/src/test/java/org/apache/doris/load/routineload/RoutineLoadTaskSchedulerTest.java index e0fdd92f73757b..02db47538fb699 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/load/routineload/RoutineLoadTaskSchedulerTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/load/routineload/RoutineLoadTaskSchedulerTest.java @@ -69,7 +69,7 @@ public void testRunOneCycle(@Injectable KafkaRoutineLoadJob kafkaRoutineLoadJob1 Deencapsulation.setField(kafkaProgress, "partitionIdToOffset", partitionIdToOffset); Queue routineLoadTaskInfoQueue = Queues.newLinkedBlockingQueue(); - KafkaTaskInfo routineLoadTaskInfo1 = new KafkaTaskInfo(new UUID(1, 1), 1L, "default_cluster", 20000, + KafkaTaskInfo routineLoadTaskInfo1 = new KafkaTaskInfo(new UUID(1, 1), 1L, "default_cluster", 20000, 0, partitionIdToOffset, false); routineLoadTaskInfoQueue.add(routineLoadTaskInfo1); diff --git a/fe/fe-core/src/test/java/org/apache/doris/transaction/GlobalTransactionMgrTest.java b/fe/fe-core/src/test/java/org/apache/doris/transaction/GlobalTransactionMgrTest.java index a819c4f030178f..414f5cf03c4db5 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/transaction/GlobalTransactionMgrTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/transaction/GlobalTransactionMgrTest.java @@ -318,7 +318,7 @@ public void testCommitRoutineLoadTransaction(@Injectable TabletCommitInfo tablet List routineLoadTaskInfoList = Deencapsulation.getField(routineLoadJob, "routineLoadTaskInfoList"); Map partitionIdToOffset = Maps.newHashMap(); partitionIdToOffset.put(1, 0L); - KafkaTaskInfo routineLoadTaskInfo = new KafkaTaskInfo(UUID.randomUUID(), 1L, "default_cluster", 20000, + KafkaTaskInfo 
routineLoadTaskInfo = new KafkaTaskInfo(UUID.randomUUID(), 1L, "default_cluster", 20000, 0, partitionIdToOffset, false); Deencapsulation.setField(routineLoadTaskInfo, "txnId", 1L); routineLoadTaskInfoList.add(routineLoadTaskInfo); @@ -390,7 +390,7 @@ public void testCommitRoutineLoadTransactionWithErrorMax(@Injectable TabletCommi List routineLoadTaskInfoList = Deencapsulation.getField(routineLoadJob, "routineLoadTaskInfoList"); Map partitionIdToOffset = Maps.newHashMap(); partitionIdToOffset.put(1, 0L); - KafkaTaskInfo routineLoadTaskInfo = new KafkaTaskInfo(UUID.randomUUID(), 1L, "defualt_cluster", 20000, + KafkaTaskInfo routineLoadTaskInfo = new KafkaTaskInfo(UUID.randomUUID(), 1L, "defualt_cluster", 20000, 0, partitionIdToOffset, false); Deencapsulation.setField(routineLoadTaskInfo, "txnId", 1L); routineLoadTaskInfoList.add(routineLoadTaskInfo); From 75b189d68ff345643b03f86eae9d6fb74f3f6906 Mon Sep 17 00:00:00 2001 From: laihui <1353307710@qq.com> Date: Thu, 21 Mar 2024 20:23:22 +0800 Subject: [PATCH 3/3] update --- .../java/org/apache/doris/load/routineload/KafkaTaskInfo.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaTaskInfo.java b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaTaskInfo.java index 20df8e8439934e..de1cf5096d2a7e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaTaskInfo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaTaskInfo.java @@ -49,7 +49,7 @@ public class KafkaTaskInfo extends RoutineLoadTaskInfo { public KafkaTaskInfo(UUID id, long jobId, String clusterName, long timeoutMs, int timeoutBackOffCount, Map partitionIdToOffset, boolean isMultiTable) { - super(id, jobId, timeoutMs, timeoutBackOffCount, isMultiTable); + super(id, jobId, clusterName, timeoutMs, timeoutBackOffCount, isMultiTable); this.partitionIdToOffset = partitionIdToOffset; }