From 36cd94bec8e87c6b031ab4164f5b5800d7859af6 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Mon, 7 Jul 2025 10:30:58 +0800 Subject: [PATCH] [Opt](cloud-mow) Retry to commit txn when encounter stale calc delete bitmap response regardless of status code (#52547) https://github.com/apache/doris/pull/49710 added a check in MS to prevent a stale calc delete bitmap task from wrongly updating delete bitmaps in MS. But this may cause the load to fail due to the check on FE. This PR lets FE retry committing the txn when it encounters a stale calc delete bitmap response, regardless of the task's status code, to avoid the problem. --- .../java/org/apache/doris/master/MasterImpl.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java b/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java index 213984ae982ae6..ad4f1a1bacdb09 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java +++ b/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java @@ -683,12 +683,8 @@ private void finishCalcDeleteBitmap(AgentTask task, TFinishTaskRequest request) // and if meta is missing, we no longer need to resend this task try { CalcDeleteBitmapTask calcDeleteBitmapTask = (CalcDeleteBitmapTask) task; - if (request.getTaskStatus().getStatusCode() != TStatusCode.OK) { - calcDeleteBitmapTask.countDownToZero(request.getTaskStatus().getStatusCode(), - "backend: " + task.getBackendId() + ", error_tablet_size: " - + request.getErrorTabletIdsSize() + ", err_msg: " - + request.getTaskStatus().getErrorMsgs().toString()); - } else if (request.isSetRespPartitions() + // check if the request is stale first, if so, let it retry regardless of the status code + if (request.isSetRespPartitions() && calcDeleteBitmapTask.isFinishRequestStale(request.getRespPartitions())) { LOG.warn("get staled response from backend: {}, report version: {}. calcDeleteBitmapTask's" + "partitionInfos: {}. 
response's partitionInfos: {}", task.getBackendId(), @@ -699,6 +695,11 @@ private void finishCalcDeleteBitmap(AgentTask task, TFinishTaskRequest request) calcDeleteBitmapTask.countDownToZero(TStatusCode.DELETE_BITMAP_LOCK_ERROR, "get staled response from backend " + task.getBackendId() + ", report version: " + request.getReportVersion()); + } else if (request.getTaskStatus().getStatusCode() != TStatusCode.OK) { + calcDeleteBitmapTask.countDownToZero(request.getTaskStatus().getStatusCode(), + "backend: " + task.getBackendId() + ", error_tablet_size: " + request.getErrorTabletIdsSize() + + ", error_tablets: " + request.getErrorTabletIds() + + ", err_msg: " + request.getTaskStatus().getErrorMsgs().toString()); } else { calcDeleteBitmapTask.countDownLatch(task.getBackendId(), calcDeleteBitmapTask.getTransactionId()); if (LOG.isDebugEnabled()) {