From 2ac2cf0ed7773c25ad93d3e93e5cbcd7486a3790 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Mon, 7 Jul 2025 10:30:58 +0800 Subject: [PATCH] [Opt](cloud-mow) Retry to commit txn when encounter stale calc delete bitmap response regardless of status code (#52547) https://github.com/apache/doris/pull/49710 added a check in MS to prevent a stale calc delete bitmap task from wrongly updating delete bitmaps in MS. But this may cause loads to fail due to the check on FE. This PR lets FE retry committing the txn when it encounters a stale calc delete bitmap response, regardless of the task's status code, to avoid the problem. --- .../java/org/apache/doris/master/MasterImpl.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java b/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java index 213984ae982ae6..ad4f1a1bacdb09 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java +++ b/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java @@ -683,12 +683,8 @@ private void finishCalcDeleteBitmap(AgentTask task, TFinishTaskRequest request) // and if meta is missing, we no longer need to resend this task try { CalcDeleteBitmapTask calcDeleteBitmapTask = (CalcDeleteBitmapTask) task; - if (request.getTaskStatus().getStatusCode() != TStatusCode.OK) { - calcDeleteBitmapTask.countDownToZero(request.getTaskStatus().getStatusCode(), - "backend: " + task.getBackendId() + ", error_tablet_size: " - + request.getErrorTabletIdsSize() + ", err_msg: " - + request.getTaskStatus().getErrorMsgs().toString()); - } else if (request.isSetRespPartitions() + // check if the request is stale first, if so, let it retry regardless of the status code + if (request.isSetRespPartitions() && calcDeleteBitmapTask.isFinishRequestStale(request.getRespPartitions())) { LOG.warn("get staled response from backend: {}, report version: {}. calcDeleteBitmapTask's" + "partitionInfos: {}. 
response's partitionInfos: {}", task.getBackendId(), @@ -699,6 +695,11 @@ private void finishCalcDeleteBitmap(AgentTask task, TFinishTaskRequest request) calcDeleteBitmapTask.countDownToZero(TStatusCode.DELETE_BITMAP_LOCK_ERROR, "get staled response from backend " + task.getBackendId() + ", report version: " + request.getReportVersion()); + } else if (request.getTaskStatus().getStatusCode() != TStatusCode.OK) { + calcDeleteBitmapTask.countDownToZero(request.getTaskStatus().getStatusCode(), + "backend: " + task.getBackendId() + ", error_tablet_size: " + request.getErrorTabletIdsSize() + + ", error_tablets: " + request.getErrorTabletIds() + + ", err_msg: " + request.getTaskStatus().getErrorMsgs().toString()); } else { calcDeleteBitmapTask.countDownLatch(task.getBackendId(), calcDeleteBitmapTask.getTransactionId()); if (LOG.isDebugEnabled()) {