From a39bde40b9d1897b36683af92eece32ea9adbe82 Mon Sep 17 00:00:00 2001 From: yujun777 Date: Thu, 9 Nov 2023 15:53:39 +0800 Subject: [PATCH 1/4] fix wait publish txn too long --- .../transaction/DatabaseTransactionMgr.java | 9 +++----- .../transaction/PublishVersionDaemon.java | 2 +- .../doris/transaction/TransactionState.java | 21 +++++++++---------- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java index e6d266e43eeca8..b6c271d28c2be0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java @@ -944,10 +944,10 @@ public void finishTransaction(long transactionId) throws UserException { Map publishTasks = transactionState.getPublishVersionTasks(); long now = System.currentTimeMillis(); - long firstPublishOneSuccTime = transactionState.getFirstPublishOneSuccTime(); + long firstPublishVersionTime = transactionState.getFirstPublishVersionTime(); boolean allowPublishOneSucc = false; - if (Config.publish_wait_time_second > 0 && firstPublishOneSuccTime > 0 - && now >= firstPublishOneSuccTime + Config.publish_wait_time_second * 1000L) { + if (Config.publish_wait_time_second > 0 && firstPublishVersionTime > 0 + && now >= firstPublishVersionTime + Config.publish_wait_time_second * 1000L) { allowPublishOneSucc = true; } @@ -1100,9 +1100,6 @@ public void finishTransaction(long transactionId) throws UserException { } } } - if (allTabletsLeastOneSucc && firstPublishOneSuccTime <= 0) { - transactionState.setFirstPublishOneSuccTime(now); - } if (publishResult == PublishResult.FAILED) { return; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/PublishVersionDaemon.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/PublishVersionDaemon.java index 922e8645d9f17a..55265411300313 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/transaction/PublishVersionDaemon.java +++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/PublishVersionDaemon.java @@ -121,7 +121,7 @@ private void publishVersion() { batchTask.addTask(task); transactionState.addPublishVersionTask(backendId, task); } - transactionState.setHasSendTask(true); + transactionState.setSendedTask(); LOG.info("send publish tasks for transaction: {}, db: {}", transactionState.getTransactionId(), transactionState.getDbId()); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/TransactionState.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/TransactionState.java index 2b01a612534c3f..c4c7354a1810df 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/transaction/TransactionState.java +++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/TransactionState.java @@ -224,9 +224,9 @@ public String toString() { private TransactionStatus preStatus = null; // When publish txn, if every tablet has at least 1 replica published succ, but not quorum replicas succ, - // and time since firstPublishOneSuccTime has exceeds Config.publish_wait_time_second, + // and time since firstPublishVersionTime has exceeds Config.publish_wait_time_second, // then this transaction will become visible. - private long firstPublishOneSuccTime = -1; + private long firstPublishVersionTime = -1; @SerializedName(value = "callbackId") private long callbackId = -1; @@ -339,13 +339,16 @@ public void addPublishVersionTask(Long backendId, PublishVersionTask task) { this.publishVersionTasks.put(backendId, task); } - public void setHasSendTask(boolean hasSendTask) { - this.hasSendTask = hasSendTask; - this.publishVersionTime = System.currentTimeMillis(); + public void setSendedTask() { + this.hasSendTask = true; + updateSendTaskTime(); } public void updateSendTaskTime() { this.publishVersionTime = System.currentTimeMillis(); + if (this.firstPublishVersionTime <= 0) { + this.firstPublishVersionTime = publishVersionTime; + } } public long getPublishVersionTime() { @@ -420,12 +423,8 @@ public String getErrorLogUrl() { return errorLogUrl; } - public long getFirstPublishOneSuccTime() { - return firstPublishOneSuccTime; - } - - public void setFirstPublishOneSuccTime(long firstPublishOneSuccTime) { - this.firstPublishOneSuccTime = firstPublishOneSuccTime; + public long getFirstPublishVersionTime() { + return firstPublishVersionTime; } public void setTransactionStatus(TransactionStatus transactionStatus) { From 033765ace429fcd08f479ddcdb28e88d0c9b0cac Mon Sep 17 00:00:00 2001 From: yujun777 Date: Thu, 9 Nov 2023 16:13:45 +0800 Subject: [PATCH 2/4] fix wait publish txn too long --- .../transaction/DatabaseTransactionMgr.java | 2 +- .../transaction/PublishVersionDaemon.java | 2 +- .../doris/transaction/TransactionState.java | 21 ++++++++++--------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java index b6c271d28c2be0..4016b07912d7ed 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java @@ -294,7 +294,7 @@ private void getTxnStateInfo(TransactionState txnState, List info) { info.add(TimeUtils.longToTimeString(txnState.getPrepareTime())); info.add(TimeUtils.longToTimeString(txnState.getPreCommitTime())); info.add(TimeUtils.longToTimeString(txnState.getCommitTime())); - info.add(TimeUtils.longToTimeString(txnState.getPublishVersionTime())); + info.add(TimeUtils.longToTimeString(txnState.getLastPublishVersionTime())); info.add(TimeUtils.longToTimeString(txnState.getFinishTime())); info.add(txnState.getReason()); info.add(String.valueOf(txnState.getErrorReplicas().size())); diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/PublishVersionDaemon.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/PublishVersionDaemon.java index 55265411300313..06fa71303c14db 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/transaction/PublishVersionDaemon.java +++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/PublishVersionDaemon.java @@ -174,7 +174,7 @@ private void publishVersion() { AgentTaskQueue.removeTask(task.getBackendId(), TTaskType.PUBLISH_VERSION, task.getSignature()); } if (MetricRepo.isInit) { - long publishTime = transactionState.getPublishVersionTime() - transactionState.getCommitTime(); + long publishTime = transactionState.getLastPublishVersionTime() - transactionState.getCommitTime(); MetricRepo.HISTO_TXN_PUBLISH_LATENCY.update(publishTime); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/TransactionState.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/TransactionState.java index c4c7354a1810df..17e9f53d6091d1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/transaction/TransactionState.java +++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/TransactionState.java @@ -220,7 +220,6 @@ public String toString() { // this state need not be serialized private Map publishVersionTasks; private boolean hasSendTask; - private long publishVersionTime = -1; private TransactionStatus preStatus = null; // When publish txn, if every tablet has at least 1 replica published succ, but not quorum replicas succ, @@ -228,6 +227,8 @@ public String toString() { // then this transaction will become visible. private long firstPublishVersionTime = -1; + private long lastPublishVersionTime = -1; + @SerializedName(value = "callbackId") private long callbackId = -1; @@ -345,14 +346,18 @@ public void setSendedTask() { } public void updateSendTaskTime() { - this.publishVersionTime = System.currentTimeMillis(); + this.lastPublishVersionTime = System.currentTimeMillis(); if (this.firstPublishVersionTime <= 0) { - this.firstPublishVersionTime = publishVersionTime; + this.firstPublishVersionTime = lastPublishVersionTime; } } - public long getPublishVersionTime() { - return this.publishVersionTime; + public long getFirstPublishVersionTime() { + return firstPublishVersionTime; + } + + public long getLastPublishVersionTime() { + return this.lastPublishVersionTime; } public boolean hasSendTask() { @@ -423,10 +428,6 @@ public String getErrorLogUrl() { return errorLogUrl; } - public long getFirstPublishVersionTime() { - return firstPublishVersionTime; - } - public void setTransactionStatus(TransactionStatus transactionStatus) { // status changed this.preStatus = this.transactionStatus; @@ -645,7 +646,7 @@ public boolean isPublishTimeout() { if (prolongPublishTimeout) { timeoutMillis *= 2; } - return System.currentTimeMillis() - publishVersionTime > timeoutMillis; + return System.currentTimeMillis() - lastPublishVersionTime > timeoutMillis; } public void prolongPublishTimeout() { From 68fbd045a625873eaf0a2fdab53fb2217b189f05 Mon Sep 17 00:00:00 2001 From: yujun777 Date: Thu, 9 Nov 2023 16:41:05 +0800 Subject: [PATCH 3/4] remove nouse code --- .../org/apache/doris/transaction/DatabaseTransactionMgr.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java index 4016b07912d7ed..fe0009aa3b4780 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java @@ -970,7 +970,6 @@ public void finishTransaction(long transactionId) throws UserException { tableList = MetaLockUtils.writeLockTablesIfExist(tableList); PublishResult publishResult = PublishResult.QUORUM_SUCC; try { - boolean allTabletsLeastOneSucc = true; Iterator tableCommitInfoIterator = transactionState.getIdToTableCommitInfos().values().iterator(); while (tableCommitInfoIterator.hasNext()) { @@ -1058,10 +1057,6 @@ public void finishTransaction(long transactionId) throws UserException { continue; } - if (healthReplicaNum == 0) { - allTabletsLeastOneSucc = false; - } - String writeDetail = getTabletWriteDetail(tabletSuccReplicas, tabletWriteFailedReplicas, tabletVersionFailedReplicas); if (allowPublishOneSucc && healthReplicaNum > 0) { From 85e4bc7e7a49a67da35a0bd40dfa716f8e019919 Mon Sep 17 00:00:00 2001 From: yujun777 Date: Thu, 9 Nov 2023 18:47:47 +0800 Subject: [PATCH 4/4] add test --- .../load/insert/test_publish_one_succ.out | 10 +++++ .../load/insert/test_publish_one_succ.groovy | 45 +++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 regression-test/data/load/insert/test_publish_one_succ.out create mode 100644 regression-test/suites/load/insert/test_publish_one_succ.groovy diff --git a/regression-test/data/load/insert/test_publish_one_succ.out b/regression-test/data/load/insert/test_publish_one_succ.out new file mode 100644 index 00000000000000..c82f0c7f1dac34 --- /dev/null +++ b/regression-test/data/load/insert/test_publish_one_succ.out @@ -0,0 +1,10 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_1 -- + +-- !select_2 -- +1 10 +2 20 +3 30 +4 40 +5 50 + diff --git a/regression-test/suites/load/insert/test_publish_one_succ.groovy b/regression-test/suites/load/insert/test_publish_one_succ.groovy new file mode 100644 index 00000000000000..4e331b5c8d2515 --- /dev/null +++ b/regression-test/suites/load/insert/test_publish_one_succ.groovy @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType + +suite('test_publish_one_succ') { + def options = new ClusterOptions() + options.enableDebugPoints() + docker(options) { + cluster.injectDebugPoints(NodeType.FE, ['PublishVersionDaemon.stop_publish':null]) + + sql 'SET GLOBAL insert_visible_timeout_ms = 1000' + sql "ADMIN SET FRONTEND CONFIG ('publish_wait_time_second' = '1000000')" + sql 'CREATE TABLE tbl (k1 int, k2 int)' + for (def i = 1; i <= 5; i++) { + sql "INSERT INTO tbl VALUES (${i}, ${10 * i})" + } + + cluster.stopBackends(2, 3) + cluster.checkBeIsAlive(2, false) + cluster.checkBeIsAlive(3, false) + cluster.clearFrontendDebugPoints() + + sleep(1000) + order_qt_select_1 'SELECT * FROM tbl' + sql "ADMIN SET FRONTEND CONFIG ('publish_wait_time_second' = '2')" + sleep(2000) + order_qt_select_2 'SELECT * FROM tbl' + } +}