From 309081292144e030681152d27bc0b1fdfc0df4d7 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Mon, 10 Mar 2025 17:22:45 +0800 Subject: [PATCH 1/3] add re-produce case --- be/src/cloud/cloud_tablet.cpp | 2 + ...est_cloud_publish_skip_calc_cache_miss.out | 15 ++++ ..._cloud_publish_skip_calc_cache_miss.groovy | 78 +++++++++++++++++++ 3 files changed, 95 insertions(+) create mode 100644 regression-test/data/fault_injection_p0/cloud/test_cloud_publish_skip_calc_cache_miss.out create mode 100644 regression-test/suites/fault_injection_p0/cloud/test_cloud_publish_skip_calc_cache_miss.groovy diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index 5a367810d0698e..8ce45ac3c235b5 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -732,6 +732,8 @@ Status CloudTablet::save_delete_bitmap(const TabletTxnInfo* txn_info, int64_t tx DBUG_EXECUTE_IF("CloudTablet::save_delete_bitmap.injected_error", { auto retry = dp->param("retry", false); + auto sleep_sec = dp->param("sleep", 0); + std::this_thread::sleep_for(std::chrono::seconds(sleep_sec)); if (retry) { // return DELETE_BITMAP_LOCK_ERROR to let it retry return Status::Error( "injected DELETE_BITMAP_LOCK_ERROR"); diff --git a/regression-test/data/fault_injection_p0/cloud/test_cloud_publish_skip_calc_cache_miss.out b/regression-test/data/fault_injection_p0/cloud/test_cloud_publish_skip_calc_cache_miss.out new file mode 100644 index 00000000000000..e6b0aec165c456 --- /dev/null +++ b/regression-test/data/fault_injection_p0/cloud/test_cloud_publish_skip_calc_cache_miss.out @@ -0,0 +1,15 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +1 1 1 +2 2 2 +3 3 3 + +-- !dup_key_count -- +1 + +-- !sql -- +1 1 1 +1 999 999 +2 2 2 +3 3 3 + diff --git a/regression-test/suites/fault_injection_p0/cloud/test_cloud_publish_skip_calc_cache_miss.groovy b/regression-test/suites/fault_injection_p0/cloud/test_cloud_publish_skip_calc_cache_miss.groovy new file mode 100644 index 00000000000000..b9e257236b3df5 --- /dev/null +++ b/regression-test/suites/fault_injection_p0/cloud/test_cloud_publish_skip_calc_cache_miss.groovy @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_cloud_publish_skip_calc_cache_miss", "nonConcurrent") { + if (!isCloudMode()) { + return + } + + def customFeConfig = [ + delete_bitmap_lock_expiration_seconds : 10, + calculate_delete_bitmap_task_timeout_seconds : 5, + ] + + setFeConfigTemporary(customFeConfig) { + + def table1 = "test_cloud_publish_skip_calc_cache_miss" + sql "DROP TABLE IF EXISTS ${table1} FORCE;" + sql """ CREATE TABLE IF NOT EXISTS ${table1} ( + `k1` int NOT NULL, + `c1` int, + `c2` int + )UNIQUE KEY(k1) + DISTRIBUTED BY HASH(k1) BUCKETS 1 + PROPERTIES ( + "enable_unique_key_merge_on_write" = "true", + "disable_auto_compaction" = "true", + "replication_num" = "1"); """ + + sql "insert into ${table1} values(1,1,1);" + sql "insert into ${table1} values(2,2,2);" + sql "insert into ${table1} values(3,3,3);" + sql "sync;" + order_qt_sql "select * from ${table1};" + + try { + GetDebugPoint().clearDebugPointsForAllFEs() + GetDebugPoint().clearDebugPointsForAllBEs() + + GetDebugPoint().enableDebugPointForAllBEs("CloudTxnDeleteBitmapCache::get_delete_bitmap.cache_miss") + // inject failure after saving its result in BE's delete bitmap cache successfully, so that later retry will + // try to skip to calculate + GetDebugPoint().enableDebugPointForAllBEs("CloudTablet::save_delete_bitmap.injected_error", [retry: true, sleep: 5]) + + def t1 = Thread.start { + sql "insert into ${table1} values(1,999,999);" + } + + Thread.sleep(3000) + + GetDebugPoint().disableDebugPointForAllBEs("CloudEngineCalcDeleteBitmapTask.handle.inject_sleep") + + t1.join() + + qt_dup_key_count "select count() from (select k1,count() as cnt from ${table1} group by k1 having cnt > 1) A;" + qt_sql "select * from ${table1} order by k1,c1,c2;" + } catch(Exception e) { + logger.info(e.getMessage()) + throw e + } finally { + GetDebugPoint().clearDebugPointsForAllBEs() + GetDebugPoint().clearDebugPointsForAllFEs() + } + } +} From cb5a55924f7b58945e8952123df0c9345863e634 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Mon, 10 Mar 2025 17:35:23 +0800 Subject: [PATCH 2/3] fix and correct case --- be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp | 6 ++++-- be/src/cloud/cloud_txn_delete_bitmap_cache.cpp | 11 ++++++++++- be/src/cloud/cloud_txn_delete_bitmap_cache.h | 2 +- .../cloud/test_cloud_publish_skip_calc_cache_miss.out | 3 +-- .../test_cloud_publish_skip_calc_cache_miss.groovy | 2 +- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp index 06ebf249edb0ef..7eb4c36145f696 100644 --- a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp +++ b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp @@ -280,9 +280,10 @@ Status CloudTabletCalcDeleteBitmapTask::_handle_rowset( std::shared_ptr publish_status; int64_t txn_expiration; TxnPublishInfo previous_publish_info; + bool cache_hit {false}; Status status = _engine.txn_delete_bitmap_cache().get_tablet_txn_info( transaction_id, _tablet_id, &rowset, &delete_bitmap, &rowset_ids, &txn_expiration, - &partial_update_info, &publish_status, &previous_publish_info); + &partial_update_info, &publish_status, &previous_publish_info, &cache_hit); if (status != Status::OK()) { LOG(WARNING) << "failed to get tablet txn info. tablet_id=" << _tablet_id << ", " << txn_str << ", status=" << status; @@ -301,7 +302,8 @@ Status CloudTabletCalcDeleteBitmapTask::_handle_rowset( .base_compaction_cnt = _ms_base_compaction_cnt, .cumulative_compaction_cnt = _ms_cumulative_compaction_cnt, .cumulative_point = _ms_cumulative_point}; - if (txn_info.publish_status && (*(txn_info.publish_status) == PublishStatus::SUCCEED) && + if (cache_hit && txn_info.publish_status && + (*(txn_info.publish_status) == PublishStatus::SUCCEED) && version == previous_publish_info.publish_version && _ms_base_compaction_cnt == previous_publish_info.base_compaction_cnt && _ms_cumulative_compaction_cnt == previous_publish_info.cumulative_compaction_cnt && diff --git a/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp b/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp index 95077f5c8884c6..6d383c230bf5bd 100644 --- a/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp +++ b/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp @@ -56,7 +56,8 @@ Status CloudTxnDeleteBitmapCache::get_tablet_txn_info( TTransactionId transaction_id, int64_t tablet_id, RowsetSharedPtr* rowset, DeleteBitmapPtr* delete_bitmap, RowsetIdUnorderedSet* rowset_ids, int64_t* txn_expiration, std::shared_ptr* partial_update_info, - std::shared_ptr* publish_status, TxnPublishInfo* previous_publish_info) { + std::shared_ptr* publish_status, TxnPublishInfo* previous_publish_info, + bool* cache_hit) { { std::shared_lock rlock(_rwlock); TxnKey key(transaction_id, tablet_id); @@ -75,6 +76,14 @@ Status CloudTxnDeleteBitmapCache::get_tablet_txn_info( auto st = get_delete_bitmap(transaction_id, tablet_id, delete_bitmap, rowset_ids, nullptr); + if (cache_hit != nullptr) { + if (st.ok()) { + *cache_hit = true; + } else { + *cache_hit = false; + } + } + if (st.is()) { // Because of the rowset_ids become empty, all delete bitmap // will be recalculate in CalcDeleteBitmapTask diff --git a/be/src/cloud/cloud_txn_delete_bitmap_cache.h b/be/src/cloud/cloud_txn_delete_bitmap_cache.h index 91a0531c60ae04..a12b40ac51beae 100644 --- a/be/src/cloud/cloud_txn_delete_bitmap_cache.h +++ b/be/src/cloud/cloud_txn_delete_bitmap_cache.h @@ -43,7 +43,7 @@ class CloudTxnDeleteBitmapCache : public LRUCachePolicy { RowsetIdUnorderedSet* rowset_ids, int64_t* txn_expiration, std::shared_ptr* partial_update_info, std::shared_ptr* publish_status, - TxnPublishInfo* previous_publish_info); + TxnPublishInfo* previous_publish_info, bool* cache_hit); void set_tablet_txn_info(TTransactionId transaction_id, int64_t tablet_id, DeleteBitmapPtr delete_bitmap, const RowsetIdUnorderedSet& rowset_ids, diff --git a/regression-test/data/fault_injection_p0/cloud/test_cloud_publish_skip_calc_cache_miss.out b/regression-test/data/fault_injection_p0/cloud/test_cloud_publish_skip_calc_cache_miss.out index e6b0aec165c456..5444c51dbc883d 100644 --- a/regression-test/data/fault_injection_p0/cloud/test_cloud_publish_skip_calc_cache_miss.out +++ b/regression-test/data/fault_injection_p0/cloud/test_cloud_publish_skip_calc_cache_miss.out @@ -5,10 +5,9 @@ 3 3 3 -- !dup_key_count -- -1 +0 -- !sql -- -1 1 1 1 999 999 2 2 2 3 3 3 diff --git a/regression-test/suites/fault_injection_p0/cloud/test_cloud_publish_skip_calc_cache_miss.groovy b/regression-test/suites/fault_injection_p0/cloud/test_cloud_publish_skip_calc_cache_miss.groovy index b9e257236b3df5..f11c8cc790f14b 100644 --- a/regression-test/suites/fault_injection_p0/cloud/test_cloud_publish_skip_calc_cache_miss.groovy +++ b/regression-test/suites/fault_injection_p0/cloud/test_cloud_publish_skip_calc_cache_miss.groovy @@ -61,7 +61,7 @@ suite("test_cloud_publish_skip_calc_cache_miss", "nonConcurrent") { Thread.sleep(3000) - GetDebugPoint().disableDebugPointForAllBEs("CloudEngineCalcDeleteBitmapTask.handle.inject_sleep") + GetDebugPoint().disableDebugPointForAllBEs("CloudTablet::save_delete_bitmap.injected_error") t1.join() From 569dbebc7ebfc9821e59a482071bc5bd2c568cf8 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Tue, 11 Mar 2025 17:21:02 +0800 Subject: [PATCH 3/3] fix --- .../cloud/cloud_engine_calc_delete_bitmap_task.cpp | 6 ++---- be/src/cloud/cloud_txn_delete_bitmap_cache.cpp | 14 ++++---------- be/src/cloud/cloud_txn_delete_bitmap_cache.h | 2 +- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp index 7eb4c36145f696..06ebf249edb0ef 100644 --- a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp +++ b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp @@ -280,10 +280,9 @@ Status CloudTabletCalcDeleteBitmapTask::_handle_rowset( std::shared_ptr publish_status; int64_t txn_expiration; TxnPublishInfo previous_publish_info; - bool cache_hit {false}; Status status = _engine.txn_delete_bitmap_cache().get_tablet_txn_info( transaction_id, _tablet_id, &rowset, &delete_bitmap, &rowset_ids, &txn_expiration, - &partial_update_info, &publish_status, &previous_publish_info, &cache_hit); + &partial_update_info, &publish_status, &previous_publish_info); if (status != Status::OK()) { LOG(WARNING) << "failed to get tablet txn info. tablet_id=" << _tablet_id << ", " << txn_str << ", status=" << status; @@ -302,8 +301,7 @@ Status CloudTabletCalcDeleteBitmapTask::_handle_rowset( .base_compaction_cnt = _ms_base_compaction_cnt, .cumulative_compaction_cnt = _ms_cumulative_compaction_cnt, .cumulative_point = _ms_cumulative_point}; - if (cache_hit && txn_info.publish_status && - (*(txn_info.publish_status) == PublishStatus::SUCCEED) && + if (txn_info.publish_status && (*(txn_info.publish_status) == PublishStatus::SUCCEED) && version == previous_publish_info.publish_version && _ms_base_compaction_cnt == previous_publish_info.base_compaction_cnt && _ms_cumulative_compaction_cnt == previous_publish_info.cumulative_compaction_cnt && diff --git a/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp b/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp index 6d383c230bf5bd..9a8b669e0ae2d9 100644 --- a/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp +++ b/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp @@ -56,8 +56,7 @@ Status CloudTxnDeleteBitmapCache::get_tablet_txn_info( TTransactionId transaction_id, int64_t tablet_id, RowsetSharedPtr* rowset, DeleteBitmapPtr* delete_bitmap, RowsetIdUnorderedSet* rowset_ids, int64_t* txn_expiration, std::shared_ptr* partial_update_info, - std::shared_ptr* publish_status, TxnPublishInfo* previous_publish_info, - bool* cache_hit) { + std::shared_ptr* publish_status, TxnPublishInfo* previous_publish_info) { { std::shared_lock rlock(_rwlock); TxnKey key(transaction_id, tablet_id); @@ -76,20 +75,15 @@ Status CloudTxnDeleteBitmapCache::get_tablet_txn_info( auto st = get_delete_bitmap(transaction_id, tablet_id, delete_bitmap, rowset_ids, nullptr); - if (cache_hit != nullptr) { - if (st.ok()) { - *cache_hit = true; - } else { - *cache_hit = false; - } - } - if (st.is()) { // Because of the rowset_ids become empty, all delete bitmap // will be recalculate in CalcDeleteBitmapTask if (delete_bitmap != nullptr) { *delete_bitmap = std::make_shared(tablet_id); } + // to avoid to skip calculating + **publish_status = PublishStatus::INIT; + return Status::OK(); } return st; diff --git a/be/src/cloud/cloud_txn_delete_bitmap_cache.h b/be/src/cloud/cloud_txn_delete_bitmap_cache.h index a12b40ac51beae..91a0531c60ae04 100644 --- a/be/src/cloud/cloud_txn_delete_bitmap_cache.h +++ b/be/src/cloud/cloud_txn_delete_bitmap_cache.h @@ -43,7 +43,7 @@ class CloudTxnDeleteBitmapCache : public LRUCachePolicy { RowsetIdUnorderedSet* rowset_ids, int64_t* txn_expiration, std::shared_ptr* partial_update_info, std::shared_ptr* publish_status, - TxnPublishInfo* previous_publish_info, bool* cache_hit); + TxnPublishInfo* previous_publish_info); void set_tablet_txn_info(TTransactionId transaction_id, int64_t tablet_id, DeleteBitmapPtr delete_bitmap, const RowsetIdUnorderedSet& rowset_ids,