From 9fe8b02af5f6e493a37ea076f354a4eb879aa2ed Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 25 Sep 2024 15:28:29 +0800 Subject: [PATCH 1/8] add case --- be/src/cloud/cloud_cumulative_compaction.cpp | 9 ++ .../cloud/cloud_txn_delete_bitmap_cache.cpp | 7 ++ ...n_get_delete_bitmap_from_cache_fail.groovy | 82 +++++++++++++++++++ 3 files changed, 98 insertions(+) create mode 100644 regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy diff --git a/be/src/cloud/cloud_cumulative_compaction.cpp b/be/src/cloud/cloud_cumulative_compaction.cpp index ea5fa7cc340158..1a2703c3e2041c 100644 --- a/be/src/cloud/cloud_cumulative_compaction.cpp +++ b/be/src/cloud/cloud_cumulative_compaction.cpp @@ -255,6 +255,15 @@ Status CloudCumulativeCompaction::modify_rowsets() { compaction_job->add_txn_id(_output_rowset->txn_id()); compaction_job->add_output_rowset_ids(_output_rowset->rowset_id().to_string()); + DBUG_EXECUTE_IF("CloudCumulativeCompaction::modify_rowsets.enable_spin_wait", { + auto token = dp->param("token", "invalid_token"); + LOG(INFO) << "CloudCumulativeCompaction::modify_rowsets.enable_spin_wait, start"; + while (DebugPoints::instance()->is_enable("CloudCumulativeCompaction::modify_rowsets.block")) { + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + LOG(INFO) << "CloudCumulativeCompaction::modify_rowsets.enable_spin_wait, exit"; + }); + DeleteBitmapPtr output_rowset_delete_bitmap = nullptr; int64_t initiator = HashUtil::hash64(_uuid.data(), _uuid.size(), 0) & std::numeric_limits::max(); diff --git a/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp b/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp index 63a21bc0714beb..cf64bcd8cb0042 100644 --- a/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp +++ b/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp @@ -95,6 +95,13 @@ Status CloudTxnDeleteBitmapCache::get_delete_bitmap( CacheKey key(key_str); Cache::Handle* handle = lookup(key); + DBUG_EXECUTE_IF("CloudTxnDeleteBitmapCache::get_delete_bitmap.cache_miss", { + handle = nullptr; + LOG(INFO) << "CloudTxnDeleteBitmapCache::get_delete_bitmap.cache_miss, make cache missed " + "when get delete bitmap, txn_id:" + << transaction_id << ", tablet_id: " << tablet_id; + }); + DeleteBitmapCacheValue* val = handle == nullptr ? nullptr : reinterpret_cast(value(handle)); if (val) { diff --git a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy new file mode 100644 index 00000000000000..deb68cf4060a04 --- /dev/null +++ b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail", "nonConcurrent") { + if (!isCloudMode()) { + return + } + + def tableName = "test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail" + sql """ DROP TABLE IF EXISTS ${tableName} """ + sql """ CREATE TABLE ${tableName} + (k int, v1 int, v2 int ) + UNIQUE KEY(k) + DISTRIBUTED BY HASH (k) + BUCKETS 1 PROPERTIES( + "replication_num" = "1", + "enable_unique_key_merge_on_write"="true", + "disable_auto_compaction" = "true"); """ + + sql "insert into ${tableName} values(1,1,1);" + sql "insert into ${tableName} values(2,2,2);" + sql "insert into ${tableName} values(3,3,3);" + sql "insert into ${tableName} values(4,4,4);" + sql "insert into ${tableName} values(5,5,5);" + sql "sync;" + order_qt_sql "select * from ${tableName};" + + GetDebugPoint().clearDebugPointsForAllFEs() + GetDebugPoint().clearDebugPointsForAllBEs() + + try { + def inject_spin_wait = 'CloudCumulativeCompaction::modify_rowsets.enable_spin_wait' + def inject_spin_block = 'CloudCumulativeCompaction::modify_rowsets.block' + def inject_cache_miss = 'CloudTxnDeleteBitmapCache::get_delete_bitmap.cache_miss' + def injectBe = null + def backends = sql_return_maparray('show backends') + def array = sql_return_maparray("SHOW TABLETS FROM ${tableName}") + def injectBeId = array[0].BackendId + def tabletId = array[0].TabletId + injectBe = backends.stream().filter(be -> be.BackendId == injectBeId).findFirst().orElse(null) + + DebugPoint.enableDebugPoint(injectBe.Host, injectBe.HttpPort.toInteger(), NodeType.BE, inject_spin_wait) + DebugPoint.enableDebugPoint(injectBe.Host, injectBe.HttpPort.toInteger(), NodeType.BE, inject_spin_block) + logger.info("run compaction:" + originTabletId) + (code, out, err) = be_run_cumulative_compaction(injectBe.Host, injectBe.HttpPort, tabletId) + logger.info("Run compaction: code=" + code + ", out=" + out + ", err=" + err) + + // Concurrent inserts + sql "insert into ${tableName} values(1,2,3);" + sql "insert into ${tableName} values(2,3,4);" + sql "insert into ${tableName} values(3,4,5);" + sql "sync;" + order_qt_sql "set use_fix_replica=0; select * from ${tableName};" + + // let compaction continue + DebugPoint.enableDebugPoint(injectBe.Host, injectBe.HttpPort.toInteger(), NodeType.BE, inject_cache_miss) + DebugPoint.disableDebugPoint(injectBe.Host, injectBe.HttpPort.toInteger(), NodeType.BE, inject_spin_block) + + order_qt_sql "set use_fix_replica=0; select * from ${tableName};" + } catch (Exception e) { + logger.info(e.getMessage()) + AssertTrue(false) + } finally { + GetDebugPoint().clearDebugPointsForAllFEs() + GetDebugPoint().clearDebugPointsForAllBEs() + } +} + From 07cc34e0ceab832daf1729839bdb19326e733e5e Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 25 Sep 2024 16:08:53 +0800 Subject: [PATCH 2/8] update --- ...oud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy index deb68cf4060a04..4b493c507c84fc 100644 --- a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy +++ b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy @@ -29,7 +29,8 @@ suite("test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail", "nonConcurr BUCKETS 1 PROPERTIES( "replication_num" = "1", "enable_unique_key_merge_on_write"="true", - "disable_auto_compaction" = "true"); """ + "disable_auto_compaction" = "true"); + """ sql "insert into ${tableName} values(1,1,1);" sql "insert into ${tableName} values(2,2,2);" From 23cdb899012a906a1fac3552ad2d8e518f7ab057 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 25 Sep 2024 20:14:39 +0800 Subject: [PATCH 3/8] update --- ...tion_get_delete_bitmap_from_cache_fail.groovy | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy index 4b493c507c84fc..e07ae3c6f063df 100644 --- a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy +++ b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +import org.apache.doris.regression.util.DebugPoint +import org.apache.doris.regression.util.NodeType + suite("test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail", "nonConcurrent") { if (!isCloudMode()) { return @@ -71,10 +74,21 @@ suite("test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail", "nonConcurr DebugPoint.enableDebugPoint(injectBe.Host, injectBe.HttpPort.toInteger(), NodeType.BE, inject_cache_miss) DebugPoint.disableDebugPoint(injectBe.Host, injectBe.HttpPort.toInteger(), NodeType.BE, inject_spin_block) + do { + Thread.sleep(100) + (code, out, err) = be_get_compaction_status(injectBe.Host, injectBe.HttpPort, tabletId) + logger.info("Get compaction status: code=" + code + ", out=" + out + ", err=" + err) + assertEquals(code, 0) + def compactionStatus = parseJson(out.trim()) + assertEquals("success", compactionStatus.status.toLowerCase()) + running = compactionStatus.run_status + } while (running) + + Thread.sleep(200) order_qt_sql "set use_fix_replica=0; select * from ${tableName};" } catch (Exception e) { logger.info(e.getMessage()) - AssertTrue(false) + assertTrue(false) } finally { GetDebugPoint().clearDebugPointsForAllFEs() GetDebugPoint().clearDebugPointsForAllBEs() From 5360df2846f5bc750daf783f890bb2e5b5c0cecc Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 25 Sep 2024 21:14:34 +0800 Subject: [PATCH 4/8] add debug log --- be/src/cloud/cloud_txn_delete_bitmap_cache.cpp | 2 ++ ...loud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp b/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp index cf64bcd8cb0042..ba0a68a21ff940 100644 --- a/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp +++ b/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp @@ -94,6 +94,7 @@ Status CloudTxnDeleteBitmapCache::get_delete_bitmap( std::string key_str = fmt::format("{}/{}", transaction_id, tablet_id); CacheKey key(key_str); Cache::Handle* handle = lookup(key); + LOG(INFO) << "DEBUG: get_delete_bitmap()"; DBUG_EXECUTE_IF("CloudTxnDeleteBitmapCache::get_delete_bitmap.cache_miss", { handle = nullptr; @@ -105,6 +106,7 @@ Status CloudTxnDeleteBitmapCache::get_delete_bitmap( DeleteBitmapCacheValue* val = handle == nullptr ? nullptr : reinterpret_cast(value(handle)); if (val) { + LOG(INFO) << "DEBUG: get_delete_bitmap(), found in cache"; *delete_bitmap = val->delete_bitmap; if (rowset_ids) { *rowset_ids = val->rowset_ids; diff --git a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy index e07ae3c6f063df..50f730b96bba79 100644 --- a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy +++ b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy @@ -59,7 +59,7 @@ suite("test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail", "nonConcurr DebugPoint.enableDebugPoint(injectBe.Host, injectBe.HttpPort.toInteger(), NodeType.BE, inject_spin_wait) DebugPoint.enableDebugPoint(injectBe.Host, injectBe.HttpPort.toInteger(), NodeType.BE, inject_spin_block) - logger.info("run compaction:" + originTabletId) + logger.info("run compaction:" + tabletId) (code, out, err) = be_run_cumulative_compaction(injectBe.Host, injectBe.HttpPort, tabletId) logger.info("Run compaction: code=" + code + ", out=" + out + ", err=" + err) From 50df8ef5c96d189e1802be546d69b0c2e797892e Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 25 Sep 2024 21:36:37 +0800 Subject: [PATCH 5/8] add fix code --- .../cloud/cloud_txn_delete_bitmap_cache.cpp | 23 ++++++++++++------- ...n_get_delete_bitmap_from_cache_fail.groovy | 2 +- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp b/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp index ba0a68a21ff940..55f76316fb7f26 100644 --- a/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp +++ b/be/src/cloud/cloud_txn_delete_bitmap_cache.cpp @@ -72,9 +72,18 @@ Status CloudTxnDeleteBitmapCache::get_tablet_txn_info( *publish_status = iter->second.publish_status; *previous_publish_info = iter->second.publish_info; } - RETURN_IF_ERROR( - get_delete_bitmap(transaction_id, tablet_id, delete_bitmap, rowset_ids, nullptr)); - return Status::OK(); + + auto st = get_delete_bitmap(transaction_id, tablet_id, delete_bitmap, rowset_ids, nullptr); + + if (st.is()) { + // Because of the rowset_ids become empty, all delete bitmap + // will be recalculate in CalcDeleteBitmapTask + if (delete_bitmap != nullptr) { + *delete_bitmap = std::make_shared(tablet_id); + } + return Status::OK(); + } + return st; } Status CloudTxnDeleteBitmapCache::get_delete_bitmap( @@ -94,7 +103,6 @@ Status CloudTxnDeleteBitmapCache::get_delete_bitmap( std::string key_str = fmt::format("{}/{}", transaction_id, tablet_id); CacheKey key(key_str); Cache::Handle* handle = lookup(key); - LOG(INFO) << "DEBUG: get_delete_bitmap()"; DBUG_EXECUTE_IF("CloudTxnDeleteBitmapCache::get_delete_bitmap.cache_miss", { handle = nullptr; @@ -106,7 +114,6 @@ Status CloudTxnDeleteBitmapCache::get_delete_bitmap( DeleteBitmapCacheValue* val = handle == nullptr ? nullptr : reinterpret_cast(value(handle)); if (val) { - LOG(INFO) << "DEBUG: get_delete_bitmap(), found in cache"; *delete_bitmap = val->delete_bitmap; if (rowset_ids) { *rowset_ids = val->rowset_ids; @@ -118,9 +125,9 @@ Status CloudTxnDeleteBitmapCache::get_delete_bitmap( LOG_INFO("cache missed when get delete bitmap") .tag("txn_id", transaction_id) .tag("tablet_id", tablet_id); - // Because of the rowset_ids become empty, all delete bitmap - // will be recalculate in CalcDeleteBitmapTask - *delete_bitmap = std::make_shared(tablet_id); + return Status::Error( + "cache missed when get delete bitmap, tablet_id={}, transaction_id={}", tablet_id, + transaction_id); } return Status::OK(); } diff --git a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy index 50f730b96bba79..72fda5eea2680c 100644 --- a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy +++ b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.groovy @@ -59,6 +59,7 @@ suite("test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail", "nonConcurr DebugPoint.enableDebugPoint(injectBe.Host, injectBe.HttpPort.toInteger(), NodeType.BE, inject_spin_wait) DebugPoint.enableDebugPoint(injectBe.Host, injectBe.HttpPort.toInteger(), NodeType.BE, inject_spin_block) + DebugPoint.enableDebugPoint(injectBe.Host, injectBe.HttpPort.toInteger(), NodeType.BE, inject_cache_miss) logger.info("run compaction:" + tabletId) (code, out, err) = be_run_cumulative_compaction(injectBe.Host, injectBe.HttpPort, tabletId) logger.info("Run compaction: code=" + code + ", out=" + out + ", err=" + err) @@ -71,7 +72,6 @@ suite("test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail", "nonConcurr order_qt_sql "set use_fix_replica=0; select * from ${tableName};" // let compaction continue - DebugPoint.enableDebugPoint(injectBe.Host, injectBe.HttpPort.toInteger(), NodeType.BE, inject_cache_miss) DebugPoint.disableDebugPoint(injectBe.Host, injectBe.HttpPort.toInteger(), NodeType.BE, inject_spin_block) do { From 4e9a20849ce3180608a1524977b4ad96e69b0838 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 25 Sep 2024 21:58:51 +0800 Subject: [PATCH 6/8] update --- be/src/cloud/cloud_cumulative_compaction.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/be/src/cloud/cloud_cumulative_compaction.cpp b/be/src/cloud/cloud_cumulative_compaction.cpp index 1a2703c3e2041c..b70cc40f750057 100644 --- a/be/src/cloud/cloud_cumulative_compaction.cpp +++ b/be/src/cloud/cloud_cumulative_compaction.cpp @@ -256,7 +256,6 @@ Status CloudCumulativeCompaction::modify_rowsets() { compaction_job->add_output_rowset_ids(_output_rowset->rowset_id().to_string()); DBUG_EXECUTE_IF("CloudCumulativeCompaction::modify_rowsets.enable_spin_wait", { - auto token = dp->param("token", "invalid_token"); LOG(INFO) << "CloudCumulativeCompaction::modify_rowsets.enable_spin_wait, start"; while (DebugPoints::instance()->is_enable("CloudCumulativeCompaction::modify_rowsets.block")) { std::this_thread::sleep_for(std::chrono::milliseconds(50)); From 263b263e2292ad7c34869a70f7e79d7433770890 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 25 Sep 2024 22:01:39 +0800 Subject: [PATCH 7/8] add regression case output --- ...tion_get_delete_bitmap_from_cache_fail.out | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 regression-test/data/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.out diff --git a/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.out b/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.out new file mode 100644 index 00000000000000..f3c19536c9b830 --- /dev/null +++ b/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_compaction_get_delete_bitmap_from_cache_fail.out @@ -0,0 +1,22 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 + +-- !sql -- +1 2 3 +2 3 4 +3 4 5 +4 4 4 +5 5 5 + +-- !sql -- +1 2 3 +2 3 4 +3 4 5 +4 4 4 +5 5 5 + From 09978221b4add3434992638b88596b3bc0e4bf56 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 25 Sep 2024 22:03:06 +0800 Subject: [PATCH 8/8] reformat code --- be/src/cloud/cloud_cumulative_compaction.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/be/src/cloud/cloud_cumulative_compaction.cpp b/be/src/cloud/cloud_cumulative_compaction.cpp index b70cc40f750057..cb0c4e7f37686c 100644 --- a/be/src/cloud/cloud_cumulative_compaction.cpp +++ b/be/src/cloud/cloud_cumulative_compaction.cpp @@ -257,7 +257,8 @@ Status CloudCumulativeCompaction::modify_rowsets() { DBUG_EXECUTE_IF("CloudCumulativeCompaction::modify_rowsets.enable_spin_wait", { LOG(INFO) << "CloudCumulativeCompaction::modify_rowsets.enable_spin_wait, start"; - while (DebugPoints::instance()->is_enable("CloudCumulativeCompaction::modify_rowsets.block")) { + while (DebugPoints::instance()->is_enable( + "CloudCumulativeCompaction::modify_rowsets.block")) { std::this_thread::sleep_for(std::chrono::milliseconds(50)); } LOG(INFO) << "CloudCumulativeCompaction::modify_rowsets.enable_spin_wait, exit";