From 606f0fe4cc13760728ffb1950a5c13aaf92727e6 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Mon, 13 Jan 2025 14:27:50 +0800 Subject: [PATCH] [Fix](cloud) Remove pending delete bitmap's `lock_id` check when commit txn in MS (#46841) Related PR: https://github.com/apache/doris/pull/46039 Problem Summary: https://github.com/apache/doris/pull/46039 add a defensive check when commit_txn in MS to check whether the `lock_id` of pending delete bitmaps on tablets involved in the txn is the current txn's `lock_id`. But this may report a false negative in the following circumstance: 1. heavy schema change begins and add shadow index to table. 2. txn A load data to base index and shadow index. 3. txn A write its pending delete bitmaps on MS. This includes tablets of base index and shadow index. 4. txn A failed to remove its pending delete bitmaps for some reson(e.g. `commit_txn()` failed due to too large value) 5. txn B load data to base index and shadow index. 6. schema change failed for some reason and **remove shadow index on table.** 7. txn B send delete bitmap calculation task to BE. **Note that this will not involved tablets under shadow index because these tablets have been dropped.** **So these tablets' pending delete bitmaps will still be txn A's**. 8. txn B commit txn on MS and find that pending delete bitmaps' `lock_id` on tablets under shadow index not match. And txn B will failed. We can see that the checks on these dropped tablets are useless so we remove the mandatory check to avoid this false negative and print a warning log instead to help locate problems. --- cloud/src/meta-service/meta_service_txn.cpp | 6 +++--- cloud/test/meta_service_test.cpp | 17 ++++++++++++++++- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/cloud/src/meta-service/meta_service_txn.cpp b/cloud/src/meta-service/meta_service_txn.cpp index d624d428ef9ea1..74092080e2f011 100644 --- a/cloud/src/meta-service/meta_service_txn.cpp +++ b/cloud/src/meta-service/meta_service_txn.cpp @@ -1252,12 +1252,12 @@ void commit_txn_immediately( return; } if (pending_info.has_lock_id() && pending_info.lock_id() != lock_id) { - code = MetaServiceCode::PENDING_DELETE_BITMAP_WRONG; - msg = fmt::format( + TEST_SYNC_POINT_CALLBACK("commit_txn:check_pending_delete_bitmap_lock_id", + &tablet_id); + LOG_WARNING( "wrong lock_id in pending delete bitmap infos, expect lock_id={}, but " "found {} tablet_id={} instance_id={}", lock_id, pending_info.lock_id(), tablet_id, instance_id); - return; } txn->remove(pending_key); diff --git a/cloud/test/meta_service_test.cpp b/cloud/test/meta_service_test.cpp index 49b1587228c912..b7004716035a46 100644 --- a/cloud/test/meta_service_test.cpp +++ b/cloud/test/meta_service_test.cpp @@ -5479,6 +5479,15 @@ TEST(MetaServiceTest, WrongPendingBitmapTest) { const std::string& cloud_unique_id); auto instance_id = get_instance_id(meta_service->resource_mgr(), "test_cloud_unique_id"); + std::set real_wrong_pending_delete_bitmap_tablet_ids; + std::set expected_wrong_pending_delete_bitmap_tablet_ids; + auto sp = SyncPoint::get_instance(); + sp->set_call_back("commit_txn:check_pending_delete_bitmap_lock_id", [&](auto&& args) { + auto* tablet_id = try_any_cast(args[0]); + real_wrong_pending_delete_bitmap_tablet_ids.insert(*tablet_id); + }); + sp->enable_processing(); + // case: first version of rowset { int64_t txn_id = 56789; @@ -5587,6 +5596,8 @@ TEST(MetaServiceTest, WrongPendingBitmapTest) { ASSERT_TRUE(pending_info.SerializeToString(&pending_val)); txn->put(pending_key, pending_val); ASSERT_EQ(txn->commit(), TxnErrorCode::TXN_OK); + + expected_wrong_pending_delete_bitmap_tablet_ids.insert(tablet_id_base); } // commit txn @@ -5600,9 +5611,13 @@ TEST(MetaServiceTest, WrongPendingBitmapTest) { CommitTxnResponse res; meta_service->commit_txn(reinterpret_cast<::google::protobuf::RpcController*>(&cntl), &req, &res, nullptr); - ASSERT_EQ(res.status().code(), MetaServiceCode::PENDING_DELETE_BITMAP_WRONG); + ASSERT_EQ(expected_wrong_pending_delete_bitmap_tablet_ids, + real_wrong_pending_delete_bitmap_tablet_ids); } } + + SyncPoint::get_instance()->disable_processing(); + SyncPoint::get_instance()->clear_all_call_backs(); } TEST(MetaServiceTest, GetDeleteBitmapWithRetryTest1) {