diff --git a/cloud/src/common/config.h b/cloud/src/common/config.h index 2467e99ba1f5ab..b89bd5b363dc19 100644 --- a/cloud/src/common/config.h +++ b/cloud/src/common/config.h @@ -97,9 +97,6 @@ CONF_Int32(recycle_pool_parallelism, "40"); CONF_Bool(enable_inverted_check, "false"); // Currently only used for recycler test CONF_Bool(enable_delete_bitmap_inverted_check, "false"); -// checks if https://github.com/apache/doris/pull/40204 works as expected -CONF_Bool(enable_delete_bitmap_storage_optimize_check, "false"); -CONF_mInt64(delete_bitmap_storage_optimize_check_version_gap, "1000"); CONF_Bool(enable_delete_bitmap_storage_optimize_v2_check, "false"); CONF_mInt64(delete_bitmap_storage_optimize_v2_check_skip_seconds, "300"); // 5min // interval for scanning instances to do checks and inspections diff --git a/cloud/src/recycler/checker.cpp b/cloud/src/recycler/checker.cpp index 28243add46d77b..67e358cfa3505a 100644 --- a/cloud/src/recycler/checker.cpp +++ b/cloud/src/recycler/checker.cpp @@ -189,12 +189,6 @@ int Checker::start() { } } - if (config::enable_delete_bitmap_storage_optimize_check) { - if (int ret = checker->do_delete_bitmap_storage_optimize_check(); ret != 0) { - success = false; - } - } - if (config::enable_mow_job_key_check) { if (int ret = checker->do_mow_job_key_check(); ret != 0) { success = false; @@ -1221,113 +1215,6 @@ int InstanceChecker::do_delete_bitmap_inverted_check() { return (leaked_delete_bitmaps > 0 || abnormal_delete_bitmaps > 0) ? 1 : 0; } -int InstanceChecker::check_delete_bitmap_storage_optimize(int64_t tablet_id) { - using Version = std::pair; - struct RowsetDigest { - std::string rowset_id; - Version version; - doris::SegmentsOverlapPB segments_overlap; - - bool operator<(const RowsetDigest& other) const { - return version.first < other.version.first; - } - - bool produced_by_compaction() const { - return (version.first < version.second) || - ((version.first == version.second) && segments_overlap == NONOVERLAPPING); - } - }; - - // number of rowsets which may have problems - int64_t abnormal_rowsets_num {0}; - - std::vector tablet_rowsets {}; - // Get all visible rowsets of this tablet - auto collect_cb = [&tablet_rowsets](const doris::RowsetMetaCloudPB& rowset) { - if (rowset.start_version() == 0 && rowset.end_version() == 1) { - // ignore dummy rowset [0-1] - return; - } - tablet_rowsets.emplace_back( - rowset.rowset_id_v2(), - std::make_pair(rowset.start_version(), rowset.end_version()), - rowset.segments_overlap_pb()); - }; - if (int ret = collect_tablet_rowsets(tablet_id, collect_cb); ret != 0) { - return ret; - } - - std::sort(tablet_rowsets.begin(), tablet_rowsets.end()); - - // find right-most rowset which is produced by compaction - auto it = std::find_if( - tablet_rowsets.crbegin(), tablet_rowsets.crend(), - [](const RowsetDigest& rowset) { return rowset.produced_by_compaction(); }); - if (it == tablet_rowsets.crend()) { - LOG(INFO) << fmt::format( - "[delete bitmap checker] skip to check delete bitmap storage optimize for " - "tablet_id={} because it doesn't have compacted rowsets.", - tablet_id); - return 0; - } - - int64_t start_version = it->version.first; - int64_t pre_min_version = it->version.second; - - // after BE sweeping stale rowsets, all rowsets in this tablet before - // should not have delete bitmaps with versions lower than `pre_min_version` - if (config::delete_bitmap_storage_optimize_check_version_gap > 0) { - pre_min_version -= config::delete_bitmap_storage_optimize_check_version_gap; - if (pre_min_version <= 1) { - LOG(INFO) << fmt::format( - "[delete bitmap checker] skip to check delete bitmap storage optimize for " - "tablet_id={} because pre_min_version is too small.", - tablet_id); - return 0; - } - } - - auto check_func = [pre_min_version, instance_id = instance_id_]( - int64_t tablet_id, std::string_view rowset_id, int64_t version, - int64_t segment_id) -> int { - if (version < pre_min_version) { - LOG(WARNING) << fmt::format( - "[delete bitmap check fails] delete bitmap storage optimize check fail for " - "instance_id={}, tablet_id={}, rowset_id={}, found delete bitmap with " - "version={} < pre_min_version={}", - instance_id, tablet_id, rowset_id, version, pre_min_version); - return 1; - } - return 0; - }; - - for (const auto& rowset : tablet_rowsets) { - // check for all rowsets before the max compacted rowset - if (rowset.version.second < start_version) { - auto rowset_id = rowset.rowset_id; - int ret = traverse_rowset_delete_bitmaps(tablet_id, rowset_id, check_func); - if (ret < 0) { - return ret; - } - - if (ret != 0) { - ++abnormal_rowsets_num; - TEST_SYNC_POINT_CALLBACK( - "InstanceChecker::check_delete_bitmap_storage_optimize.get_abnormal_rowset", - &tablet_id, &rowset_id); - } - } - } - - LOG(INFO) << fmt::format( - "[delete bitmap checker] finish check delete bitmap storage optimize for " - "instance_id={}, tablet_id={}, rowsets_num={}, abnormal_rowsets_num={}, " - "pre_min_version={}", - instance_id_, tablet_id, tablet_rowsets.size(), abnormal_rowsets_num, pre_min_version); - - return (abnormal_rowsets_num > 1 ? 1 : 0); -} - int InstanceChecker::get_pending_delete_bitmap_keys( int64_t tablet_id, std::unordered_set& pending_delete_bitmaps) { std::unique_ptr txn; @@ -1520,6 +1407,9 @@ int InstanceChecker::check_delete_bitmap_storage_optimize_v2( } int InstanceChecker::do_delete_bitmap_storage_optimize_check(int version) { + if (version != 2) { + return -1; + } int64_t total_tablets_num {0}; int64_t failed_tablets_num {0}; @@ -1531,20 +1421,13 @@ int InstanceChecker::do_delete_bitmap_storage_optimize_check(int version) { int ret = traverse_mow_tablet([&](int64_t tablet_id) { ++total_tablets_num; int64_t rowsets_with_useless_delete_bitmap_version = 0; - int res = 0; - if (version == 1) { - res = check_delete_bitmap_storage_optimize(tablet_id); - } else if (version == 2) { - res = check_delete_bitmap_storage_optimize_v2( - tablet_id, rowsets_with_useless_delete_bitmap_version); - if (rowsets_with_useless_delete_bitmap_version > - max_rowsets_with_useless_delete_bitmap_version) { - max_rowsets_with_useless_delete_bitmap_version = - rowsets_with_useless_delete_bitmap_version; - tablet_id_with_max_rowsets_with_useless_delete_bitmap_version = tablet_id; - } - } else { - return -1; + int res = check_delete_bitmap_storage_optimize_v2( + tablet_id, rowsets_with_useless_delete_bitmap_version); + if (rowsets_with_useless_delete_bitmap_version > + max_rowsets_with_useless_delete_bitmap_version) { + max_rowsets_with_useless_delete_bitmap_version = + rowsets_with_useless_delete_bitmap_version; + tablet_id_with_max_rowsets_with_useless_delete_bitmap_version = tablet_id; } failed_tablets_num += (res != 0); return res; @@ -1554,20 +1437,16 @@ int InstanceChecker::do_delete_bitmap_storage_optimize_check(int version) { return ret; } - if (version == 2) { - g_bvar_max_rowsets_with_useless_delete_bitmap_version.put( - instance_id_, max_rowsets_with_useless_delete_bitmap_version); - } + g_bvar_max_rowsets_with_useless_delete_bitmap_version.put( + instance_id_, max_rowsets_with_useless_delete_bitmap_version); std::stringstream ss; ss << "[delete bitmap checker] check delete bitmap storage optimize v" << version << " for instance_id=" << instance_id_ << ", total_tablets_num=" << total_tablets_num - << ", failed_tablets_num=" << failed_tablets_num; - if (version == 2) { - ss << ". max_rowsets_with_useless_delete_bitmap_version=" - << max_rowsets_with_useless_delete_bitmap_version - << ", tablet_id=" << tablet_id_with_max_rowsets_with_useless_delete_bitmap_version; - } + << ", failed_tablets_num=" << failed_tablets_num + << ". max_rowsets_with_useless_delete_bitmap_version=" + << max_rowsets_with_useless_delete_bitmap_version + << ", tablet_id=" << tablet_id_with_max_rowsets_with_useless_delete_bitmap_version; LOG(INFO) << ss.str(); return (failed_tablets_num > 0) ? 1 : 0; diff --git a/cloud/src/recycler/checker.h b/cloud/src/recycler/checker.h index d1b909a42030d3..afa96ca5ae4cdd 100644 --- a/cloud/src/recycler/checker.h +++ b/cloud/src/recycler/checker.h @@ -103,7 +103,7 @@ class InstanceChecker { // NOTE: stale rowsets will be lost after BE restarts, so there may be some stale delete bitmaps // which will not be cleared. // version = 2 : https://github.com/apache/doris/pull/49822 - int do_delete_bitmap_storage_optimize_check(int version = 1); + int do_delete_bitmap_storage_optimize_check(int version = 2); int do_mow_job_key_check(); @@ -130,8 +130,6 @@ class InstanceChecker { const std::function& collect_cb); int get_pending_delete_bitmap_keys(int64_t tablet_id, std::unordered_set& pending_delete_bitmaps); - - int check_delete_bitmap_storage_optimize(int64_t tablet_id); int check_delete_bitmap_storage_optimize_v2(int64_t tablet_id, int64_t& abnormal_rowsets_num); std::atomic_bool stopped_ {false}; diff --git a/cloud/test/recycler_test.cpp b/cloud/test/recycler_test.cpp index 5a182223f94bc6..cd23ca53c5a232 100644 --- a/cloud/test/recycler_test.cpp +++ b/cloud/test/recycler_test.cpp @@ -3130,8 +3130,6 @@ TEST(CheckerTest, delete_bitmap_inverted_check_abnormal) { } TEST(CheckerTest, delete_bitmap_storage_optimize_check_normal) { - config::delete_bitmap_storage_optimize_check_version_gap = 0; - auto txn_kv = std::make_shared(); ASSERT_EQ(txn_kv->init(), 0); @@ -3237,73 +3235,9 @@ TEST(CheckerTest, delete_bitmap_storage_optimize_check_normal) { } ASSERT_EQ(TxnErrorCode::TXN_OK, txn->commit()); - ASSERT_EQ(checker.do_delete_bitmap_storage_optimize_check(), 0); ASSERT_EQ(checker.do_delete_bitmap_storage_optimize_check(2), 0); } -TEST(CheckerTest, delete_bitmap_storage_optimize_check_abnormal) { - config::delete_bitmap_storage_optimize_check_version_gap = 0; - // abnormal case, some rowsets' delete bitmaps are not deleted as expected - auto txn_kv = std::make_shared(); - ASSERT_EQ(txn_kv->init(), 0); - - InstanceInfoPB instance; - instance.set_instance_id(instance_id); - auto obj_info = instance.add_obj_info(); - obj_info->set_id("1"); - - InstanceChecker checker(txn_kv, instance_id); - ASSERT_EQ(checker.init(instance), 0); - auto accessor = checker.accessor_map_.begin()->second; - - // tablet_id -> [rowset_id] - std::map> expected_abnormal_rowsets {}; - std::map> real_abnormal_rowsets {}; - auto sp = SyncPoint::get_instance(); - DORIS_CLOUD_DEFER { - SyncPoint::get_instance()->clear_all_call_backs(); - }; - sp->set_call_back("InstanceChecker::check_delete_bitmap_storage_optimize.get_abnormal_rowset", - [&real_abnormal_rowsets](auto&& args) { - int64_t tablet_id = *try_any_cast(args[0]); - std::string rowset_id = *try_any_cast(args[1]); - real_abnormal_rowsets[tablet_id].insert(rowset_id); - }); - sp->enable_processing(); - - std::unique_ptr txn; - ASSERT_EQ(TxnErrorCode::TXN_OK, txn_kv->create_txn(&txn)); - - constexpr int table_id = 10000, index_id = 10001, partition_id = 10002; - - int64_t rowset_start_id = 700; - for (int tablet_id = 900001; tablet_id <= 900005; ++tablet_id) { - ASSERT_EQ(0, - create_tablet(txn_kv.get(), table_id, index_id, partition_id, tablet_id, true)); - std::vector> rowset_vers {{2, 2}, {3, 3}, {4, 4}, {5, 5}, - {6, 7}, {8, 8}, {9, 9}}; - std::vector> delete_bitmaps_vers { - {2, 9}, {7, 9}, {4, 9}, {7, 9}, {7, 9}, {8, 9}, {9, 9}}; - std::vector segments_overlap {true, true, true, true, false, true, true}; - for (size_t i {0}; i < 7; i++) { - std::string rowset_id = std::to_string(rowset_start_id++); - create_committed_rowset_with_rowset_id(txn_kv.get(), accessor.get(), "1", tablet_id, - rowset_vers[i].first, rowset_vers[i].second, - rowset_id, segments_overlap[i], 1); - create_delete_bitmaps(txn.get(), tablet_id, rowset_id, delete_bitmaps_vers[i].first, - delete_bitmaps_vers[i].second); - if (delete_bitmaps_vers[i].first < 7) { - expected_abnormal_rowsets[tablet_id].insert(rowset_id); - } - } - } - - ASSERT_EQ(TxnErrorCode::TXN_OK, txn->commit()); - - ASSERT_EQ(checker.do_delete_bitmap_storage_optimize_check(), 1); - ASSERT_EQ(expected_abnormal_rowsets, real_abnormal_rowsets); -} - std::unique_ptr get_meta_service() { int ret = 0; // MemKv