Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions cloud/src/common/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,6 @@ CONF_Int32(recycle_pool_parallelism, "40");
CONF_Bool(enable_inverted_check, "false");
// Currently only used for recycler test
CONF_Bool(enable_delete_bitmap_inverted_check, "false");
// checks if https://github.com/apache/doris/pull/40204 works as expected
CONF_Bool(enable_delete_bitmap_storage_optimize_check, "false");
CONF_mInt64(delete_bitmap_storage_optimize_check_version_gap, "1000");
CONF_Bool(enable_delete_bitmap_storage_optimize_v2_check, "false");
CONF_mInt64(delete_bitmap_storage_optimize_v2_check_skip_seconds, "300"); // 5min
// interval for scanning instances to do checks and inspections
Expand Down
153 changes: 16 additions & 137 deletions cloud/src/recycler/checker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,12 +189,6 @@ int Checker::start() {
}
}

if (config::enable_delete_bitmap_storage_optimize_check) {
if (int ret = checker->do_delete_bitmap_storage_optimize_check(); ret != 0) {
success = false;
}
}

if (config::enable_mow_job_key_check) {
if (int ret = checker->do_mow_job_key_check(); ret != 0) {
success = false;
Expand Down Expand Up @@ -1221,113 +1215,6 @@ int InstanceChecker::do_delete_bitmap_inverted_check() {
return (leaked_delete_bitmaps > 0 || abnormal_delete_bitmaps > 0) ? 1 : 0;
}

int InstanceChecker::check_delete_bitmap_storage_optimize(int64_t tablet_id) {
using Version = std::pair<int64_t, int64_t>;
struct RowsetDigest {
std::string rowset_id;
Version version;
doris::SegmentsOverlapPB segments_overlap;

bool operator<(const RowsetDigest& other) const {
return version.first < other.version.first;
}

bool produced_by_compaction() const {
return (version.first < version.second) ||
((version.first == version.second) && segments_overlap == NONOVERLAPPING);
}
};

// number of rowsets which may have problems
int64_t abnormal_rowsets_num {0};

std::vector<RowsetDigest> tablet_rowsets {};
// Get all visible rowsets of this tablet
auto collect_cb = [&tablet_rowsets](const doris::RowsetMetaCloudPB& rowset) {
if (rowset.start_version() == 0 && rowset.end_version() == 1) {
// ignore dummy rowset [0-1]
return;
}
tablet_rowsets.emplace_back(
rowset.rowset_id_v2(),
std::make_pair<int64_t, int64_t>(rowset.start_version(), rowset.end_version()),
rowset.segments_overlap_pb());
};
if (int ret = collect_tablet_rowsets(tablet_id, collect_cb); ret != 0) {
return ret;
}

std::sort(tablet_rowsets.begin(), tablet_rowsets.end());

// find right-most rowset which is produced by compaction
auto it = std::find_if(
tablet_rowsets.crbegin(), tablet_rowsets.crend(),
[](const RowsetDigest& rowset) { return rowset.produced_by_compaction(); });
if (it == tablet_rowsets.crend()) {
LOG(INFO) << fmt::format(
"[delete bitmap checker] skip to check delete bitmap storage optimize for "
"tablet_id={} because it doesn't have compacted rowsets.",
tablet_id);
return 0;
}

int64_t start_version = it->version.first;
int64_t pre_min_version = it->version.second;

// after BE sweeping stale rowsets, all rowsets in this tablet before
// should not have delete bitmaps with versions lower than `pre_min_version`
if (config::delete_bitmap_storage_optimize_check_version_gap > 0) {
pre_min_version -= config::delete_bitmap_storage_optimize_check_version_gap;
if (pre_min_version <= 1) {
LOG(INFO) << fmt::format(
"[delete bitmap checker] skip to check delete bitmap storage optimize for "
"tablet_id={} because pre_min_version is too small.",
tablet_id);
return 0;
}
}

auto check_func = [pre_min_version, instance_id = instance_id_](
int64_t tablet_id, std::string_view rowset_id, int64_t version,
int64_t segment_id) -> int {
if (version < pre_min_version) {
LOG(WARNING) << fmt::format(
"[delete bitmap check fails] delete bitmap storage optimize check fail for "
"instance_id={}, tablet_id={}, rowset_id={}, found delete bitmap with "
"version={} < pre_min_version={}",
instance_id, tablet_id, rowset_id, version, pre_min_version);
return 1;
}
return 0;
};

for (const auto& rowset : tablet_rowsets) {
// check for all rowsets before the max compacted rowset
if (rowset.version.second < start_version) {
auto rowset_id = rowset.rowset_id;
int ret = traverse_rowset_delete_bitmaps(tablet_id, rowset_id, check_func);
if (ret < 0) {
return ret;
}

if (ret != 0) {
++abnormal_rowsets_num;
TEST_SYNC_POINT_CALLBACK(
"InstanceChecker::check_delete_bitmap_storage_optimize.get_abnormal_rowset",
&tablet_id, &rowset_id);
}
}
}

LOG(INFO) << fmt::format(
"[delete bitmap checker] finish check delete bitmap storage optimize for "
"instance_id={}, tablet_id={}, rowsets_num={}, abnormal_rowsets_num={}, "
"pre_min_version={}",
instance_id_, tablet_id, tablet_rowsets.size(), abnormal_rowsets_num, pre_min_version);

return (abnormal_rowsets_num > 1 ? 1 : 0);
}

int InstanceChecker::get_pending_delete_bitmap_keys(
int64_t tablet_id, std::unordered_set<std::string>& pending_delete_bitmaps) {
std::unique_ptr<Transaction> txn;
Expand Down Expand Up @@ -1520,6 +1407,9 @@ int InstanceChecker::check_delete_bitmap_storage_optimize_v2(
}

int InstanceChecker::do_delete_bitmap_storage_optimize_check(int version) {
if (version != 2) {
return -1;
}
int64_t total_tablets_num {0};
int64_t failed_tablets_num {0};

Expand All @@ -1531,20 +1421,13 @@ int InstanceChecker::do_delete_bitmap_storage_optimize_check(int version) {
int ret = traverse_mow_tablet([&](int64_t tablet_id) {
++total_tablets_num;
int64_t rowsets_with_useless_delete_bitmap_version = 0;
int res = 0;
if (version == 1) {
res = check_delete_bitmap_storage_optimize(tablet_id);
} else if (version == 2) {
res = check_delete_bitmap_storage_optimize_v2(
tablet_id, rowsets_with_useless_delete_bitmap_version);
if (rowsets_with_useless_delete_bitmap_version >
max_rowsets_with_useless_delete_bitmap_version) {
max_rowsets_with_useless_delete_bitmap_version =
rowsets_with_useless_delete_bitmap_version;
tablet_id_with_max_rowsets_with_useless_delete_bitmap_version = tablet_id;
}
} else {
return -1;
int res = check_delete_bitmap_storage_optimize_v2(
tablet_id, rowsets_with_useless_delete_bitmap_version);
if (rowsets_with_useless_delete_bitmap_version >
max_rowsets_with_useless_delete_bitmap_version) {
max_rowsets_with_useless_delete_bitmap_version =
rowsets_with_useless_delete_bitmap_version;
tablet_id_with_max_rowsets_with_useless_delete_bitmap_version = tablet_id;
}
failed_tablets_num += (res != 0);
return res;
Expand All @@ -1554,20 +1437,16 @@ int InstanceChecker::do_delete_bitmap_storage_optimize_check(int version) {
return ret;
}

if (version == 2) {
g_bvar_max_rowsets_with_useless_delete_bitmap_version.put(
instance_id_, max_rowsets_with_useless_delete_bitmap_version);
}
g_bvar_max_rowsets_with_useless_delete_bitmap_version.put(
instance_id_, max_rowsets_with_useless_delete_bitmap_version);

std::stringstream ss;
ss << "[delete bitmap checker] check delete bitmap storage optimize v" << version
<< " for instance_id=" << instance_id_ << ", total_tablets_num=" << total_tablets_num
<< ", failed_tablets_num=" << failed_tablets_num;
if (version == 2) {
ss << ". max_rowsets_with_useless_delete_bitmap_version="
<< max_rowsets_with_useless_delete_bitmap_version
<< ", tablet_id=" << tablet_id_with_max_rowsets_with_useless_delete_bitmap_version;
}
<< ", failed_tablets_num=" << failed_tablets_num
<< ". max_rowsets_with_useless_delete_bitmap_version="
<< max_rowsets_with_useless_delete_bitmap_version
<< ", tablet_id=" << tablet_id_with_max_rowsets_with_useless_delete_bitmap_version;
LOG(INFO) << ss.str();

return (failed_tablets_num > 0) ? 1 : 0;
Expand Down
4 changes: 1 addition & 3 deletions cloud/src/recycler/checker.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ class InstanceChecker {
// NOTE: stale rowsets will be lost after BE restarts, so there may be some stale delete bitmaps
// which will not be cleared.
// version = 2 : https://github.com/apache/doris/pull/49822
int do_delete_bitmap_storage_optimize_check(int version = 1);
int do_delete_bitmap_storage_optimize_check(int version = 2);

int do_mow_job_key_check();

Expand All @@ -130,8 +130,6 @@ class InstanceChecker {
const std::function<void(const doris::RowsetMetaCloudPB&)>& collect_cb);
int get_pending_delete_bitmap_keys(int64_t tablet_id,
std::unordered_set<std::string>& pending_delete_bitmaps);

int check_delete_bitmap_storage_optimize(int64_t tablet_id);
int check_delete_bitmap_storage_optimize_v2(int64_t tablet_id, int64_t& abnormal_rowsets_num);

std::atomic_bool stopped_ {false};
Expand Down
66 changes: 0 additions & 66 deletions cloud/test/recycler_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3130,8 +3130,6 @@ TEST(CheckerTest, delete_bitmap_inverted_check_abnormal) {
}

TEST(CheckerTest, delete_bitmap_storage_optimize_check_normal) {
config::delete_bitmap_storage_optimize_check_version_gap = 0;

auto txn_kv = std::make_shared<MemTxnKv>();
ASSERT_EQ(txn_kv->init(), 0);

Expand Down Expand Up @@ -3237,73 +3235,9 @@ TEST(CheckerTest, delete_bitmap_storage_optimize_check_normal) {
}

ASSERT_EQ(TxnErrorCode::TXN_OK, txn->commit());
ASSERT_EQ(checker.do_delete_bitmap_storage_optimize_check(), 0);
ASSERT_EQ(checker.do_delete_bitmap_storage_optimize_check(2), 0);
}

TEST(CheckerTest, delete_bitmap_storage_optimize_check_abnormal) {
config::delete_bitmap_storage_optimize_check_version_gap = 0;
// abnormal case, some rowsets' delete bitmaps are not deleted as expected
auto txn_kv = std::make_shared<MemTxnKv>();
ASSERT_EQ(txn_kv->init(), 0);

InstanceInfoPB instance;
instance.set_instance_id(instance_id);
auto obj_info = instance.add_obj_info();
obj_info->set_id("1");

InstanceChecker checker(txn_kv, instance_id);
ASSERT_EQ(checker.init(instance), 0);
auto accessor = checker.accessor_map_.begin()->second;

// tablet_id -> [rowset_id]
std::map<std::int64_t, std::set<std::string>> expected_abnormal_rowsets {};
std::map<std::int64_t, std::set<std::string>> real_abnormal_rowsets {};
auto sp = SyncPoint::get_instance();
DORIS_CLOUD_DEFER {
SyncPoint::get_instance()->clear_all_call_backs();
};
sp->set_call_back("InstanceChecker::check_delete_bitmap_storage_optimize.get_abnormal_rowset",
[&real_abnormal_rowsets](auto&& args) {
int64_t tablet_id = *try_any_cast<int64_t*>(args[0]);
std::string rowset_id = *try_any_cast<std::string*>(args[1]);
real_abnormal_rowsets[tablet_id].insert(rowset_id);
});
sp->enable_processing();

std::unique_ptr<Transaction> txn;
ASSERT_EQ(TxnErrorCode::TXN_OK, txn_kv->create_txn(&txn));

constexpr int table_id = 10000, index_id = 10001, partition_id = 10002;

int64_t rowset_start_id = 700;
for (int tablet_id = 900001; tablet_id <= 900005; ++tablet_id) {
ASSERT_EQ(0,
create_tablet(txn_kv.get(), table_id, index_id, partition_id, tablet_id, true));
std::vector<std::pair<int64_t, int64_t>> rowset_vers {{2, 2}, {3, 3}, {4, 4}, {5, 5},
{6, 7}, {8, 8}, {9, 9}};
std::vector<std::pair<int64_t, int64_t>> delete_bitmaps_vers {
{2, 9}, {7, 9}, {4, 9}, {7, 9}, {7, 9}, {8, 9}, {9, 9}};
std::vector<bool> segments_overlap {true, true, true, true, false, true, true};
for (size_t i {0}; i < 7; i++) {
std::string rowset_id = std::to_string(rowset_start_id++);
create_committed_rowset_with_rowset_id(txn_kv.get(), accessor.get(), "1", tablet_id,
rowset_vers[i].first, rowset_vers[i].second,
rowset_id, segments_overlap[i], 1);
create_delete_bitmaps(txn.get(), tablet_id, rowset_id, delete_bitmaps_vers[i].first,
delete_bitmaps_vers[i].second);
if (delete_bitmaps_vers[i].first < 7) {
expected_abnormal_rowsets[tablet_id].insert(rowset_id);
}
}
}

ASSERT_EQ(TxnErrorCode::TXN_OK, txn->commit());

ASSERT_EQ(checker.do_delete_bitmap_storage_optimize_check(), 1);
ASSERT_EQ(expected_abnormal_rowsets, real_abnormal_rowsets);
}

std::unique_ptr<MetaServiceProxy> get_meta_service() {
int ret = 0;
// MemKv
Expand Down
Loading