From 18a3297e26999dc769e6355e144e480e5816eea4 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Sun, 8 Oct 2023 12:54:10 +0800 Subject: [PATCH 1/7] [enhancement](merge-on-write) add detailed trace while publish --- be/src/olap/tablet.cpp | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index ecb1021beb5e25..e4541ea8003acd 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -153,7 +153,7 @@ static bvar::Adder g_tablet_pk_not_found("doris_pk", "lookup_not_found static bvar::PerSecond> g_tablet_pk_not_found_per_second( "doris_pk", "lookup_not_found_per_second", &g_tablet_pk_not_found, 60); -const std::chrono::seconds TRACE_TABLET_LOCK_THRESHOLD = 10s; +const std::chrono::seconds TRACE_TABLET_LOCK_THRESHOLD = 3s; DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(flush_bytes, MetricUnit::BYTES); DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(flush_finish_count, MetricUnit::OPERATIONS); @@ -3314,8 +3314,10 @@ Status Tablet::update_delete_bitmap(const RowsetSharedPtr& rowset, RowsetIdUnorderedSet rowset_ids_to_del; int64_t cur_version = rowset->start_version(); + OlapStopWatch watch; std::vector segments; static_cast(_load_rowset_segments(rowset, &segments)); + auto t1 = watch.get_elapse_time_us(); { std::shared_lock meta_rlock(_meta_lock); @@ -3328,6 +3330,8 @@ Status Tablet::update_delete_bitmap(const RowsetSharedPtr& rowset, } cur_rowset_ids = all_rs_id(cur_version - 1); } + auto t2 = watch.get_elapse_time_us(); + _rowset_ids_difference(cur_rowset_ids, pre_rowset_ids, &rowset_ids_to_add, &rowset_ids_to_del); for (const auto& to_del : rowset_ids_to_del) { delete_bitmap->remove({to_del, 0, 0}, {to_del, UINT32_MAX, INT64_MAX}); @@ -3338,13 +3342,23 @@ Status Tablet::update_delete_bitmap(const RowsetSharedPtr& rowset, std::shared_lock meta_rlock(_meta_lock); specified_rowsets = get_rowset_by_ids(&rowset_ids_to_add); } + auto t3 = watch.get_elapse_time_us(); - OlapStopWatch watch; auto token = StorageEngine::instance()->calc_delete_bitmap_executor()->create_token(); RETURN_IF_ERROR(calc_delete_bitmap(rowset, segments, specified_rowsets, delete_bitmap, cur_version - 1, token.get(), rowset_writer)); RETURN_IF_ERROR(token->wait()); RETURN_IF_ERROR(token->get_delete_bitmap(delete_bitmap)); + + std::stringstream ss; + if (watch.get_elapse_time_us() < 1 * 1000 * 1000) { + ss << "cost: " << watch.get_elapse_time_us() - t3 << "(us)"; + } else { + ss << "cost(us): (load segments: " << t1 << ", get all rsid: " << t2 - t1 + << ", get rowsets: " << t3 - t2 + << ", calc delete bitmap: " << watch.get_elapse_time_us() - t3 << ")"; + } + size_t total_rows = std::accumulate( segments.begin(), segments.end(), 0, [](size_t sum, const segment_v2::SegmentSharedPtr& s) { return sum += s->num_rows(); }); @@ -3352,7 +3366,7 @@ Status Tablet::update_delete_bitmap(const RowsetSharedPtr& rowset, << ", rowset_ids to add: " << rowset_ids_to_add.size() << ", rowset_ids to del: " << rowset_ids_to_del.size() << ", cur max_version: " << cur_version << ", transaction_id: " << txn_id - << ", cost: " << watch.get_elapse_time_us() << "(us), total rows: " << total_rows; + << "," << ss.str() << " , total rows: " << total_rows; if (config::enable_merge_on_write_correctness_check && rowset->num_rows() != 0) { // only do correctness check if the rowset has at least one row written From d7778b080e56808f721efb4795e1ca8bd0c098be Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Sun, 8 Oct 2023 16:54:51 +0800 Subject: [PATCH 2/7] update --- be/src/olap/tablet.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index e4541ea8003acd..e07055c065f698 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -3316,7 +3316,7 @@ Status Tablet::update_delete_bitmap(const RowsetSharedPtr& rowset, OlapStopWatch watch; std::vector segments; - static_cast(_load_rowset_segments(rowset, &segments)); + RETURN_IF_ERROR(_load_rowset_segments(rowset, &segments)); auto t1 = watch.get_elapse_time_us(); { From 916a2d1c2fc0abf717ebc3478e9c1f27456937d7 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Sun, 8 Oct 2023 19:43:52 +0800 Subject: [PATCH 3/7] reformat code --- be/src/olap/tablet.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index e07055c065f698..47967f5d90e10f 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -3365,8 +3365,8 @@ Status Tablet::update_delete_bitmap(const RowsetSharedPtr& rowset, LOG(INFO) << "[Publish] construct delete bitmap tablet: " << tablet_id() << ", rowset_ids to add: " << rowset_ids_to_add.size() << ", rowset_ids to del: " << rowset_ids_to_del.size() - << ", cur max_version: " << cur_version << ", transaction_id: " << txn_id - << "," << ss.str() << " , total rows: " << total_rows; + << ", cur max_version: " << cur_version << ", transaction_id: " << txn_id << "," + << ss.str() << " , total rows: " << total_rows; if (config::enable_merge_on_write_correctness_check && rowset->num_rows() != 0) { // only do correctness check if the rowset has at least one row written From 30b1f53b092a92370d48f124c813a19d3459533a Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Mon, 9 Oct 2023 18:17:31 +0800 Subject: [PATCH 4/7] refine lock --- be/src/olap/tablet.cpp | 252 +++++++++++++++++++----------------- be/src/olap/tablet_meta.cpp | 10 ++ 2 files changed, 140 insertions(+), 122 deletions(-) diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index 47967f5d90e10f..f5168835ab32fe 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -678,153 +678,161 @@ void Tablet::_delete_stale_rowset_by_version(const Version& version) { void Tablet::delete_expired_stale_rowset() { int64_t now = UnixSeconds(); - std::lock_guard wrlock(_meta_lock); - SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD); - // Compute the end time to delete rowsets, when a expired rowset createtime less then this time, it will be deleted. - double expired_stale_sweep_endtime = - ::difftime(now, config::tablet_rowset_stale_sweep_time_sec); - if (config::tablet_rowset_stale_sweep_by_size) { - expired_stale_sweep_endtime = now; - } - - std::vector path_id_vec; - // capture the path version to delete - _timestamped_version_tracker.capture_expired_paths( - static_cast(expired_stale_sweep_endtime), &path_id_vec); - - if (path_id_vec.empty()) { - return; - } - - const RowsetSharedPtr lastest_delta = rowset_with_max_version(); - if (lastest_delta == nullptr) { - LOG(WARNING) << "lastest_delta is null " << tablet_id(); - return; - } - - // fetch missing version before delete - std::vector missed_versions; - calc_missed_versions_unlocked(lastest_delta->end_version(), &missed_versions); - - if (!missed_versions.empty()) { - LOG(WARNING) << "tablet:" << full_name() - << ", missed version for version:" << lastest_delta->end_version(); - _print_missed_versions(missed_versions); - return; - } + // hold write lock while processing stable rowset + { + std::lock_guard wrlock(_meta_lock); + SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD); + // Compute the end time to delete rowsets, when a expired rowset createtime less then this time, it will be deleted. + double expired_stale_sweep_endtime = + ::difftime(now, config::tablet_rowset_stale_sweep_time_sec); + if (config::tablet_rowset_stale_sweep_by_size) { + expired_stale_sweep_endtime = now; + } - // do check consistent operation - auto path_id_iter = path_id_vec.begin(); + std::vector path_id_vec; + // capture the path version to delete + _timestamped_version_tracker.capture_expired_paths( + static_cast(expired_stale_sweep_endtime), &path_id_vec); - std::map stale_version_path_map; - while (path_id_iter != path_id_vec.end()) { - PathVersionListSharedPtr version_path = - _timestamped_version_tracker.fetch_and_delete_path_by_id(*path_id_iter); + if (path_id_vec.empty()) { + return; + } - Version test_version = Version(0, lastest_delta->end_version()); - stale_version_path_map[*path_id_iter] = version_path; + const RowsetSharedPtr lastest_delta = rowset_with_max_version(); + if (lastest_delta == nullptr) { + LOG(WARNING) << "lastest_delta is null " << tablet_id(); + return; + } - Status status = capture_consistent_versions(test_version, nullptr); - // 1. When there is no consistent versions, we must reconstruct the tracker. - if (!status.ok()) { - // 2. fetch missing version after delete - std::vector after_missed_versions; - calc_missed_versions_unlocked(lastest_delta->end_version(), &after_missed_versions); + // fetch missing version before delete + std::vector missed_versions; + calc_missed_versions_unlocked(lastest_delta->end_version(), &missed_versions); - // 2.1 check whether missed_versions and after_missed_versions are the same. - // when they are the same, it means we can delete the path securely. - bool is_missing = missed_versions.size() != after_missed_versions.size(); + if (!missed_versions.empty()) { + LOG(WARNING) << "tablet:" << full_name() + << ", missed version for version:" << lastest_delta->end_version(); + _print_missed_versions(missed_versions); + return; + } - if (!is_missing) { - for (int ver_index = 0; ver_index < missed_versions.size(); ver_index++) { - if (missed_versions[ver_index] != after_missed_versions[ver_index]) { - is_missing = true; - break; - } - } - } + // do check consistent operation + auto path_id_iter = path_id_vec.begin(); - if (is_missing) { - LOG(WARNING) << "The consistent version check fails, there are bugs. " - << "Reconstruct the tracker to recover versions in tablet=" - << tablet_id(); + std::map stale_version_path_map; + while (path_id_iter != path_id_vec.end()) { + PathVersionListSharedPtr version_path = + _timestamped_version_tracker.fetch_and_delete_path_by_id(*path_id_iter); - // 3. try to recover - _timestamped_version_tracker.recover_versioned_tracker(stale_version_path_map); + Version test_version = Version(0, lastest_delta->end_version()); + stale_version_path_map[*path_id_iter] = version_path; - // 4. double check the consistent versions - // fetch missing version after recover - std::vector recover_missed_versions; - calc_missed_versions_unlocked(lastest_delta->end_version(), - &recover_missed_versions); + Status status = capture_consistent_versions(test_version, nullptr); + // 1. When there is no consistent versions, we must reconstruct the tracker. + if (!status.ok()) { + // 2. fetch missing version after delete + std::vector after_missed_versions; + calc_missed_versions_unlocked(lastest_delta->end_version(), &after_missed_versions); - // 4.1 check whether missed_versions and recover_missed_versions are the same. - // when they are the same, it means we recover successfully. - bool is_recover_missing = missed_versions.size() != recover_missed_versions.size(); + // 2.1 check whether missed_versions and after_missed_versions are the same. + // when they are the same, it means we can delete the path securely. + bool is_missing = missed_versions.size() != after_missed_versions.size(); - if (!is_recover_missing) { + if (!is_missing) { for (int ver_index = 0; ver_index < missed_versions.size(); ver_index++) { - if (missed_versions[ver_index] != recover_missed_versions[ver_index]) { - is_recover_missing = true; + if (missed_versions[ver_index] != after_missed_versions[ver_index]) { + is_missing = true; break; } } } - // 5. check recover fail, version is mission - if (is_recover_missing) { - if (!config::ignore_rowset_stale_unconsistent_delete) { - LOG(FATAL) << "rowset stale unconsistent delete. tablet= " << tablet_id(); - } else { - LOG(WARNING) << "rowset stale unconsistent delete. tablet= " << tablet_id(); + if (is_missing) { + LOG(WARNING) << "The consistent version check fails, there are bugs. " + << "Reconstruct the tracker to recover versions in tablet=" + << tablet_id(); + + // 3. try to recover + _timestamped_version_tracker.recover_versioned_tracker(stale_version_path_map); + + // 4. double check the consistent versions + // fetch missing version after recover + std::vector recover_missed_versions; + calc_missed_versions_unlocked(lastest_delta->end_version(), + &recover_missed_versions); + + // 4.1 check whether missed_versions and recover_missed_versions are the same. + // when they are the same, it means we recover successfully. + bool is_recover_missing = + missed_versions.size() != recover_missed_versions.size(); + + if (!is_recover_missing) { + for (int ver_index = 0; ver_index < missed_versions.size(); ver_index++) { + if (missed_versions[ver_index] != recover_missed_versions[ver_index]) { + is_recover_missing = true; + break; + } + } + } + + // 5. check recover fail, version is mission + if (is_recover_missing) { + if (!config::ignore_rowset_stale_unconsistent_delete) { + LOG(FATAL) + << "rowset stale unconsistent delete. tablet= " << tablet_id(); + } else { + LOG(WARNING) + << "rowset stale unconsistent delete. tablet= " << tablet_id(); + } } } + return; } - return; - } - path_id_iter++; - } - - auto old_size = _stale_rs_version_map.size(); - auto old_meta_size = _tablet_meta->all_stale_rs_metas().size(); - - // do delete operation - auto to_delete_iter = stale_version_path_map.begin(); - while (to_delete_iter != stale_version_path_map.end()) { - std::vector& to_delete_version = - to_delete_iter->second->timestamped_versions(); - for (auto& timestampedVersion : to_delete_version) { - auto it = _stale_rs_version_map.find(timestampedVersion->version()); - if (it != _stale_rs_version_map.end()) { - // delete rowset - StorageEngine::instance()->add_unused_rowset(it->second); - _stale_rs_version_map.erase(it); - VLOG_NOTICE << "delete stale rowset tablet=" << full_name() << " version[" - << timestampedVersion->version().first << "," - << timestampedVersion->version().second - << "] move to unused_rowset success " << std::fixed - << expired_stale_sweep_endtime; - } else { - LOG(WARNING) << "delete stale rowset tablet=" << full_name() << " version[" - << timestampedVersion->version().first << "," - << timestampedVersion->version().second - << "] not find in stale rs version map"; + path_id_iter++; + } + + auto old_size = _stale_rs_version_map.size(); + auto old_meta_size = _tablet_meta->all_stale_rs_metas().size(); + + // do delete operation + auto to_delete_iter = stale_version_path_map.begin(); + while (to_delete_iter != stale_version_path_map.end()) { + std::vector& to_delete_version = + to_delete_iter->second->timestamped_versions(); + for (auto& timestampedVersion : to_delete_version) { + auto it = _stale_rs_version_map.find(timestampedVersion->version()); + if (it != _stale_rs_version_map.end()) { + // delete rowset + StorageEngine::instance()->add_unused_rowset(it->second); + _stale_rs_version_map.erase(it); + VLOG_NOTICE << "delete stale rowset tablet=" << full_name() << " version[" + << timestampedVersion->version().first << "," + << timestampedVersion->version().second + << "] move to unused_rowset success " << std::fixed + << expired_stale_sweep_endtime; + } else { + LOG(WARNING) << "delete stale rowset tablet=" << full_name() << " version[" + << timestampedVersion->version().first << "," + << timestampedVersion->version().second + << "] not find in stale rs version map"; + } + _delete_stale_rowset_by_version(timestampedVersion->version()); } - _delete_stale_rowset_by_version(timestampedVersion->version()); + to_delete_iter++; } - to_delete_iter++; - } - - bool reconstructed = _reconstruct_version_tracker_if_necessary(); - VLOG_NOTICE << "delete stale rowset _stale_rs_version_map tablet=" << full_name() - << " current_size=" << _stale_rs_version_map.size() << " old_size=" << old_size - << " current_meta_size=" << _tablet_meta->all_stale_rs_metas().size() - << " old_meta_size=" << old_meta_size << " sweep endtime " << std::fixed - << expired_stale_sweep_endtime << ", reconstructed=" << reconstructed; + bool reconstructed = _reconstruct_version_tracker_if_necessary(); + VLOG_NOTICE << "delete stale rowset _stale_rs_version_map tablet=" << full_name() + << " current_size=" << _stale_rs_version_map.size() << " old_size=" << old_size + << " current_meta_size=" << _tablet_meta->all_stale_rs_metas().size() + << " old_meta_size=" << old_meta_size << " sweep endtime " << std::fixed + << expired_stale_sweep_endtime << ", reconstructed=" << reconstructed; + } #ifndef BE_TEST - save_meta(); + { + std::shared_lock rlock(_meta_lock); + save_meta(); + } #endif } diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index 74d7f1e681baeb..fb9b4510129020 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -38,6 +38,7 @@ #include "olap/tablet_meta_manager.h" #include "olap/utils.h" #include "util/string_util.h" +#include "util/time.h" #include "util/uid_util.h" using std::string; @@ -437,12 +438,21 @@ Status TabletMeta::_save_meta(DataDir* data_dir) { << " tablet=" << full_name() << " _tablet_uid=" << _tablet_uid.to_string(); } string meta_binary; + + auto t1 = MonotonicMicros(); RETURN_IF_ERROR(serialize(&meta_binary)); + auto t2 = MonotonicMicros(); Status status = TabletMetaManager::save(data_dir, tablet_id(), schema_hash(), meta_binary); if (!status.ok()) { LOG(FATAL) << "fail to save tablet_meta. status=" << status << ", tablet_id=" << tablet_id() << ", schema_hash=" << schema_hash(); } + auto t3 = MonotonicMicros(); + auto cost = t3 - t1; + if (cost > 1 * 1000 * 1000) { + LOG(INFO) << "Save tablet(" << full_name() << ") meta too slow. serialize cost " << t2 - t1 + << "(us), write rocksdb cost " << t3 - t2 << "(us)"; + } return status; } From 74e29d3d06c9ca4182d78332d0279c3f0755860e Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 10 Oct 2023 11:00:33 +0800 Subject: [PATCH 5/7] update log --- be/src/olap/tablet_meta.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index fb9b4510129020..57f0416c0c65a5 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -451,7 +451,8 @@ Status TabletMeta::_save_meta(DataDir* data_dir) { auto cost = t3 - t1; if (cost > 1 * 1000 * 1000) { LOG(INFO) << "Save tablet(" << full_name() << ") meta too slow. serialize cost " << t2 - t1 - << "(us), write rocksdb cost " << t3 - t2 << "(us)"; + << "(us), serialized binary size: " << meta_binary.length() + << "(bytes), write rocksdb cost " << t3 - t2 << "(us)"; } return status; } From 3e51d154dc763369fb1be63f42870369c9018caf Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 10 Oct 2023 20:18:10 +0800 Subject: [PATCH 6/7] update --- be/src/olap/tablet.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index f5168835ab32fe..90ee40635ca7e0 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -153,7 +153,7 @@ static bvar::Adder g_tablet_pk_not_found("doris_pk", "lookup_not_found static bvar::PerSecond> g_tablet_pk_not_found_per_second( "doris_pk", "lookup_not_found_per_second", &g_tablet_pk_not_found, 60); -const std::chrono::seconds TRACE_TABLET_LOCK_THRESHOLD = 3s; +const std::chrono::seconds TRACE_TABLET_LOCK_THRESHOLD = 1s; DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(flush_bytes, MetricUnit::BYTES); DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(flush_finish_count, MetricUnit::OPERATIONS); From 771def7da86cf64964078fb0912ac88a143194a8 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 10 Oct 2023 22:06:17 +0800 Subject: [PATCH 7/7] update --- be/src/olap/tablet_meta.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index 57f0416c0c65a5..c118737210d23c 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -450,7 +450,7 @@ Status TabletMeta::_save_meta(DataDir* data_dir) { auto t3 = MonotonicMicros(); auto cost = t3 - t1; if (cost > 1 * 1000 * 1000) { - LOG(INFO) << "Save tablet(" << full_name() << ") meta too slow. serialize cost " << t2 - t1 + LOG(INFO) << "save tablet(" << full_name() << ") meta too slow. serialize cost " << t2 - t1 << "(us), serialized binary size: " << meta_binary.length() << "(bytes), write rocksdb cost " << t3 - t2 << "(us)"; }