From 0335524abf4cac496e204d523c7e0e73b6ef51d7 Mon Sep 17 00:00:00 2001 From: meiyi Date: Mon, 17 Mar 2025 19:16:01 +0800 Subject: [PATCH] [fix](mow) check correctness for mow get_agg --- be/src/common/config.cpp | 2 ++ be/src/common/config.h | 2 ++ be/src/olap/tablet_meta.cpp | 61 ++++++++++++++++++++++++++++++++----- be/src/olap/tablet_meta.h | 1 + 4 files changed, 59 insertions(+), 7 deletions(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index bc8b13b8a5a9ae..b349c7430b85b4 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1197,6 +1197,8 @@ DEFINE_mInt64(mow_primary_key_index_max_size_in_memory, "52428800"); DEFINE_mInt32(publish_version_gap_logging_threshold, "200"); // get agg by cache for mow table DEFINE_mBool(enable_mow_get_agg_by_cache, "true"); +// get agg correctness check for mow table +DEFINE_mBool(enable_mow_get_agg_correctness_check_core, "false"); // The secure path with user files, used in the `local` table function. DEFINE_mString(user_files_secure_path, "${DORIS_HOME}"); diff --git a/be/src/common/config.h b/be/src/common/config.h index 8105e9fc48c7d6..1c6948d4ee8716 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1264,6 +1264,8 @@ DECLARE_mInt64(mow_primary_key_index_max_size_in_memory); DECLARE_mInt32(publish_version_gap_logging_threshold); // get agg by cache for mow table DECLARE_mBool(enable_mow_get_agg_by_cache); +// get agg correctness check for mow table +DECLARE_mBool(enable_mow_get_agg_correctness_check_core); // The secure path with user files, used in the `local` table function. DECLARE_mString(user_files_secure_path); diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index bc0db84d525d2c..4b76cf63e3a5c2 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -944,12 +945,17 @@ void TabletMeta::delete_stale_rs_meta_by_version(const Version& version) { it++; } } - if (_enable_unique_key_merge_on_write) { - DCHECK(rowset_cache_version_size <= _rs_metas.size() + _stale_rs_metas.size()) - << "tablet: " << _tablet_id - << ", rowset_cache_version size: " << rowset_cache_version_size - << ", _rs_metas size: " << _rs_metas.size() - << ", _stale_rs_metas size:" << _stale_rs_metas.size(); + if (_enable_unique_key_merge_on_write && + rowset_cache_version_size > _rs_metas.size() + _stale_rs_metas.size()) { + std::string err_msg = fmt::format( + ". tablet: {}, rowset_cache_version size: {}, " + "_rs_metas size: {}, _stale_rs_metas size: {}", + _tablet_id, rowset_cache_version_size, _rs_metas.size(), _stale_rs_metas.size()); + if (config::enable_mow_get_agg_correctness_check_core) { + CHECK(false) << err_msg; + } else { + DCHECK(false) << err_msg; + } } } @@ -1347,7 +1353,20 @@ std::shared_ptr DeleteBitmap::get_agg(const BitmapKey& bmk) co if (start_version > 0) { Cache::Handle* handle2 = _agg_cache->repr()->lookup( agg_cache_key(_tablet_id, {std::get<0>(bmk), std::get<1>(bmk), start_version})); - if (handle2 == nullptr) { + + DBUG_EXECUTE_IF("DeleteBitmap::get_agg.cache_miss", { + if (handle2 != nullptr) { + auto p = dp->param("percent", 0.3); + std::mt19937 gen {std::random_device {}()}; + std::bernoulli_distribution inject_fault {p}; + if (inject_fault(gen)) { + LOG_INFO("injection DeleteBitmap::get_agg.cache_miss, tablet_id={}", + _tablet_id); + handle2 = nullptr; + } + } + }); + if (handle2 == nullptr || start_version > std::get<2>(bmk)) { start_version = 0; } else { val->bitmap |= @@ -1384,6 +1403,19 @@ std::shared_ptr DeleteBitmap::get_agg(const BitmapKey& bmk) co << ", rowset=" << std::get<0>(bmk).to_string() << ", segment=" << std::get<1>(bmk); } + if (start_version > 0 && config::enable_mow_get_agg_correctness_check_core) { + std::shared_ptr bitmap = get_agg_without_cache(bmk); + if (val->bitmap != *bitmap) { + CHECK(false) << ". get agg correctness check failed for tablet=" << _tablet_id + << ", rowset=" << std::get<0>(bmk).to_string() + << ", segment=" << std::get<1>(bmk) << ", version=" << std::get<2>(bmk) + << ". start_version from cache=" << start_version + << ", delete_bitmap cardinality with cache=" + << val->bitmap.cardinality() + << ", delete_bitmap cardinality without cache=" + << bitmap->cardinality(); + } + } } // It is natural for the cache to reclaim the underlying memory @@ -1391,6 +1423,21 @@ std::shared_ptr DeleteBitmap::get_agg(const BitmapKey& bmk) co &val->bitmap, [this, handle](...) { _agg_cache->repr()->release(handle); }); } +std::shared_ptr DeleteBitmap::get_agg_without_cache(const BitmapKey& bmk) const { + std::shared_ptr bitmap = std::make_shared(); + std::shared_lock l(lock); + DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), 0}; + for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) { + auto& [k, bm] = *it; + if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) || + std::get<2>(k) > std::get<2>(bmk)) { + break; + } + *bitmap |= bm; + } + return bitmap; +} + std::atomic DeleteBitmap::AggCache::s_repr {nullptr}; std::string tablet_state_name(TabletState state) { diff --git a/be/src/olap/tablet_meta.h b/be/src/olap/tablet_meta.h index 94890a7bf85c9d..e4d2505b506dd4 100644 --- a/be/src/olap/tablet_meta.h +++ b/be/src/olap/tablet_meta.h @@ -545,6 +545,7 @@ class DeleteBitmap { * @return shared_ptr to a bitmap, which may be empty */ std::shared_ptr get_agg(const BitmapKey& bmk) const; + std::shared_ptr get_agg_without_cache(const BitmapKey& bmk) const; void remove_sentinel_marks();